Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 39ea071f2005ec99808fe1e09994b23aa17f70eb [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
				86	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	88
				89	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	90	static PyUnicodeObject *unicode_freelist;
				91	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	93	/* Default encoding to use and assume when NULL is passed as encoding
				94	parameter; it is initialized by _PyUnicode_Init().
				95
				96	Always use the PyUnicode_SetDefaultEncoding() and
				97	PyUnicode_GetDefaultEncoding() APIs to access this global.
				98
				99	*/
				100
				101	static char unicode_default_encoding[100];
				102
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103	/* --- Unicode Object ----------------------------------------------------- */
				104
				105	static
				106	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				107	int length)
				108	{
				109	void *oldstr;
				110
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	111	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	112	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	113	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Resizing unicode_empty is not allowed. */
				116	if (unicode == unicode_empty) {
				117	PyErr_SetString(PyExc_SystemError,
				118	"can't resize empty unicode object");
				119	return -1;
				120	}
				121
				122	/* We allocate one more byte to make sure the string is
				123	Ux0000 terminated -- XXX is this needed ? */
				124	oldstr = unicode->str;
				125	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				126	if (!unicode->str) {
				127	unicode->str = oldstr;
				128	PyErr_NoMemory();
				129	return -1;
				130	}
				131	unicode->str[length] = 0;
				132	unicode->length = length;
				133
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	134	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	136	if (unicode->defenc) {
				137	Py_DECREF(unicode->defenc);
				138	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	}
				140	unicode->hash = -1;
				141
				142	return 0;
				143	}
				144
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	int PyUnicode_Resize(PyObject **unicode,
				146	int length)
				147	{
				148	PyUnicodeObject *v;
				149
				150	if (unicode == NULL) {
				151	PyErr_BadInternalCall();
				152	return -1;
				153	}
				154	v = (PyUnicodeObject )unicode;
				155	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				156	PyErr_BadInternalCall();
				157	return -1;
				158	}
				159	return _PyUnicode_Resize(v, length);
				160	}
				161
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	162	/* We allocate one more byte to make sure the string is
				163	Ux0000 terminated -- XXX is this needed ?
				164
				165	XXX This allocator could further be enhanced by assuring that the
				166	free list never reduces its size below 1.
				167
				168	*/
				169
				170	static
				171	PyUnicodeObject *_PyUnicode_New(int length)
				172	{
				173	register PyUnicodeObject *unicode;
				174
				175	/* Optimization for empty strings */
				176	if (length == 0 && unicode_empty != NULL) {
				177	Py_INCREF(unicode_empty);
				178	return unicode_empty;
				179	}
				180
				181	/* Unicode freelist & memory allocation */
				182	if (unicode_freelist) {
				183	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	184	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	185	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	186	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	187	/* Keep-Alive optimization: we only upsize the buffer,
				188	never downsize it. */
				189	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	190	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	191	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	192	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	193	}
				194	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	195	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	197	}
				198	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	}
				200	else {
				201	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				202	if (unicode == NULL)
				203	return NULL;
				204	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				205	}
				206
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	207	if (!unicode->str) {
				208	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	209	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode->str[length] = 0;
				212	unicode->length = length;
				213	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	214	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	215	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	216
				217	onError:
				218	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	219	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	220	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	}
				222
				223	static
				224	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				225	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	227	/* Keep-Alive optimization */
				228	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	229	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	230	unicode->str = NULL;
				231	unicode->length = 0;
				232	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	233	if (unicode->defenc) {
				234	Py_DECREF(unicode->defenc);
				235	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	236	}
				237	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	(PyUnicodeObject *)unicode = unicode_freelist;
				239	unicode_freelist = unicode;
				240	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	}
				242	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	243	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	244	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	}
				247	}
				248
				249	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				250	int size)
				251	{
				252	PyUnicodeObject *unicode;
				253
				254	unicode = _PyUnicode_New(size);
				255	if (!unicode)
				256	return NULL;
				257
				258	/* Copy the Unicode data into the new object */
				259	if (u != NULL)
				260	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				261
				262	return (PyObject *)unicode;
				263	}
				264
				265	#ifdef HAVE_WCHAR_H
				266
				267	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				268	int size)
				269	{
				270	PyUnicodeObject *unicode;
				271
				272	if (w == NULL) {
				273	PyErr_BadInternalCall();
				274	return NULL;
				275	}
				276
				277	unicode = _PyUnicode_New(size);
				278	if (!unicode)
				279	return NULL;
				280
				281	/* Copy the wchar_t data into the new object */
				282	#ifdef HAVE_USABLE_WCHAR_T
				283	memcpy(unicode->str, w, size * sizeof(wchar_t));
				284	#else
				285	{
				286	register Py_UNICODE *u;
				287	register int i;
				288	u = PyUnicode_AS_UNICODE(unicode);
				289	for (i = size; i >= 0; i--)
				290	u++ = w++;
				291	}
				292	#endif
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				298	register wchar_t *w,
				299	int size)
				300	{
				301	if (unicode == NULL) {
				302	PyErr_BadInternalCall();
				303	return -1;
				304	}
				305	if (size > PyUnicode_GET_SIZE(unicode))
				306	size = PyUnicode_GET_SIZE(unicode);
				307	#ifdef HAVE_USABLE_WCHAR_T
				308	memcpy(w, unicode->str, size * sizeof(wchar_t));
				309	#else
				310	{
				311	register Py_UNICODE *u;
				312	register int i;
				313	u = PyUnicode_AS_UNICODE(unicode);
				314	for (i = size; i >= 0; i--)
				315	w++ = u++;
				316	}
				317	#endif
				318
				319	return size;
				320	}
				321
				322	#endif
				323
				324	PyObject PyUnicode_FromObject(register PyObject obj)
				325	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	326	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				327	}
				328
				329	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				330	const char *encoding,
				331	const char *errors)
				332	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333	const char *s;
				334	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	335	int owned = 0;
				336	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	337
				338	if (obj == NULL) {
				339	PyErr_BadInternalCall();
				340	return NULL;
				341	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	342
				343	/* Coerce object */
				344	if (PyInstance_Check(obj)) {
				345	PyObject *func;
				346	func = PyObject_GetAttrString(obj, "__str__");
				347	if (func == NULL) {
				348	PyErr_SetString(PyExc_TypeError,
				349	"coercing to Unicode: instance doesn't define __str__");
				350	return NULL;
				351	}
				352	obj = PyEval_CallObject(func, NULL);
				353	Py_DECREF(func);
				354	if (obj == NULL)
				355	return NULL;
				356	owned = 1;
				357	}
				358	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	360	v = obj;
				361	if (encoding) {
				362	PyErr_SetString(PyExc_TypeError,
				363	"decoding Unicode is not supported");
				364	return NULL;
				365	}
				366	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	376	PyErr_Format(PyExc_TypeError,
				377	"coercing to Unicode: need string or buffer, "
				378	"%.80s found",
				379	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	380	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	381	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	382
				383	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	384	if (len == 0) {
				385	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	387	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	388	else
				389	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	390
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	391	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	392	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	394	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	395	return v;
				396
				397	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	398	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	399	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	400	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	401	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	}
				403
				404	PyObject PyUnicode_Decode(const char s,
				405	int size,
				406	const char *encoding,
				407	const char *errors)
				408	{
				409	PyObject buffer = NULL, unicode;
				410
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	411	if (encoding == NULL)
				412	encoding = PyUnicode_GetDefaultEncoding();
				413
				414	/* Shortcuts for common default encodings */
				415	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	417	else if (strcmp(encoding, "latin-1") == 0)
				418	return PyUnicode_DecodeLatin1(s, size, errors);
				419	else if (strcmp(encoding, "ascii") == 0)
				420	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	421
				422	/* Decode via the codec registry */
				423	buffer = PyBuffer_FromMemory((void *)s, size);
				424	if (buffer == NULL)
				425	goto onError;
				426	unicode = PyCodec_Decode(buffer, encoding, errors);
				427	if (unicode == NULL)
				428	goto onError;
				429	if (!PyUnicode_Check(unicode)) {
				430	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	431	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	432	unicode->ob_type->tp_name);
				433	Py_DECREF(unicode);
				434	goto onError;
				435	}
				436	Py_DECREF(buffer);
				437	return unicode;
				438
				439	onError:
				440	Py_XDECREF(buffer);
				441	return NULL;
				442	}
				443
				444	PyObject PyUnicode_Encode(const Py_UNICODE s,
				445	int size,
				446	const char *encoding,
				447	const char *errors)
				448	{
				449	PyObject v, unicode;
				450
				451	unicode = PyUnicode_FromUnicode(s, size);
				452	if (unicode == NULL)
				453	return NULL;
				454	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				455	Py_DECREF(unicode);
				456	return v;
				457	}
				458
				459	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				460	const char *encoding,
				461	const char *errors)
				462	{
				463	PyObject *v;
				464
				465	if (!PyUnicode_Check(unicode)) {
				466	PyErr_BadArgument();
				467	goto onError;
				468	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	469
				470	if (encoding == NULL)
				471	encoding = PyUnicode_GetDefaultEncoding();
				472
				473	/* Shortcuts for common default encodings */
				474	if (errors == NULL) {
				475	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	477	else if (strcmp(encoding, "latin-1") == 0)
				478	return PyUnicode_AsLatin1String(unicode);
				479	else if (strcmp(encoding, "ascii") == 0)
				480	return PyUnicode_AsASCIIString(unicode);
				481	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	482
				483	/* Encode via the codec registry */
				484	v = PyCodec_Encode(unicode, encoding, errors);
				485	if (v == NULL)
				486	goto onError;
				487	/* XXX Should we really enforce this ? */
				488	if (!PyString_Check(v)) {
				489	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	490	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	v->ob_type->tp_name);
				492	Py_DECREF(v);
				493	goto onError;
				494	}
				495	return v;
				496
				497	onError:
				498	return NULL;
				499	}
				500
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	501	/* Return a Python string holding the default encoded value of the
				502	Unicode object.
				503
				504	The resulting string is cached in the Unicode object for subsequent
				505	usage by this function. The cached version is needed to implement
				506	the character buffer interface and will live (at least) as long as
				507	the Unicode object itself.
				508
				509	The refcount of the string is not incremented.
				510
				511	* Exported for internal use by the interpreter only !!! *
				512
				513	*/
				514
				515	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				516	const char *errors)
				517	{
				518	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				519
				520	if (v)
				521	return v;
				522	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				523	if (v && errors == NULL)
				524	((PyUnicodeObject *)unicode)->defenc = v;
				525	return v;
				526	}
				527
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	528	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				529	{
				530	if (!PyUnicode_Check(unicode)) {
				531	PyErr_BadArgument();
				532	goto onError;
				533	}
				534	return PyUnicode_AS_UNICODE(unicode);
				535
				536	onError:
				537	return NULL;
				538	}
				539
				540	int PyUnicode_GetSize(PyObject *unicode)
				541	{
				542	if (!PyUnicode_Check(unicode)) {
				543	PyErr_BadArgument();
				544	goto onError;
				545	}
				546	return PyUnicode_GET_SIZE(unicode);
				547
				548	onError:
				549	return -1;
				550	}
				551
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	552	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	{
				554	return unicode_default_encoding;
				555	}
				556
				557	int PyUnicode_SetDefaultEncoding(const char *encoding)
				558	{
				559	PyObject *v;
				560
				561	/* Make sure the encoding is valid. As side effect, this also
				562	loads the encoding into the codec registry cache. */
				563	v = _PyCodec_Lookup(encoding);
				564	if (v == NULL)
				565	goto onError;
				566	Py_DECREF(v);
				567	strncpy(unicode_default_encoding,
				568	encoding,
				569	sizeof(unicode_default_encoding));
				570	return 0;
				571
				572	onError:
				573	return -1;
				574	}
				575
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	576	/* --- UTF-8 Codec -------------------------------------------------------- */
				577
				578	static
				579	char utf8_code_length[256] = {
				580	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				581	illegal prefix. see RFC 2279 for details */
				582	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				583	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				584	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				585	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				586	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				591	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				592	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				593	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				594	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				595	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				596	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				597	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				598	};
				599
				600	static
				601	int utf8_decoding_error(const char **source,
				602	Py_UNICODE **dest,
				603	const char *errors,
				604	const char *details)
				605	{
				606	if ((errors == NULL) \|\|
				607	(strcmp(errors,"strict") == 0)) {
				608	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	609	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	610	details);
				611	return -1;
				612	}
				613	else if (strcmp(errors,"ignore") == 0) {
				614	(*source)++;
				615	return 0;
				616	}
				617	else if (strcmp(errors,"replace") == 0) {
				618	(*source)++;
				619	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				620	(*dest)++;
				621	return 0;
				622	}
				623	else {
				624	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	625	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	626	errors);
				627	return -1;
				628	}
				629	}
				630
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	631	PyObject PyUnicode_DecodeUTF8(const char s,
				632	int size,
				633	const char *errors)
				634	{
				635	int n;
				636	const char *e;
				637	PyUnicodeObject *unicode;
				638	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	639	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	640
				641	/* Note: size will always be longer than the resulting Unicode
				642	character count */
				643	unicode = _PyUnicode_New(size);
				644	if (!unicode)
				645	return NULL;
				646	if (size == 0)
				647	return (PyObject *)unicode;
				648
				649	/* Unpack UTF-8 encoded data */
				650	p = unicode->str;
				651	e = s + size;
				652
				653	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	654	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	655
				656	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	657	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	658	s++;
				659	continue;
				660	}
				661
				662	n = utf8_code_length[ch];
				663
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	if (s + n > e) {
				665	errmsg = "unexpected end of data";
				666	goto utf8Error;
				667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	668
				669	switch (n) {
				670
				671	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	672	errmsg = "unexpected code byte";
				673	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	674	break;
				675
				676	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	677	errmsg = "internal error";
				678	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	679	break;
				680
				681	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	682	if ((s[1] & 0xc0) != 0x80) {
				683	errmsg = "invalid data";
				684	goto utf8Error;
				685	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	687	if (ch < 0x80) {
				688	errmsg = "illegal encoding";
				689	goto utf8Error;
				690	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	692	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693	break;
				694
				695	case 3:
				696	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	(s[2] & 0xc0) != 0x80) {
				698	errmsg = "invalid data";
				699	goto utf8Error;
				700	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	701	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				703	errmsg = "illegal encoding";
				704	goto utf8Error;
				705	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	706	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	707	*p++ = (Py_UNICODE)ch;
				708	break;
				709
				710	case 4:
				711	if ((s[1] & 0xc0) != 0x80 \|\|
				712	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	(s[3] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	717	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				718	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				719	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	720	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				721	byte encoding */
				722	(ch > 0x10ffff)) { /* maximum value allowed for
				723	UTF-16 */
				724	errmsg = "illegal encoding";
				725	goto utf8Error;
				726	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	727	/* compute and append the two surrogates: */
				728
				729	/* translate from 10000..10FFFF to 0..FFFF */
				730	ch -= 0x10000;
				731
				732	/* high surrogate = top 10 bits added to D800 */
				733	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				734
				735	/* low surrogate = bottom 10 bits added to DC00 */
				736	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	break;
				738
				739	default:
				740	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	741	errmsg = "unsupported Unicode code range";
				742	goto utf8Error;
				743	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	744	}
				745	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	continue;
				747
				748	utf8Error:
				749	if (utf8_decoding_error(&s, &p, errors, errmsg))
				750	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	751	}
				752
				753	/* Adjust length */
				754	if (_PyUnicode_Resize(unicode, p - unicode->str))
				755	goto onError;
				756
				757	return (PyObject *)unicode;
				758
				759	onError:
				760	Py_DECREF(unicode);
				761	return NULL;
				762	}
				763
/* Kept for reference only: the UTF-8 encoder now supports UTF-16
   surrogates, so this helper is no longer used and is compiled out. */
#if 0
/* Apply the error policy named by `errors` to a UTF-8 encoding problem.

   errors == NULL or "strict": raise UnicodeError built from `details`
                               and return -1.
   "ignore":                   return 0 without writing anything.
   "replace":                  store '?' at **dest, advance *dest, return 0.
   anything else:              raise ValueError and return -1.

   `source` is accepted but not touched. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        /* write the substitute and step the output cursor past it */
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	797
				798	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				799	int size,
				800	const char *errors)
				801	{
				802	PyObject *v;
				803	char *p;
				804	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	805	Py_UCS4 ch2;
				806	unsigned int cbAllocated = 3 * size;
				807	unsigned int cbWritten = 0;
				808	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	809
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	810	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	811	if (v == NULL)
				812	return NULL;
				813	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	814	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	815
				816	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	817	while (i < size) {
				818	Py_UCS4 ch = s[i++];
				819	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	820	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	821	cbWritten++;
				822	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	823	else if (ch < 0x0800) {
				824	*p++ = 0xc0 \| (ch >> 6);
				825	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	826	cbWritten += 2;
				827	}
				828	else {
				829	/* Check for high surrogate */
				830	if (0xD800 <= ch && ch <= 0xDBFF) {
				831	if (i != size) {
				832	ch2 = s[i];
				833	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				834
				835	if (cbWritten >= (cbAllocated - 4)) {
				836	/* Provide enough room for some more
				837	surrogates */
				838	cbAllocated += 4*10;
				839	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	840	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	}
				842
				843	/* combine the two values */
				844	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				845
				846	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	847	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	i++;
				849	cbWritten += 4;
				850	}
				851	}
				852	}
				853	else {
				854	*p++ = (char)(0xe0 \| (ch >> 12));
				855	cbWritten += 3;
				856	}
				857	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				858	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	859	}
				860	}
				861	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	862	if (_PyString_Resize(&v, p - q))
				863	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	864	return v;
				865
				866	onError:
				867	Py_DECREF(v);
				868	return NULL;
				869	}
				870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	871	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				872	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	873	if (!PyUnicode_Check(unicode)) {
				874	PyErr_BadArgument();
				875	return NULL;
				876	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	877	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				878	PyUnicode_GET_SIZE(unicode),
				879	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	880	}
				881
				882	/* --- UTF-16 Codec ------------------------------------------------------- */
				883
				884	static
				885	int utf16_decoding_error(const Py_UNICODE **source,
				886	Py_UNICODE **dest,
				887	const char *errors,
				888	const char *details)
				889	{
				890	if ((errors == NULL) \|\|
				891	(strcmp(errors,"strict") == 0)) {
				892	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	893	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	894	details);
				895	return -1;
				896	}
				897	else if (strcmp(errors,"ignore") == 0) {
				898	return 0;
				899	}
				900	else if (strcmp(errors,"replace") == 0) {
				901	if (dest) {
				902	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				903	(*dest)++;
				904	}
				905	return 0;
				906	}
				907	else {
				908	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	909	"UTF-16 decoding error; "
				910	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	911	errors);
				912	return -1;
				913	}
				914	}
				915
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	916	PyObject PyUnicode_DecodeUTF16(const char s,
				917	int size,
				918	const char *errors,
				919	int *byteorder)
				920	{
				921	PyUnicodeObject *unicode;
				922	Py_UNICODE *p;
				923	const Py_UNICODE q, e;
				924	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	925	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	926
				927	/* size should be an even number */
				928	if (size % sizeof(Py_UNICODE) != 0) {
				929	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				930	return NULL;
				931	/* The remaining input chars are ignored if we fall through
				932	here... */
				933	}
				934
				935	/* Note: size will always be longer than the resulting Unicode
				936	character count */
				937	unicode = _PyUnicode_New(size);
				938	if (!unicode)
				939	return NULL;
				940	if (size == 0)
				941	return (PyObject *)unicode;
				942
				943	/* Unpack UTF-16 encoded data */
				944	p = unicode->str;
				945	q = (Py_UNICODE *)s;
				946	e = q + (size / sizeof(Py_UNICODE));
				947
				948	if (byteorder)
				949	bo = *byteorder;
				950
				951	while (q < e) {
				952	register Py_UNICODE ch = *q++;
				953
				954	/* Check for BOM marks (U+FEFF) in the input and adjust
				955	current byte order setting accordingly. Swap input
				956	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				957	!) */
				958	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				959	if (ch == 0xFEFF) {
				960	bo = -1;
				961	continue;
				962	} else if (ch == 0xFFFE) {
				963	bo = 1;
				964	continue;
				965	}
				966	if (bo == 1)
				967	ch = (ch >> 8) \| (ch << 8);
				968	#else
				969	if (ch == 0xFEFF) {
				970	bo = 1;
				971	continue;
				972	} else if (ch == 0xFFFE) {
				973	bo = -1;
				974	continue;
				975	}
				976	if (bo == -1)
				977	ch = (ch >> 8) \| (ch << 8);
				978	#endif
				979	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				980	*p++ = ch;
				981	continue;
				982	}
				983
				984	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	985	if (q >= e) {
				986	errmsg = "unexpected end of data";
				987	goto utf16Error;
				988	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	989	if (0xDC00 <= q && q <= 0xDFFF) {
				990	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	991	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	/* This is valid data (a UTF-16 surrogate pair), but
				993	we are not able to store this information since our
				994	Py_UNICODE type only has 16 bits... this might
				995	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	996	errmsg = "code pairs are not supported";
				997	goto utf16Error;
				998	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	else
				1000	continue;
				1001	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1002	errmsg = "illegal encoding";
				1003	/* Fall through to report the error */
				1004
				1005	utf16Error:
				1006	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1007	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008	}
				1009
				1010	if (byteorder)
				1011	*byteorder = bo;
				1012
				1013	/* Adjust length */
				1014	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1015	goto onError;
				1016
				1017	return (PyObject *)unicode;
				1018
				1019	onError:
				1020	Py_DECREF(unicode);
				1021	return NULL;
				1022	}
				1023
				1024	#undef UTF16_ERROR
				1025
				1026	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1027	int size,
				1028	const char *errors,
				1029	int byteorder)
				1030	{
				1031	PyObject *v;
				1032	Py_UNICODE *p;
				1033	char *q;
				1034
				1035	/* We don't create UTF-16 pairs... */
				1036	v = PyString_FromStringAndSize(NULL,
				1037	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1038	if (v == NULL)
				1039	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1040
				1041	q = PyString_AS_STRING(v);
				1042	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043	if (byteorder == 0)
				1044	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1045	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1046	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1047	if (byteorder == 0 \|\|
				1048	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1049	byteorder == -1
				1050	#else
				1051	byteorder == 1
				1052	#endif
				1053	)
				1054	memcpy(p, s, size * sizeof(Py_UNICODE));
				1055	else
				1056	while (size-- > 0) {
				1057	Py_UNICODE ch = *s++;
				1058	*p++ = (ch >> 8) \| (ch << 8);
				1059	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	return v;
				1061	}
				1062
				1063	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1064	{
				1065	if (!PyUnicode_Check(unicode)) {
				1066	PyErr_BadArgument();
				1067	return NULL;
				1068	}
				1069	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1070	PyUnicode_GET_SIZE(unicode),
				1071	NULL,
				1072	0);
				1073	}
				1074
				1075	/* --- Unicode Escape Codec ----------------------------------------------- */
				1076
				1077	static
				1078	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1079	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	const char *errors,
				1081	const char *details)
				1082	{
				1083	if ((errors == NULL) \|\|
				1084	(strcmp(errors,"strict") == 0)) {
				1085	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1086	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	details);
				1088	return -1;
				1089	}
				1090	else if (strcmp(errors,"ignore") == 0) {
				1091	return 0;
				1092	}
				1093	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1094	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1095	return 0;
				1096	}
				1097	else {
				1098	PyErr_Format(PyExc_ValueError,
				1099	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1100	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1101	errors);
				1102	return -1;
				1103	}
				1104	}
				1105
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1106	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1107
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1109	int size,
				1110	const char *errors)
				1111	{
				1112	PyUnicodeObject *v;
				1113	Py_UNICODE p = NULL, buf = NULL;
				1114	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1115	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1116
				1117	/* Escaped strings will always be longer than the resulting
				1118	Unicode string, so we start with size here and then reduce the
				1119	length after conversion to the true value. */
				1120	v = _PyUnicode_New(size);
				1121	if (v == NULL)
				1122	goto onError;
				1123	if (size == 0)
				1124	return (PyObject *)v;
				1125	p = buf = PyUnicode_AS_UNICODE(v);
				1126	end = s + size;
				1127	while (s < end) {
				1128	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1129	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130	int i;
				1131
				1132	/* Non-escape characters are interpreted as Unicode ordinals */
				1133	if (*s != '\\') {
				1134	p++ = (unsigned char)s++;
				1135	continue;
				1136	}
				1137
				1138	/* \ - Escapes */
				1139	s++;
				1140	switch (*s++) {
				1141
				1142	/* \x escapes */
				1143	case '\n': break;
				1144	case '\\': *p++ = '\\'; break;
				1145	case '\'': *p++ = '\''; break;
				1146	case '\"': *p++ = '\"'; break;
				1147	case 'b': *p++ = '\b'; break;
				1148	case 'f': p++ = '\014'; break; / FF */
				1149	case 't': *p++ = '\t'; break;
				1150	case 'n': *p++ = '\n'; break;
				1151	case 'r': *p++ = '\r'; break;
				1152	case 'v': p++ = '\013'; break; / VT */
				1153	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1154
				1155	/* \OOO (octal) escapes */
				1156	case '0': case '1': case '2': case '3':
				1157	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1158	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1160	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1162	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1163	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1164	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1165	break;
				1166
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1167	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1168	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1169	for (x = 0, i = 0; i < 2; i++) {
				1170	c = (unsigned char)s[i];
				1171	if (!isxdigit(c)) {
				1172	if (unicodeescape_decoding_error(&s, &x, errors,
				1173	"truncated \\xXX"))
				1174	goto onError;
				1175	i++;
				1176	break;
				1177	}
				1178	x = (x<<4) & ~0xF;
				1179	if (c >= '0' && c <= '9')
				1180	x += c - '0';
				1181	else if (c >= 'a' && c <= 'f')
				1182	x += 10 + c - 'a';
				1183	else
				1184	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1185	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1186	s += i;
				1187	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1188	break;
				1189
				1190	/* \uXXXX with 4 hex digits */
				1191	case 'u':
				1192	for (x = 0, i = 0; i < 4; i++) {
				1193	c = (unsigned char)s[i];
				1194	if (!isxdigit(c)) {
				1195	if (unicodeescape_decoding_error(&s, &x, errors,
				1196	"truncated \\uXXXX"))
				1197	goto onError;
				1198	i++;
				1199	break;
				1200	}
				1201	x = (x<<4) & ~0xF;
				1202	if (c >= '0' && c <= '9')
				1203	x += c - '0';
				1204	else if (c >= 'a' && c <= 'f')
				1205	x += 10 + c - 'a';
				1206	else
				1207	x += 10 + c - 'A';
				1208	}
				1209	s += i;
				1210	*p++ = x;
				1211	break;
				1212
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1213	/* \UXXXXXXXX with 8 hex digits */
				1214	case 'U':
				1215	for (chr = 0, i = 0; i < 8; i++) {
				1216	c = (unsigned char)s[i];
				1217	if (!isxdigit(c)) {
				1218	if (unicodeescape_decoding_error(&s, &x, errors,
				1219	"truncated \\uXXXX"))
				1220	goto onError;
				1221	i++;
				1222	break;
				1223	}
				1224	chr = (chr<<4) & ~0xF;
				1225	if (c >= '0' && c <= '9')
				1226	chr += c - '0';
				1227	else if (c >= 'a' && c <= 'f')
				1228	chr += 10 + c - 'a';
				1229	else
				1230	chr += 10 + c - 'A';
				1231	}
				1232	s += i;
				1233	goto store;
				1234
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1235	case 'N':
				1236	/* Ok, we need to deal with Unicode Character Names now,
				1237	* make sure we've imported the hash table data...
				1238	*/
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1239	if (ucnhash_CAPI == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1240	PyObject mod = 0, v = 0;
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1241	mod = PyImport_ImportModule("unicodedata");
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1242	if (mod == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1243	goto ucnhashError;
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1244	v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1245	Py_DECREF(mod);
				1246	if (v == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1247	goto ucnhashError;
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1248	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1249	Py_DECREF(v);
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1250	if (ucnhash_CAPI == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1251	goto ucnhashError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1252	}
				1253
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1254	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1255	const char *start = s + 1;
				1256	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1257
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1258	/* look for the closing brace */
				1259	while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1260	endBrace++;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1261	if (endBrace != end && *endBrace == '}') {
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1262	if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1263	if (unicodeescape_decoding_error(
				1264	&s, &x, errors,
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1265	"Invalid Unicode Character Name")
				1266	)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1267	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1268	goto ucnFallthrough;
				1269	}
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1270	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1271	goto store;
				1272	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1273	if (unicodeescape_decoding_error(
				1274	&s, &x, errors,
				1275	"Unicode name missing closing brace"))
				1276	goto onError;
				1277	goto ucnFallthrough;
				1278	}
				1279	break;
				1280	}
				1281	if (unicodeescape_decoding_error(
				1282	&s, &x, errors,
				1283	"Missing opening brace for Unicode Character Name escape"))
				1284	goto onError;
				1285	ucnFallthrough:
				1286	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1287	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	*p++ = '\\';
				1289	*p++ = (unsigned char)s[-1];
				1290	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1291	store:
				1292	/* when we get here, chr is a 32-bit unicode character */
				1293	if (chr <= 0xffff)
				1294	/* UCS-2 character */
				1295	*p++ = (Py_UNICODE) chr;
				1296	else if (chr <= 0x10ffff) {
				1297	/* UCS-4 character. store as two surrogate characters */
				1298	chr -= 0x10000L;
				1299	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1300	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1301	} else {
				1302	if (unicodeescape_decoding_error(
				1303	&s, &x, errors,
				1304	"Illegal Unicode character")
				1305	)
				1306	goto onError;
				1307	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1308	}
				1309	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1310	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1311	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1312	return (PyObject *)v;
				1313
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1314	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame^]	1315	PyErr_SetString(
				1316	PyExc_UnicodeError,
				1317	"\\N escapes not supported (can't load unicodedata module)"
				1318	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1319	return NULL;
				1320
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1321	onError:
				1322	Py_XDECREF(v);
				1323	return NULL;
				1324	}
				1325
				1326	/* Return a Unicode-Escape string version of the Unicode object.
				1327
				1328	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1329	appropriate.
				1330
				1331	*/
				1332
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1333	static const Py_UNICODE findchar(const Py_UNICODE s,
				1334	int size,
				1335	Py_UNICODE ch);
				1336
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1337	static
				1338	PyObject unicodeescape_string(const Py_UNICODE s,
				1339	int size,
				1340	int quotes)
				1341	{
				1342	PyObject *repr;
				1343	char *p;
				1344	char *q;
				1345
				1346	static const char *hexdigit = "0123456789ABCDEF";
				1347
				1348	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1349	if (repr == NULL)
				1350	return NULL;
				1351
				1352	p = q = PyString_AS_STRING(repr);
				1353
				1354	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1355	*p++ = 'u';
				1356	*p++ = (findchar(s, size, '\'') &&
				1357	!findchar(s, size, '"')) ? '"' : '\'';
				1358	}
				1359	while (size-- > 0) {
				1360	Py_UNICODE ch = *s++;
				1361	/* Escape quotes */
				1362	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1363	*p++ = '\\';
				1364	*p++ = (char) ch;
				1365	}
				1366	/* Map 16-bit characters to '\uxxxx' */
				1367	else if (ch >= 256) {
				1368	*p++ = '\\';
				1369	*p++ = 'u';
				1370	*p++ = hexdigit[(ch >> 12) & 0xf];
				1371	*p++ = hexdigit[(ch >> 8) & 0xf];
				1372	*p++ = hexdigit[(ch >> 4) & 0xf];
				1373	*p++ = hexdigit[ch & 15];
				1374	}
				1375	/* Map non-printable US ASCII to '\ooo' */
				1376	else if (ch < ' ' \|\| ch >= 128) {
				1377	*p++ = '\\';
				1378	*p++ = hexdigit[(ch >> 6) & 7];
				1379	*p++ = hexdigit[(ch >> 3) & 7];
				1380	*p++ = hexdigit[ch & 7];
				1381	}
				1382	/* Copy everything else as-is */
				1383	else
				1384	*p++ = (char) ch;
				1385	}
				1386	if (quotes)
				1387	*p++ = q[1];
				1388
				1389	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1390	if (_PyString_Resize(&repr, p - q))
				1391	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1392
				1393	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1394
				1395	onError:
				1396	Py_DECREF(repr);
				1397	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1398	}
				1399
				1400	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1401	int size)
				1402	{
				1403	return unicodeescape_string(s, size, 0);
				1404	}
				1405
				1406	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1407	{
				1408	if (!PyUnicode_Check(unicode)) {
				1409	PyErr_BadArgument();
				1410	return NULL;
				1411	}
				1412	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1413	PyUnicode_GET_SIZE(unicode));
				1414	}
				1415
				1416	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1417
				1418	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1419	int size,
				1420	const char *errors)
				1421	{
				1422	PyUnicodeObject *v;
				1423	Py_UNICODE p, buf;
				1424	const char *end;
				1425	const char *bs;
				1426
				1427	/* Escaped strings will always be longer than the resulting
				1428	Unicode string, so we start with size here and then reduce the
				1429	length after conversion to the true value. */
				1430	v = _PyUnicode_New(size);
				1431	if (v == NULL)
				1432	goto onError;
				1433	if (size == 0)
				1434	return (PyObject *)v;
				1435	p = buf = PyUnicode_AS_UNICODE(v);
				1436	end = s + size;
				1437	while (s < end) {
				1438	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1439	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1440	int i;
				1441
				1442	/* Non-escape characters are interpreted as Unicode ordinals */
				1443	if (*s != '\\') {
				1444	p++ = (unsigned char)s++;
				1445	continue;
				1446	}
				1447
				1448	/* \u-escapes are only interpreted iff the number of leading
				1449	backslashes if odd */
				1450	bs = s;
				1451	for (;s < end;) {
				1452	if (*s != '\\')
				1453	break;
				1454	p++ = (unsigned char)s++;
				1455	}
				1456	if (((s - bs) & 1) == 0 \|\|
				1457	s >= end \|\|
				1458	*s != 'u') {
				1459	continue;
				1460	}
				1461	p--;
				1462	s++;
				1463
				1464	/* \uXXXX with 4 hex digits */
				1465	for (x = 0, i = 0; i < 4; i++) {
				1466	c = (unsigned char)s[i];
				1467	if (!isxdigit(c)) {
				1468	if (unicodeescape_decoding_error(&s, &x, errors,
				1469	"truncated \\uXXXX"))
				1470	goto onError;
				1471	i++;
				1472	break;
				1473	}
				1474	x = (x<<4) & ~0xF;
				1475	if (c >= '0' && c <= '9')
				1476	x += c - '0';
				1477	else if (c >= 'a' && c <= 'f')
				1478	x += 10 + c - 'a';
				1479	else
				1480	x += 10 + c - 'A';
				1481	}
				1482	s += i;
				1483	*p++ = x;
				1484	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1485	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1486	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1487	return (PyObject *)v;
				1488
				1489	onError:
				1490	Py_XDECREF(v);
				1491	return NULL;
				1492	}
				1493
				1494	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1495	int size)
				1496	{
				1497	PyObject *repr;
				1498	char *p;
				1499	char *q;
				1500
				1501	static const char *hexdigit = "0123456789ABCDEF";
				1502
				1503	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1504	if (repr == NULL)
				1505	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1506	if (size == 0)
				1507	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1508
				1509	p = q = PyString_AS_STRING(repr);
				1510	while (size-- > 0) {
				1511	Py_UNICODE ch = *s++;
				1512	/* Map 16-bit characters to '\uxxxx' */
				1513	if (ch >= 256) {
				1514	*p++ = '\\';
				1515	*p++ = 'u';
				1516	*p++ = hexdigit[(ch >> 12) & 0xf];
				1517	*p++ = hexdigit[(ch >> 8) & 0xf];
				1518	*p++ = hexdigit[(ch >> 4) & 0xf];
				1519	*p++ = hexdigit[ch & 15];
				1520	}
				1521	/* Copy everything else as-is */
				1522	else
				1523	*p++ = (char) ch;
				1524	}
				1525	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1526	if (_PyString_Resize(&repr, p - q))
				1527	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1528
				1529	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1530
				1531	onError:
				1532	Py_DECREF(repr);
				1533	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1534	}
				1535
				1536	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1537	{
				1538	if (!PyUnicode_Check(unicode)) {
				1539	PyErr_BadArgument();
				1540	return NULL;
				1541	}
				1542	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1543	PyUnicode_GET_SIZE(unicode));
				1544	}
				1545
				1546	/* --- Latin-1 Codec ------------------------------------------------------ */
				1547
				1548	PyObject PyUnicode_DecodeLatin1(const char s,
				1549	int size,
				1550	const char *errors)
				1551	{
				1552	PyUnicodeObject *v;
				1553	Py_UNICODE *p;
				1554
				1555	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1556	v = _PyUnicode_New(size);
				1557	if (v == NULL)
				1558	goto onError;
				1559	if (size == 0)
				1560	return (PyObject *)v;
				1561	p = PyUnicode_AS_UNICODE(v);
				1562	while (size-- > 0)
				1563	p++ = (unsigned char)s++;
				1564	return (PyObject *)v;
				1565
				1566	onError:
				1567	Py_XDECREF(v);
				1568	return NULL;
				1569	}
				1570
				1571	static
				1572	int latin1_encoding_error(const Py_UNICODE **source,
				1573	char **dest,
				1574	const char *errors,
				1575	const char *details)
				1576	{
				1577	if ((errors == NULL) \|\|
				1578	(strcmp(errors,"strict") == 0)) {
				1579	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1580	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1581	details);
				1582	return -1;
				1583	}
				1584	else if (strcmp(errors,"ignore") == 0) {
				1585	return 0;
				1586	}
				1587	else if (strcmp(errors,"replace") == 0) {
				1588	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1589	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	return 0;
				1591	}
				1592	else {
				1593	PyErr_Format(PyExc_ValueError,
				1594	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1595	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1596	errors);
				1597	return -1;
				1598	}
				1599	}
				1600
				1601	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1602	int size,
				1603	const char *errors)
				1604	{
				1605	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1606	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1607
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1608	repr = PyString_FromStringAndSize(NULL, size);
				1609	if (repr == NULL)
				1610	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1611	if (size == 0)
				1612	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1613
				1614	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1615	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1616	while (size-- > 0) {
				1617	Py_UNICODE ch = *p++;
				1618	if (ch >= 256) {
				1619	if (latin1_encoding_error(&p, &s, errors,
				1620	"ordinal not in range(256)"))
				1621	goto onError;
				1622	}
				1623	else
				1624	*s++ = (char)ch;
				1625	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1626	/* Resize if error handling skipped some characters */
				1627	if (s - start < PyString_GET_SIZE(repr))
				1628	if (_PyString_Resize(&repr, s - start))
				1629	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1630	return repr;
				1631
				1632	onError:
				1633	Py_DECREF(repr);
				1634	return NULL;
				1635	}
				1636
				1637	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1638	{
				1639	if (!PyUnicode_Check(unicode)) {
				1640	PyErr_BadArgument();
				1641	return NULL;
				1642	}
				1643	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1644	PyUnicode_GET_SIZE(unicode),
				1645	NULL);
				1646	}
				1647
				1648	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1649
				1650	static
				1651	int ascii_decoding_error(const char **source,
				1652	Py_UNICODE **dest,
				1653	const char *errors,
				1654	const char *details)
				1655	{
				1656	if ((errors == NULL) \|\|
				1657	(strcmp(errors,"strict") == 0)) {
				1658	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1659	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1660	details);
				1661	return -1;
				1662	}
				1663	else if (strcmp(errors,"ignore") == 0) {
				1664	return 0;
				1665	}
				1666	else if (strcmp(errors,"replace") == 0) {
				1667	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1668	(*dest)++;
				1669	return 0;
				1670	}
				1671	else {
				1672	PyErr_Format(PyExc_ValueError,
				1673	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1674	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1675	errors);
				1676	return -1;
				1677	}
				1678	}
				1679
				1680	PyObject PyUnicode_DecodeASCII(const char s,
				1681	int size,
				1682	const char *errors)
				1683	{
				1684	PyUnicodeObject *v;
				1685	Py_UNICODE *p;
				1686
				1687	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1688	v = _PyUnicode_New(size);
				1689	if (v == NULL)
				1690	goto onError;
				1691	if (size == 0)
				1692	return (PyObject *)v;
				1693	p = PyUnicode_AS_UNICODE(v);
				1694	while (size-- > 0) {
				1695	register unsigned char c;
				1696
				1697	c = (unsigned char)*s++;
				1698	if (c < 128)
				1699	*p++ = c;
				1700	else if (ascii_decoding_error(&s, &p, errors,
				1701	"ordinal not in range(128)"))
				1702	goto onError;
				1703	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1704	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1705	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1706	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1707	return (PyObject *)v;
				1708
				1709	onError:
				1710	Py_XDECREF(v);
				1711	return NULL;
				1712	}
				1713
				1714	static
				1715	int ascii_encoding_error(const Py_UNICODE **source,
				1716	char **dest,
				1717	const char *errors,
				1718	const char *details)
				1719	{
				1720	if ((errors == NULL) \|\|
				1721	(strcmp(errors,"strict") == 0)) {
				1722	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1723	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1724	details);
				1725	return -1;
				1726	}
				1727	else if (strcmp(errors,"ignore") == 0) {
				1728	return 0;
				1729	}
				1730	else if (strcmp(errors,"replace") == 0) {
				1731	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1732	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1733	return 0;
				1734	}
				1735	else {
				1736	PyErr_Format(PyExc_ValueError,
				1737	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1738	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1739	errors);
				1740	return -1;
				1741	}
				1742	}
				1743
				1744	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1745	int size,
				1746	const char *errors)
				1747	{
				1748	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1749	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1750
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1751	repr = PyString_FromStringAndSize(NULL, size);
				1752	if (repr == NULL)
				1753	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1754	if (size == 0)
				1755	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1756
				1757	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1758	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1759	while (size-- > 0) {
				1760	Py_UNICODE ch = *p++;
				1761	if (ch >= 128) {
				1762	if (ascii_encoding_error(&p, &s, errors,
				1763	"ordinal not in range(128)"))
				1764	goto onError;
				1765	}
				1766	else
				1767	*s++ = (char)ch;
				1768	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1769	/* Resize if error handling skipped some characters */
				1770	if (s - start < PyString_GET_SIZE(repr))
				1771	if (_PyString_Resize(&repr, s - start))
				1772	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1773	return repr;
				1774
				1775	onError:
				1776	Py_DECREF(repr);
				1777	return NULL;
				1778	}
				1779
				1780	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1781	{
				1782	if (!PyUnicode_Check(unicode)) {
				1783	PyErr_BadArgument();
				1784	return NULL;
				1785	}
				1786	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1787	PyUnicode_GET_SIZE(unicode),
				1788	NULL);
				1789	}
				1790
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1791	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1792
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1793	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1794
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1795	PyObject PyUnicode_DecodeMBCS(const char s,
				1796	int size,
				1797	const char *errors)
				1798	{
				1799	PyUnicodeObject *v;
				1800	Py_UNICODE *p;
				1801
				1802	/* First get the size of the result */
				1803	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1804	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1805	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1806
				1807	v = _PyUnicode_New(usize);
				1808	if (v == NULL)
				1809	return NULL;
				1810	if (usize == 0)
				1811	return (PyObject *)v;
				1812	p = PyUnicode_AS_UNICODE(v);
				1813	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1814	Py_DECREF(v);
				1815	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1816	}
				1817
				1818	return (PyObject *)v;
				1819	}
				1820
				1821	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1822	int size,
				1823	const char *errors)
				1824	{
				1825	PyObject *repr;
				1826	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1827	DWORD mbcssize;
				1828
				1829	/* If there are no characters, bail now! */
				1830	if (size==0)
				1831	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1832
				1833	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1834	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1835	if (mbcssize==0)
				1836	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1837
				1838	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1839	if (repr == NULL)
				1840	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1841	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1842	return repr;
				1843
				1844	/* Do the conversion */
				1845	s = PyString_AS_STRING(repr);
				1846	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1847	Py_DECREF(repr);
				1848	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1849	}
				1850	return repr;
				1851	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1852
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1853	#endif /* MS_WIN32 */
				1854
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1855	/* --- Character Mapping Codec -------------------------------------------- */
				1856
				1857	static
				1858	int charmap_decoding_error(const char **source,
				1859	Py_UNICODE **dest,
				1860	const char *errors,
				1861	const char *details)
				1862	{
				1863	if ((errors == NULL) \|\|
				1864	(strcmp(errors,"strict") == 0)) {
				1865	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1866	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1867	details);
				1868	return -1;
				1869	}
				1870	else if (strcmp(errors,"ignore") == 0) {
				1871	return 0;
				1872	}
				1873	else if (strcmp(errors,"replace") == 0) {
				1874	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1875	(*dest)++;
				1876	return 0;
				1877	}
				1878	else {
				1879	PyErr_Format(PyExc_ValueError,
				1880	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1881	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1882	errors);
				1883	return -1;
				1884	}
				1885	}
				1886
				1887	PyObject PyUnicode_DecodeCharmap(const char s,
				1888	int size,
				1889	PyObject *mapping,
				1890	const char *errors)
				1891	{
				1892	PyUnicodeObject *v;
				1893	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1894	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1895
				1896	/* Default to Latin-1 */
				1897	if (mapping == NULL)
				1898	return PyUnicode_DecodeLatin1(s, size, errors);
				1899
				1900	v = _PyUnicode_New(size);
				1901	if (v == NULL)
				1902	goto onError;
				1903	if (size == 0)
				1904	return (PyObject *)v;
				1905	p = PyUnicode_AS_UNICODE(v);
				1906	while (size-- > 0) {
				1907	unsigned char ch = *s++;
				1908	PyObject w, x;
				1909
				1910	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1911	w = PyInt_FromLong((long)ch);
				1912	if (w == NULL)
				1913	goto onError;
				1914	x = PyObject_GetItem(mapping, w);
				1915	Py_DECREF(w);
				1916	if (x == NULL) {
				1917	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1918	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1919	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1920	x = Py_None;
				1921	Py_INCREF(x);
				1922	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1923	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1924	}
				1925
				1926	/* Apply mapping */
				1927	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1928	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1929	if (value < 0 \|\| value > 65535) {
				1930	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1931	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1932	Py_DECREF(x);
				1933	goto onError;
				1934	}
				1935	*p++ = (Py_UNICODE)value;
				1936	}
				1937	else if (x == Py_None) {
				1938	/* undefined mapping */
				1939	if (charmap_decoding_error(&s, &p, errors,
				1940	"character maps to <undefined>")) {
				1941	Py_DECREF(x);
				1942	goto onError;
				1943	}
				1944	}
				1945	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1946	int targetsize = PyUnicode_GET_SIZE(x);
				1947
				1948	if (targetsize == 1)
				1949	/* 1-1 mapping */
				1950	p++ = PyUnicode_AS_UNICODE(x);
				1951
				1952	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1953	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1954	if (targetsize > extrachars) {
				1955	/* resize first */
				1956	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				1957	int needed = (targetsize - extrachars) + \
				1958	(targetsize << 2);
				1959	extrachars += needed;
				1960	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1961	Py_DECREF(x);
				1962	goto onError;
				1963	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1964	p = PyUnicode_AS_UNICODE(v) + oldpos;
				1965	}
				1966	Py_UNICODE_COPY(p,
				1967	PyUnicode_AS_UNICODE(x),
				1968	targetsize);
				1969	p += targetsize;
				1970	extrachars -= targetsize;
				1971	}
				1972	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1973	}
				1974	else {
				1975	/* wrong return value */
				1976	PyErr_SetString(PyExc_TypeError,
				1977	"character mapping must return integer, None or unicode");
				1978	Py_DECREF(x);
				1979	goto onError;
				1980	}
				1981	Py_DECREF(x);
				1982	}
				1983	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1984	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1985	goto onError;
				1986	return (PyObject *)v;
				1987
				1988	onError:
				1989	Py_XDECREF(v);
				1990	return NULL;
				1991	}
				1992
				1993	static
				1994	int charmap_encoding_error(const Py_UNICODE **source,
				1995	char **dest,
				1996	const char *errors,
				1997	const char *details)
				1998	{
				1999	if ((errors == NULL) \|\|
				2000	(strcmp(errors,"strict") == 0)) {
				2001	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2002	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2003	details);
				2004	return -1;
				2005	}
				2006	else if (strcmp(errors,"ignore") == 0) {
				2007	return 0;
				2008	}
				2009	else if (strcmp(errors,"replace") == 0) {
				2010	**dest = '?';
				2011	(*dest)++;
				2012	return 0;
				2013	}
				2014	else {
				2015	PyErr_Format(PyExc_ValueError,
				2016	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2017	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2018	errors);
				2019	return -1;
				2020	}
				2021	}
				2022
				2023	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2024	int size,
				2025	PyObject *mapping,
				2026	const char *errors)
				2027	{
				2028	PyObject *v;
				2029	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2030	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2031
				2032	/* Default to Latin-1 */
				2033	if (mapping == NULL)
				2034	return PyUnicode_EncodeLatin1(p, size, errors);
				2035
				2036	v = PyString_FromStringAndSize(NULL, size);
				2037	if (v == NULL)
				2038	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2039	if (size == 0)
				2040	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2041	s = PyString_AS_STRING(v);
				2042	while (size-- > 0) {
				2043	Py_UNICODE ch = *p++;
				2044	PyObject w, x;
				2045
				2046	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2047	w = PyInt_FromLong((long)ch);
				2048	if (w == NULL)
				2049	goto onError;
				2050	x = PyObject_GetItem(mapping, w);
				2051	Py_DECREF(w);
				2052	if (x == NULL) {
				2053	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2054	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2055	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2056	x = Py_None;
				2057	Py_INCREF(x);
				2058	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2059	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2060	}
				2061
				2062	/* Apply mapping */
				2063	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2064	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2065	if (value < 0 \|\| value > 255) {
				2066	PyErr_SetString(PyExc_TypeError,
				2067	"character mapping must be in range(256)");
				2068	Py_DECREF(x);
				2069	goto onError;
				2070	}
				2071	*s++ = (char)value;
				2072	}
				2073	else if (x == Py_None) {
				2074	/* undefined mapping */
				2075	if (charmap_encoding_error(&p, &s, errors,
				2076	"character maps to <undefined>")) {
				2077	Py_DECREF(x);
				2078	goto onError;
				2079	}
				2080	}
				2081	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2082	int targetsize = PyString_GET_SIZE(x);
				2083
				2084	if (targetsize == 1)
				2085	/* 1-1 mapping */
				2086	s++ = PyString_AS_STRING(x);
				2087
				2088	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2089	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2090	if (targetsize > extrachars) {
				2091	/* resize first */
				2092	int oldpos = (int)(s - PyString_AS_STRING(v));
				2093	int needed = (targetsize - extrachars) + \
				2094	(targetsize << 2);
				2095	extrachars += needed;
				2096	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2097	Py_DECREF(x);
				2098	goto onError;
				2099	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2100	s = PyString_AS_STRING(v) + oldpos;
				2101	}
				2102	memcpy(s,
				2103	PyString_AS_STRING(x),
				2104	targetsize);
				2105	s += targetsize;
				2106	extrachars -= targetsize;
				2107	}
				2108	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2109	}
				2110	else {
				2111	/* wrong return value */
				2112	PyErr_SetString(PyExc_TypeError,
				2113	"character mapping must return integer, None or unicode");
				2114	Py_DECREF(x);
				2115	goto onError;
				2116	}
				2117	Py_DECREF(x);
				2118	}
				2119	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2120	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2121	goto onError;
				2122	return v;
				2123
				2124	onError:
				2125	Py_DECREF(v);
				2126	return NULL;
				2127	}
				2128
				2129	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2130	PyObject *mapping)
				2131	{
				2132	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2133	PyErr_BadArgument();
				2134	return NULL;
				2135	}
				2136	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2137	PyUnicode_GET_SIZE(unicode),
				2138	mapping,
				2139	NULL);
				2140	}
				2141
				2142	static
				2143	int translate_error(const Py_UNICODE **source,
				2144	Py_UNICODE **dest,
				2145	const char *errors,
				2146	const char *details)
				2147	{
				2148	if ((errors == NULL) \|\|
				2149	(strcmp(errors,"strict") == 0)) {
				2150	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2151	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2152	details);
				2153	return -1;
				2154	}
				2155	else if (strcmp(errors,"ignore") == 0) {
				2156	return 0;
				2157	}
				2158	else if (strcmp(errors,"replace") == 0) {
				2159	**dest = '?';
				2160	(*dest)++;
				2161	return 0;
				2162	}
				2163	else {
				2164	PyErr_Format(PyExc_ValueError,
				2165	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2166	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2167	errors);
				2168	return -1;
				2169	}
				2170	}
				2171
				2172	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2173	int size,
				2174	PyObject *mapping,
				2175	const char *errors)
				2176	{
				2177	PyUnicodeObject *v;
				2178	Py_UNICODE *p;
				2179
				2180	if (mapping == NULL) {
				2181	PyErr_BadArgument();
				2182	return NULL;
				2183	}
				2184
				2185	/* Output will never be longer than input */
				2186	v = _PyUnicode_New(size);
				2187	if (v == NULL)
				2188	goto onError;
				2189	if (size == 0)
				2190	goto done;
				2191	p = PyUnicode_AS_UNICODE(v);
				2192	while (size-- > 0) {
				2193	Py_UNICODE ch = *s++;
				2194	PyObject w, x;
				2195
				2196	/* Get mapping */
				2197	w = PyInt_FromLong(ch);
				2198	if (w == NULL)
				2199	goto onError;
				2200	x = PyObject_GetItem(mapping, w);
				2201	Py_DECREF(w);
				2202	if (x == NULL) {
				2203	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2204	/* No mapping found: default to 1-1 mapping */
				2205	PyErr_Clear();
				2206	*p++ = ch;
				2207	continue;
				2208	}
				2209	goto onError;
				2210	}
				2211
				2212	/* Apply mapping */
				2213	if (PyInt_Check(x))
				2214	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2215	else if (x == Py_None) {
				2216	/* undefined mapping */
				2217	if (translate_error(&s, &p, errors,
				2218	"character maps to <undefined>")) {
				2219	Py_DECREF(x);
				2220	goto onError;
				2221	}
				2222	}
				2223	else if (PyUnicode_Check(x)) {
				2224	if (PyUnicode_GET_SIZE(x) != 1) {
				2225	/* 1-n mapping */
				2226	PyErr_SetString(PyExc_NotImplementedError,
				2227	"1-n mappings are currently not implemented");
				2228	Py_DECREF(x);
				2229	goto onError;
				2230	}
				2231	p++ = PyUnicode_AS_UNICODE(x);
				2232	}
				2233	else {
				2234	/* wrong return value */
				2235	PyErr_SetString(PyExc_TypeError,
				2236	"translate mapping must return integer, None or unicode");
				2237	Py_DECREF(x);
				2238	goto onError;
				2239	}
				2240	Py_DECREF(x);
				2241	}
				2242	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2243	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2244	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2245
				2246	done:
				2247	return (PyObject *)v;
				2248
				2249	onError:
				2250	Py_XDECREF(v);
				2251	return NULL;
				2252	}
				2253
				2254	PyObject PyUnicode_Translate(PyObject str,
				2255	PyObject *mapping,
				2256	const char *errors)
				2257	{
				2258	PyObject *result;
				2259
				2260	str = PyUnicode_FromObject(str);
				2261	if (str == NULL)
				2262	goto onError;
				2263	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2264	PyUnicode_GET_SIZE(str),
				2265	mapping,
				2266	errors);
				2267	Py_DECREF(str);
				2268	return result;
				2269
				2270	onError:
				2271	Py_XDECREF(str);
				2272	return NULL;
				2273	}
				2274
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2275	/* --- Decimal Encoder ---------------------------------------------------- */
				2276
				2277	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2278	int length,
				2279	char *output,
				2280	const char *errors)
				2281	{
				2282	Py_UNICODE p, end;
				2283
				2284	if (output == NULL) {
				2285	PyErr_BadArgument();
				2286	return -1;
				2287	}
				2288
				2289	p = s;
				2290	end = s + length;
				2291	while (p < end) {
				2292	register Py_UNICODE ch = *p++;
				2293	int decimal;
				2294
				2295	if (Py_UNICODE_ISSPACE(ch)) {
				2296	*output++ = ' ';
				2297	continue;
				2298	}
				2299	decimal = Py_UNICODE_TODECIMAL(ch);
				2300	if (decimal >= 0) {
				2301	*output++ = '0' + decimal;
				2302	continue;
				2303	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2304	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2305	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2306	continue;
				2307	}
				2308	/* All other characters are considered invalid */
				2309	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2310	PyErr_SetString(PyExc_ValueError,
				2311	"invalid decimal Unicode string");
				2312	goto onError;
				2313	}
				2314	else if (strcmp(errors, "ignore") == 0)
				2315	continue;
				2316	else if (strcmp(errors, "replace") == 0) {
				2317	*output++ = '?';
				2318	continue;
				2319	}
				2320	}
				2321	/* 0-terminate the output string */
				2322	*output++ = '\0';
				2323	return 0;
				2324
				2325	onError:
				2326	return -1;
				2327	}
				2328
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2329	/* --- Helpers ------------------------------------------------------------ */
				2330
				2331	static
				2332	int count(PyUnicodeObject *self,
				2333	int start,
				2334	int end,
				2335	PyUnicodeObject *substring)
				2336	{
				2337	int count = 0;
				2338
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2339	if (start < 0)
				2340	start += self->length;
				2341	if (start < 0)
				2342	start = 0;
				2343	if (end > self->length)
				2344	end = self->length;
				2345	if (end < 0)
				2346	end += self->length;
				2347	if (end < 0)
				2348	end = 0;
				2349
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2350	if (substring->length == 0)
				2351	return (end - start + 1);
				2352
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2353	end -= substring->length;
				2354
				2355	while (start <= end)
				2356	if (Py_UNICODE_MATCH(self, start, substring)) {
				2357	count++;
				2358	start += substring->length;
				2359	} else
				2360	start++;
				2361
				2362	return count;
				2363	}
				2364
				2365	int PyUnicode_Count(PyObject *str,
				2366	PyObject *substr,
				2367	int start,
				2368	int end)
				2369	{
				2370	int result;
				2371
				2372	str = PyUnicode_FromObject(str);
				2373	if (str == NULL)
				2374	return -1;
				2375	substr = PyUnicode_FromObject(substr);
				2376	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2377	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2378	return -1;
				2379	}
				2380
				2381	result = count((PyUnicodeObject *)str,
				2382	start, end,
				2383	(PyUnicodeObject *)substr);
				2384
				2385	Py_DECREF(str);
				2386	Py_DECREF(substr);
				2387	return result;
				2388	}
				2389
				2390	static
				2391	int findstring(PyUnicodeObject *self,
				2392	PyUnicodeObject *substring,
				2393	int start,
				2394	int end,
				2395	int direction)
				2396	{
				2397	if (start < 0)
				2398	start += self->length;
				2399	if (start < 0)
				2400	start = 0;
				2401
				2402	if (substring->length == 0)
				2403	return start;
				2404
				2405	if (end > self->length)
				2406	end = self->length;
				2407	if (end < 0)
				2408	end += self->length;
				2409	if (end < 0)
				2410	end = 0;
				2411
				2412	end -= substring->length;
				2413
				2414	if (direction < 0) {
				2415	for (; end >= start; end--)
				2416	if (Py_UNICODE_MATCH(self, end, substring))
				2417	return end;
				2418	} else {
				2419	for (; start <= end; start++)
				2420	if (Py_UNICODE_MATCH(self, start, substring))
				2421	return start;
				2422	}
				2423
				2424	return -1;
				2425	}
				2426
				2427	int PyUnicode_Find(PyObject *str,
				2428	PyObject *substr,
				2429	int start,
				2430	int end,
				2431	int direction)
				2432	{
				2433	int result;
				2434
				2435	str = PyUnicode_FromObject(str);
				2436	if (str == NULL)
				2437	return -1;
				2438	substr = PyUnicode_FromObject(substr);
				2439	if (substr == NULL) {
				2440	Py_DECREF(substr);
				2441	return -1;
				2442	}
				2443
				2444	result = findstring((PyUnicodeObject *)str,
				2445	(PyUnicodeObject *)substr,
				2446	start, end, direction);
				2447	Py_DECREF(str);
				2448	Py_DECREF(substr);
				2449	return result;
				2450	}
				2451
				2452	static
				2453	int tailmatch(PyUnicodeObject *self,
				2454	PyUnicodeObject *substring,
				2455	int start,
				2456	int end,
				2457	int direction)
				2458	{
				2459	if (start < 0)
				2460	start += self->length;
				2461	if (start < 0)
				2462	start = 0;
				2463
				2464	if (substring->length == 0)
				2465	return 1;
				2466
				2467	if (end > self->length)
				2468	end = self->length;
				2469	if (end < 0)
				2470	end += self->length;
				2471	if (end < 0)
				2472	end = 0;
				2473
				2474	end -= substring->length;
				2475	if (end < start)
				2476	return 0;
				2477
				2478	if (direction > 0) {
				2479	if (Py_UNICODE_MATCH(self, end, substring))
				2480	return 1;
				2481	} else {
				2482	if (Py_UNICODE_MATCH(self, start, substring))
				2483	return 1;
				2484	}
				2485
				2486	return 0;
				2487	}
				2488
				2489	int PyUnicode_Tailmatch(PyObject *str,
				2490	PyObject *substr,
				2491	int start,
				2492	int end,
				2493	int direction)
				2494	{
				2495	int result;
				2496
				2497	str = PyUnicode_FromObject(str);
				2498	if (str == NULL)
				2499	return -1;
				2500	substr = PyUnicode_FromObject(substr);
				2501	if (substr == NULL) {
				2502	Py_DECREF(substr);
				2503	return -1;
				2504	}
				2505
				2506	result = tailmatch((PyUnicodeObject *)str,
				2507	(PyUnicodeObject *)substr,
				2508	start, end, direction);
				2509	Py_DECREF(str);
				2510	Py_DECREF(substr);
				2511	return result;
				2512	}
				2513
				2514	static
				2515	const Py_UNICODE findchar(const Py_UNICODE s,
				2516	int size,
				2517	Py_UNICODE ch)
				2518	{
				2519	/* like wcschr, but doesn't stop at NULL characters */
				2520
				2521	while (size-- > 0) {
				2522	if (*s == ch)
				2523	return s;
				2524	s++;
				2525	}
				2526
				2527	return NULL;
				2528	}
				2529
				2530	/* Apply fixfct filter to the Unicode object self and return a
				2531	reference to the modified object */
				2532
				2533	static
				2534	PyObject fixup(PyUnicodeObject self,
				2535	int (fixfct)(PyUnicodeObject s))
				2536	{
				2537
				2538	PyUnicodeObject *u;
				2539
				2540	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2541	self->length);
				2542	if (u == NULL)
				2543	return NULL;
				2544	if (!fixfct(u)) {
				2545	/* fixfct should return TRUE if it modified the buffer. If
				2546	FALSE, return a reference to the original buffer instead
				2547	(to save space, not time) */
				2548	Py_INCREF(self);
				2549	Py_DECREF(u);
				2550	return (PyObject*) self;
				2551	}
				2552	return (PyObject*) u;
				2553	}
				2554
				2555	static
				2556	int fixupper(PyUnicodeObject *self)
				2557	{
				2558	int len = self->length;
				2559	Py_UNICODE *s = self->str;
				2560	int status = 0;
				2561
				2562	while (len-- > 0) {
				2563	register Py_UNICODE ch;
				2564
				2565	ch = Py_UNICODE_TOUPPER(*s);
				2566	if (ch != *s) {
				2567	status = 1;
				2568	*s = ch;
				2569	}
				2570	s++;
				2571	}
				2572
				2573	return status;
				2574	}
				2575
				2576	static
				2577	int fixlower(PyUnicodeObject *self)
				2578	{
				2579	int len = self->length;
				2580	Py_UNICODE *s = self->str;
				2581	int status = 0;
				2582
				2583	while (len-- > 0) {
				2584	register Py_UNICODE ch;
				2585
				2586	ch = Py_UNICODE_TOLOWER(*s);
				2587	if (ch != *s) {
				2588	status = 1;
				2589	*s = ch;
				2590	}
				2591	s++;
				2592	}
				2593
				2594	return status;
				2595	}
				2596
				2597	static
				2598	int fixswapcase(PyUnicodeObject *self)
				2599	{
				2600	int len = self->length;
				2601	Py_UNICODE *s = self->str;
				2602	int status = 0;
				2603
				2604	while (len-- > 0) {
				2605	if (Py_UNICODE_ISUPPER(*s)) {
				2606	s = Py_UNICODE_TOLOWER(s);
				2607	status = 1;
				2608	} else if (Py_UNICODE_ISLOWER(*s)) {
				2609	s = Py_UNICODE_TOUPPER(s);
				2610	status = 1;
				2611	}
				2612	s++;
				2613	}
				2614
				2615	return status;
				2616	}
				2617
				2618	static
				2619	int fixcapitalize(PyUnicodeObject *self)
				2620	{
				2621	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2622	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2623	return 1;
				2624	}
				2625	return 0;
				2626	}
				2627
				2628	static
				2629	int fixtitle(PyUnicodeObject *self)
				2630	{
				2631	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2632	register Py_UNICODE *e;
				2633	int previous_is_cased;
				2634
				2635	/* Shortcut for single character strings */
				2636	if (PyUnicode_GET_SIZE(self) == 1) {
				2637	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2638	if (*p != ch) {
				2639	*p = ch;
				2640	return 1;
				2641	}
				2642	else
				2643	return 0;
				2644	}
				2645
				2646	e = p + PyUnicode_GET_SIZE(self);
				2647	previous_is_cased = 0;
				2648	for (; p < e; p++) {
				2649	register const Py_UNICODE ch = *p;
				2650
				2651	if (previous_is_cased)
				2652	*p = Py_UNICODE_TOLOWER(ch);
				2653	else
				2654	*p = Py_UNICODE_TOTITLE(ch);
				2655
				2656	if (Py_UNICODE_ISLOWER(ch) \|\|
				2657	Py_UNICODE_ISUPPER(ch) \|\|
				2658	Py_UNICODE_ISTITLE(ch))
				2659	previous_is_cased = 1;
				2660	else
				2661	previous_is_cased = 0;
				2662	}
				2663	return 1;
				2664	}
				2665
				2666	PyObject PyUnicode_Join(PyObject separator,
				2667	PyObject *seq)
				2668	{
				2669	Py_UNICODE *sep;
				2670	int seplen;
				2671	PyUnicodeObject *res = NULL;
				2672	int reslen = 0;
				2673	Py_UNICODE *p;
				2674	int seqlen = 0;
				2675	int sz = 100;
				2676	int i;
				2677
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2678	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2679	if (seqlen < 0 && PyErr_Occurred())
				2680	return NULL;
				2681
				2682	if (separator == NULL) {
				2683	Py_UNICODE blank = ' ';
				2684	sep = &blank;
				2685	seplen = 1;
				2686	}
				2687	else {
				2688	separator = PyUnicode_FromObject(separator);
				2689	if (separator == NULL)
				2690	return NULL;
				2691	sep = PyUnicode_AS_UNICODE(separator);
				2692	seplen = PyUnicode_GET_SIZE(separator);
				2693	}
				2694
				2695	res = _PyUnicode_New(sz);
				2696	if (res == NULL)
				2697	goto onError;
				2698	p = PyUnicode_AS_UNICODE(res);
				2699	reslen = 0;
				2700
				2701	for (i = 0; i < seqlen; i++) {
				2702	int itemlen;
				2703	PyObject *item;
				2704
				2705	item = PySequence_GetItem(seq, i);
				2706	if (item == NULL)
				2707	goto onError;
				2708	if (!PyUnicode_Check(item)) {
				2709	PyObject *v;
				2710	v = PyUnicode_FromObject(item);
				2711	Py_DECREF(item);
				2712	item = v;
				2713	if (item == NULL)
				2714	goto onError;
				2715	}
				2716	itemlen = PyUnicode_GET_SIZE(item);
				2717	while (reslen + itemlen + seplen >= sz) {
				2718	if (_PyUnicode_Resize(res, sz*2))
				2719	goto onError;
				2720	sz *= 2;
				2721	p = PyUnicode_AS_UNICODE(res) + reslen;
				2722	}
				2723	if (i > 0) {
				2724	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2725	p += seplen;
				2726	reslen += seplen;
				2727	}
				2728	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2729	p += itemlen;
				2730	reslen += itemlen;
				2731	Py_DECREF(item);
				2732	}
				2733	if (_PyUnicode_Resize(res, reslen))
				2734	goto onError;
				2735
				2736	Py_XDECREF(separator);
				2737	return (PyObject *)res;
				2738
				2739	onError:
				2740	Py_XDECREF(separator);
				2741	Py_DECREF(res);
				2742	return NULL;
				2743	}
				2744
				2745	static
				2746	PyUnicodeObject pad(PyUnicodeObject self,
				2747	int left,
				2748	int right,
				2749	Py_UNICODE fill)
				2750	{
				2751	PyUnicodeObject *u;
				2752
				2753	if (left < 0)
				2754	left = 0;
				2755	if (right < 0)
				2756	right = 0;
				2757
				2758	if (left == 0 && right == 0) {
				2759	Py_INCREF(self);
				2760	return self;
				2761	}
				2762
				2763	u = _PyUnicode_New(left + self->length + right);
				2764	if (u) {
				2765	if (left)
				2766	Py_UNICODE_FILL(u->str, fill, left);
				2767	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2768	if (right)
				2769	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2770	}
				2771
				2772	return u;
				2773	}
				2774
				2775	#define SPLIT_APPEND(data, left, right) \
				2776	str = PyUnicode_FromUnicode(data + left, right - left); \
				2777	if (!str) \
				2778	goto onError; \
				2779	if (PyList_Append(list, str)) { \
				2780	Py_DECREF(str); \
				2781	goto onError; \
				2782	} \
				2783	else \
				2784	Py_DECREF(str);
				2785
				2786	static
				2787	PyObject split_whitespace(PyUnicodeObject self,
				2788	PyObject *list,
				2789	int maxcount)
				2790	{
				2791	register int i;
				2792	register int j;
				2793	int len = self->length;
				2794	PyObject *str;
				2795
				2796	for (i = j = 0; i < len; ) {
				2797	/* find a token */
				2798	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2799	i++;
				2800	j = i;
				2801	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2802	i++;
				2803	if (j < i) {
				2804	if (maxcount-- <= 0)
				2805	break;
				2806	SPLIT_APPEND(self->str, j, i);
				2807	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2808	i++;
				2809	j = i;
				2810	}
				2811	}
				2812	if (j < len) {
				2813	SPLIT_APPEND(self->str, j, len);
				2814	}
				2815	return list;
				2816
				2817	onError:
				2818	Py_DECREF(list);
				2819	return NULL;
				2820	}
				2821
				2822	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2823	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2824	{
				2825	register int i;
				2826	register int j;
				2827	int len;
				2828	PyObject *list;
				2829	PyObject *str;
				2830	Py_UNICODE *data;
				2831
				2832	string = PyUnicode_FromObject(string);
				2833	if (string == NULL)
				2834	return NULL;
				2835	data = PyUnicode_AS_UNICODE(string);
				2836	len = PyUnicode_GET_SIZE(string);
				2837
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2838	list = PyList_New(0);
				2839	if (!list)
				2840	goto onError;
				2841
				2842	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2843	int eol;
				2844
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2845	/* Find a line and append it */
				2846	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2847	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2848
				2849	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2850	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2851	if (i < len) {
				2852	if (data[i] == '\r' && i + 1 < len &&
				2853	data[i+1] == '\n')
				2854	i += 2;
				2855	else
				2856	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2857	if (keepends)
				2858	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2859	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2860	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2861	j = i;
				2862	}
				2863	if (j < len) {
				2864	SPLIT_APPEND(data, j, len);
				2865	}
				2866
				2867	Py_DECREF(string);
				2868	return list;
				2869
				2870	onError:
				2871	Py_DECREF(list);
				2872	Py_DECREF(string);
				2873	return NULL;
				2874	}
				2875
				2876	static
				2877	PyObject split_char(PyUnicodeObject self,
				2878	PyObject *list,
				2879	Py_UNICODE ch,
				2880	int maxcount)
				2881	{
				2882	register int i;
				2883	register int j;
				2884	int len = self->length;
				2885	PyObject *str;
				2886
				2887	for (i = j = 0; i < len; ) {
				2888	if (self->str[i] == ch) {
				2889	if (maxcount-- <= 0)
				2890	break;
				2891	SPLIT_APPEND(self->str, j, i);
				2892	i = j = i + 1;
				2893	} else
				2894	i++;
				2895	}
				2896	if (j <= len) {
				2897	SPLIT_APPEND(self->str, j, len);
				2898	}
				2899	return list;
				2900
				2901	onError:
				2902	Py_DECREF(list);
				2903	return NULL;
				2904	}
				2905
				2906	static
				2907	PyObject split_substring(PyUnicodeObject self,
				2908	PyObject *list,
				2909	PyUnicodeObject *substring,
				2910	int maxcount)
				2911	{
				2912	register int i;
				2913	register int j;
				2914	int len = self->length;
				2915	int sublen = substring->length;
				2916	PyObject *str;
				2917
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2918	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2919	if (Py_UNICODE_MATCH(self, i, substring)) {
				2920	if (maxcount-- <= 0)
				2921	break;
				2922	SPLIT_APPEND(self->str, j, i);
				2923	i = j = i + sublen;
				2924	} else
				2925	i++;
				2926	}
				2927	if (j <= len) {
				2928	SPLIT_APPEND(self->str, j, len);
				2929	}
				2930	return list;
				2931
				2932	onError:
				2933	Py_DECREF(list);
				2934	return NULL;
				2935	}
				2936
				2937	#undef SPLIT_APPEND
				2938
				2939	static
				2940	PyObject split(PyUnicodeObject self,
				2941	PyUnicodeObject *substring,
				2942	int maxcount)
				2943	{
				2944	PyObject *list;
				2945
				2946	if (maxcount < 0)
				2947	maxcount = INT_MAX;
				2948
				2949	list = PyList_New(0);
				2950	if (!list)
				2951	return NULL;
				2952
				2953	if (substring == NULL)
				2954	return split_whitespace(self,list,maxcount);
				2955
				2956	else if (substring->length == 1)
				2957	return split_char(self,list,substring->str[0],maxcount);
				2958
				2959	else if (substring->length == 0) {
				2960	Py_DECREF(list);
				2961	PyErr_SetString(PyExc_ValueError, "empty separator");
				2962	return NULL;
				2963	}
				2964	else
				2965	return split_substring(self,list,substring,maxcount);
				2966	}
				2967
				2968	static
				2969	PyObject strip(PyUnicodeObject self,
				2970	int left,
				2971	int right)
				2972	{
				2973	Py_UNICODE *p = self->str;
				2974	int start = 0;
				2975	int end = self->length;
				2976
				2977	if (left)
				2978	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2979	start++;
				2980
				2981	if (right)
				2982	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2983	end--;
				2984
				2985	if (start == 0 && end == self->length) {
				2986	/* couldn't strip anything off, return original string */
				2987	Py_INCREF(self);
				2988	return (PyObject*) self;
				2989	}
				2990
				2991	return (PyObject*) PyUnicode_FromUnicode(
				2992	self->str + start,
				2993	end - start
				2994	);
				2995	}
				2996
				2997	static
				2998	PyObject replace(PyUnicodeObject self,
				2999	PyUnicodeObject *str1,
				3000	PyUnicodeObject *str2,
				3001	int maxcount)
				3002	{
				3003	PyUnicodeObject *u;
				3004
				3005	if (maxcount < 0)
				3006	maxcount = INT_MAX;
				3007
				3008	if (str1->length == 1 && str2->length == 1) {
				3009	int i;
				3010
				3011	/* replace characters */
				3012	if (!findchar(self->str, self->length, str1->str[0])) {
				3013	/* nothing to replace, return original string */
				3014	Py_INCREF(self);
				3015	u = self;
				3016	} else {
				3017	Py_UNICODE u1 = str1->str[0];
				3018	Py_UNICODE u2 = str2->str[0];
				3019
				3020	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3021	self->str,
				3022	self->length
				3023	);
				3024	if (u)
				3025	for (i = 0; i < u->length; i++)
				3026	if (u->str[i] == u1) {
				3027	if (--maxcount < 0)
				3028	break;
				3029	u->str[i] = u2;
				3030	}
				3031	}
				3032
				3033	} else {
				3034	int n, i;
				3035	Py_UNICODE *p;
				3036
				3037	/* replace strings */
				3038	n = count(self, 0, self->length, str1);
				3039	if (n > maxcount)
				3040	n = maxcount;
				3041	if (n == 0) {
				3042	/* nothing to replace, return original string */
				3043	Py_INCREF(self);
				3044	u = self;
				3045	} else {
				3046	u = _PyUnicode_New(
				3047	self->length + n * (str2->length - str1->length));
				3048	if (u) {
				3049	i = 0;
				3050	p = u->str;
				3051	while (i <= self->length - str1->length)
				3052	if (Py_UNICODE_MATCH(self, i, str1)) {
				3053	/* replace string segment */
				3054	Py_UNICODE_COPY(p, str2->str, str2->length);
				3055	p += str2->length;
				3056	i += str1->length;
				3057	if (--n <= 0) {
				3058	/* copy remaining part */
				3059	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3060	break;
				3061	}
				3062	} else
				3063	*p++ = self->str[i++];
				3064	}
				3065	}
				3066	}
				3067
				3068	return (PyObject *) u;
				3069	}
				3070
				3071	/* --- Unicode Object Methods --------------------------------------------- */
				3072
				3073	static char title__doc__[] =
				3074	"S.title() -> unicode\n\
				3075	\n\
				3076	Return a titlecased version of S, i.e. words start with title case\n\
				3077	characters, all remaining cased characters have lower case.";
				3078
				3079	static PyObject*
				3080	unicode_title(PyUnicodeObject self, PyObject args)
				3081	{
				3082	if (!PyArg_NoArgs(args))
				3083	return NULL;
				3084	return fixup(self, fixtitle);
				3085	}
				3086
				3087	static char capitalize__doc__[] =
				3088	"S.capitalize() -> unicode\n\
				3089	\n\
				3090	Return a capitalized version of S, i.e. make the first character\n\
				3091	have upper case.";
				3092
				3093	static PyObject*
				3094	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3095	{
				3096	if (!PyArg_NoArgs(args))
				3097	return NULL;
				3098	return fixup(self, fixcapitalize);
				3099	}
				3100
				3101	#if 0
				3102	static char capwords__doc__[] =
				3103	"S.capwords() -> unicode\n\
				3104	\n\
				3105	Apply .capitalize() to all words in S and return the result with\n\
				3106	normalized whitespace (all whitespace strings are replaced by ' ').";
				3107
				3108	static PyObject*
				3109	unicode_capwords(PyUnicodeObject self, PyObject args)
				3110	{
				3111	PyObject *list;
				3112	PyObject *item;
				3113	int i;
				3114
				3115	if (!PyArg_NoArgs(args))
				3116	return NULL;
				3117
				3118	/* Split into words */
				3119	list = split(self, NULL, -1);
				3120	if (!list)
				3121	return NULL;
				3122
				3123	/* Capitalize each word */
				3124	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3125	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3126	fixcapitalize);
				3127	if (item == NULL)
				3128	goto onError;
				3129	Py_DECREF(PyList_GET_ITEM(list, i));
				3130	PyList_SET_ITEM(list, i, item);
				3131	}
				3132
				3133	/* Join the words to form a new string */
				3134	item = PyUnicode_Join(NULL, list);
				3135
				3136	onError:
				3137	Py_DECREF(list);
				3138	return (PyObject *)item;
				3139	}
				3140	#endif
				3141
				3142	static char center__doc__[] =
				3143	"S.center(width) -> unicode\n\
				3144	\n\
				3145	Return S centered in a Unicode string of length width. Padding is done\n\
				3146	using spaces.";
				3147
				3148	static PyObject *
				3149	unicode_center(PyUnicodeObject self, PyObject args)
				3150	{
				3151	int marg, left;
				3152	int width;
				3153
				3154	if (!PyArg_ParseTuple(args, "i:center", &width))
				3155	return NULL;
				3156
				3157	if (self->length >= width) {
				3158	Py_INCREF(self);
				3159	return (PyObject*) self;
				3160	}
				3161
				3162	marg = width - self->length;
				3163	left = marg / 2 + (marg & width & 1);
				3164
				3165	return (PyObject*) pad(self, left, marg - left, ' ');
				3166	}
				3167
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3168	#if 0
				3169
				3170	/* This code should go into some future Unicode collation support
				3171	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3172	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3173
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3174	/* speedy UTF-16 code point order comparison */
				3175	/* gleaned from: */
				3176	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3177
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3178	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3179	{
				3180	0, 0, 0, 0, 0, 0, 0, 0,
				3181	0, 0, 0, 0, 0, 0, 0, 0,
				3182	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3183	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3184	};
				3185
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3186	static int
				3187	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3188	{
				3189	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3190
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3191	Py_UNICODE *s1 = str1->str;
				3192	Py_UNICODE *s2 = str2->str;
				3193
				3194	len1 = str1->length;
				3195	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3196
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3197	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3198	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3199	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3200
				3201	c1 = *s1++;
				3202	c2 = *s2++;
				3203	if (c1 > (1<<11) * 26)
				3204	c1 += utf16Fixup[c1>>11];
				3205	if (c2 > (1<<11) * 26)
				3206	c2 += utf16Fixup[c2>>11];
				3207
				3208	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3209	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3210	if (diff)
				3211	return (diff < 0) ? -1 : (diff != 0);
				3212	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3213	}
				3214
				3215	return (len1 < len2) ? -1 : (len1 != len2);
				3216	}
				3217
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3218	#else
				3219
				3220	static int
				3221	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3222	{
				3223	register int len1, len2;
				3224
				3225	Py_UNICODE *s1 = str1->str;
				3226	Py_UNICODE *s2 = str2->str;
				3227
				3228	len1 = str1->length;
				3229	len2 = str2->length;
				3230
				3231	while (len1 > 0 && len2 > 0) {
				3232	register long diff;
				3233
				3234	diff = (long)s1++ - (long)s2++;
				3235	if (diff)
				3236	return (diff < 0) ? -1 : (diff != 0);
				3237	len1--; len2--;
				3238	}
				3239
				3240	return (len1 < len2) ? -1 : (len1 != len2);
				3241	}
				3242
				3243	#endif
				3244
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3245	int PyUnicode_Compare(PyObject *left,
				3246	PyObject *right)
				3247	{
				3248	PyUnicodeObject u = NULL, v = NULL;
				3249	int result;
				3250
				3251	/* Coerce the two arguments */
				3252	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3253	if (u == NULL)
				3254	goto onError;
				3255	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3256	if (v == NULL)
				3257	goto onError;
				3258
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3259	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3260	if (v == u) {
				3261	Py_DECREF(u);
				3262	Py_DECREF(v);
				3263	return 0;
				3264	}
				3265
				3266	result = unicode_compare(u, v);
				3267
				3268	Py_DECREF(u);
				3269	Py_DECREF(v);
				3270	return result;
				3271
				3272	onError:
				3273	Py_XDECREF(u);
				3274	Py_XDECREF(v);
				3275	return -1;
				3276	}
				3277
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3278	int PyUnicode_Contains(PyObject *container,
				3279	PyObject *element)
				3280	{
				3281	PyUnicodeObject u = NULL, v = NULL;
				3282	int result;
				3283	register const Py_UNICODE p, e;
				3284	register Py_UNICODE ch;
				3285
				3286	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3287	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3288	if (v == NULL) {
				3289	PyErr_SetString(PyExc_TypeError,
				3290	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3291	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3292	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3293	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3294	if (u == NULL) {
				3295	Py_DECREF(v);
				3296	goto onError;
				3297	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3298
				3299	/* Check v in u */
				3300	if (PyUnicode_GET_SIZE(v) != 1) {
				3301	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3302	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3303	goto onError;
				3304	}
				3305	ch = *PyUnicode_AS_UNICODE(v);
				3306	p = PyUnicode_AS_UNICODE(u);
				3307	e = p + PyUnicode_GET_SIZE(u);
				3308	result = 0;
				3309	while (p < e) {
				3310	if (*p++ == ch) {
				3311	result = 1;
				3312	break;
				3313	}
				3314	}
				3315
				3316	Py_DECREF(u);
				3317	Py_DECREF(v);
				3318	return result;
				3319
				3320	onError:
				3321	Py_XDECREF(u);
				3322	Py_XDECREF(v);
				3323	return -1;
				3324	}
				3325
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3326	/* Concat to string or Unicode object giving a new Unicode object. */
				3327
				3328	PyObject PyUnicode_Concat(PyObject left,
				3329	PyObject *right)
				3330	{
				3331	PyUnicodeObject u = NULL, v = NULL, *w;
				3332
				3333	/* Coerce the two arguments */
				3334	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3335	if (u == NULL)
				3336	goto onError;
				3337	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3338	if (v == NULL)
				3339	goto onError;
				3340
				3341	/* Shortcuts */
				3342	if (v == unicode_empty) {
				3343	Py_DECREF(v);
				3344	return (PyObject *)u;
				3345	}
				3346	if (u == unicode_empty) {
				3347	Py_DECREF(u);
				3348	return (PyObject *)v;
				3349	}
				3350
				3351	/* Concat the two Unicode strings */
				3352	w = _PyUnicode_New(u->length + v->length);
				3353	if (w == NULL)
				3354	goto onError;
				3355	Py_UNICODE_COPY(w->str, u->str, u->length);
				3356	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3357
				3358	Py_DECREF(u);
				3359	Py_DECREF(v);
				3360	return (PyObject *)w;
				3361
				3362	onError:
				3363	Py_XDECREF(u);
				3364	Py_XDECREF(v);
				3365	return NULL;
				3366	}
				3367
				3368	static char count__doc__[] =
				3369	"S.count(sub[, start[, end]]) -> int\n\
				3370	\n\
				3371	Return the number of occurrences of substring sub in Unicode string\n\
				3372	S[start:end]. Optional arguments start and end are\n\
				3373	interpreted as in slice notation.";
				3374
				3375	static PyObject *
				3376	unicode_count(PyUnicodeObject self, PyObject args)
				3377	{
				3378	PyUnicodeObject *substring;
				3379	int start = 0;
				3380	int end = INT_MAX;
				3381	PyObject *result;
				3382
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3383	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3384	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3385	return NULL;
				3386
				3387	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3388	(PyObject *)substring);
				3389	if (substring == NULL)
				3390	return NULL;
				3391
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3392	if (start < 0)
				3393	start += self->length;
				3394	if (start < 0)
				3395	start = 0;
				3396	if (end > self->length)
				3397	end = self->length;
				3398	if (end < 0)
				3399	end += self->length;
				3400	if (end < 0)
				3401	end = 0;
				3402
				3403	result = PyInt_FromLong((long) count(self, start, end, substring));
				3404
				3405	Py_DECREF(substring);
				3406	return result;
				3407	}
				3408
				3409	static char encode__doc__[] =
				3410	"S.encode([encoding[,errors]]) -> string\n\
				3411	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3412	Return an encoded string version of S. Default encoding is the current\n\
				3413	default string encoding. errors may be given to set a different error\n\
				3414	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3415	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3416
				3417	static PyObject *
				3418	unicode_encode(PyUnicodeObject self, PyObject args)
				3419	{
				3420	char *encoding = NULL;
				3421	char *errors = NULL;
				3422	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3423	return NULL;
				3424	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3425	}
				3426
				3427	static char expandtabs__doc__[] =
				3428	"S.expandtabs([tabsize]) -> unicode\n\
				3429	\n\
				3430	Return a copy of S where all tab characters are expanded using spaces.\n\
				3431	If tabsize is not given, a tab size of 8 characters is assumed.";
				3432
				3433	static PyObject*
				3434	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3435	{
				3436	Py_UNICODE *e;
				3437	Py_UNICODE *p;
				3438	Py_UNICODE *q;
				3439	int i, j;
				3440	PyUnicodeObject *u;
				3441	int tabsize = 8;
				3442
				3443	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3444	return NULL;
				3445
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3446	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3447	i = j = 0;
				3448	e = self->str + self->length;
				3449	for (p = self->str; p < e; p++)
				3450	if (*p == '\t') {
				3451	if (tabsize > 0)
				3452	j += tabsize - (j % tabsize);
				3453	}
				3454	else {
				3455	j++;
				3456	if (p == '\n' \|\| p == '\r') {
				3457	i += j;
				3458	j = 0;
				3459	}
				3460	}
				3461
				3462	/* Second pass: create output string and fill it */
				3463	u = _PyUnicode_New(i + j);
				3464	if (!u)
				3465	return NULL;
				3466
				3467	j = 0;
				3468	q = u->str;
				3469
				3470	for (p = self->str; p < e; p++)
				3471	if (*p == '\t') {
				3472	if (tabsize > 0) {
				3473	i = tabsize - (j % tabsize);
				3474	j += i;
				3475	while (i--)
				3476	*q++ = ' ';
				3477	}
				3478	}
				3479	else {
				3480	j++;
				3481	q++ = p;
				3482	if (p == '\n' \|\| p == '\r')
				3483	j = 0;
				3484	}
				3485
				3486	return (PyObject*) u;
				3487	}
				3488
				3489	static char find__doc__[] =
				3490	"S.find(sub [,start [,end]]) -> int\n\
				3491	\n\
				3492	Return the lowest index in S where substring sub is found,\n\
				3493	such that sub is contained within s[start,end]. Optional\n\
				3494	arguments start and end are interpreted as in slice notation.\n\
				3495	\n\
				3496	Return -1 on failure.";
				3497
				3498	static PyObject *
				3499	unicode_find(PyUnicodeObject self, PyObject args)
				3500	{
				3501	PyUnicodeObject *substring;
				3502	int start = 0;
				3503	int end = INT_MAX;
				3504	PyObject *result;
				3505
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3506	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3507	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3508	return NULL;
				3509	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3510	(PyObject *)substring);
				3511	if (substring == NULL)
				3512	return NULL;
				3513
				3514	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3515
				3516	Py_DECREF(substring);
				3517	return result;
				3518	}
				3519
				3520	static PyObject *
				3521	unicode_getitem(PyUnicodeObject *self, int index)
				3522	{
				3523	if (index < 0 \|\| index >= self->length) {
				3524	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3525	return NULL;
				3526	}
				3527
				3528	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3529	}
				3530
				3531	static long
				3532	unicode_hash(PyUnicodeObject *self)
				3533	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3534	/* Since Unicode objects compare equal to their ASCII string
				3535	counterparts, they should use the individual character values
				3536	as basis for their hash value. This is needed to assure that
				3537	strings and Unicode objects behave in the same way as
				3538	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3539
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3540	register int len;
				3541	register Py_UNICODE *p;
				3542	register long x;
				3543
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3544	if (self->hash != -1)
				3545	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3546	len = PyUnicode_GET_SIZE(self);
				3547	p = PyUnicode_AS_UNICODE(self);
				3548	x = *p << 7;
				3549	while (--len >= 0)
				3550	x = (1000003x) ^ p++;
				3551	x ^= PyUnicode_GET_SIZE(self);
				3552	if (x == -1)
				3553	x = -2;
				3554	self->hash = x;
				3555	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3556	}
				3557
				3558	static char index__doc__[] =
				3559	"S.index(sub [,start [,end]]) -> int\n\
				3560	\n\
				3561	Like S.find() but raise ValueError when the substring is not found.";
				3562
				3563	static PyObject *
				3564	unicode_index(PyUnicodeObject self, PyObject args)
				3565	{
				3566	int result;
				3567	PyUnicodeObject *substring;
				3568	int start = 0;
				3569	int end = INT_MAX;
				3570
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3571	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3572	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3573	return NULL;
				3574
				3575	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3576	(PyObject *)substring);
				3577	if (substring == NULL)
				3578	return NULL;
				3579
				3580	result = findstring(self, substring, start, end, 1);
				3581
				3582	Py_DECREF(substring);
				3583	if (result < 0) {
				3584	PyErr_SetString(PyExc_ValueError, "substring not found");
				3585	return NULL;
				3586	}
				3587	return PyInt_FromLong(result);
				3588	}
				3589
				3590	static char islower__doc__[] =
				3591	"S.islower() -> int\n\
				3592	\n\
				3593	Return 1 if all cased characters in S are lowercase and there is\n\
				3594	at least one cased character in S, 0 otherwise.";
				3595
				3596	static PyObject*
				3597	unicode_islower(PyUnicodeObject self, PyObject args)
				3598	{
				3599	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3600	register const Py_UNICODE *e;
				3601	int cased;
				3602
				3603	if (!PyArg_NoArgs(args))
				3604	return NULL;
				3605
				3606	/* Shortcut for single character strings */
				3607	if (PyUnicode_GET_SIZE(self) == 1)
				3608	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3609
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3610	/* Special case for empty strings */
				3611	if (PyString_GET_SIZE(self) == 0)
				3612	return PyInt_FromLong(0);
				3613
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3614	e = p + PyUnicode_GET_SIZE(self);
				3615	cased = 0;
				3616	for (; p < e; p++) {
				3617	register const Py_UNICODE ch = *p;
				3618
				3619	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3620	return PyInt_FromLong(0);
				3621	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3622	cased = 1;
				3623	}
				3624	return PyInt_FromLong(cased);
				3625	}
				3626
				3627	static char isupper__doc__[] =
				3628	"S.isupper() -> int\n\
				3629	\n\
				3630	Return 1 if all cased characters in S are uppercase and there is\n\
				3631	at least one cased character in S, 0 otherwise.";
				3632
				3633	static PyObject*
				3634	unicode_isupper(PyUnicodeObject self, PyObject args)
				3635	{
				3636	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3637	register const Py_UNICODE *e;
				3638	int cased;
				3639
				3640	if (!PyArg_NoArgs(args))
				3641	return NULL;
				3642
				3643	/* Shortcut for single character strings */
				3644	if (PyUnicode_GET_SIZE(self) == 1)
				3645	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3646
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3647	/* Special case for empty strings */
				3648	if (PyString_GET_SIZE(self) == 0)
				3649	return PyInt_FromLong(0);
				3650
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3651	e = p + PyUnicode_GET_SIZE(self);
				3652	cased = 0;
				3653	for (; p < e; p++) {
				3654	register const Py_UNICODE ch = *p;
				3655
				3656	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3657	return PyInt_FromLong(0);
				3658	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3659	cased = 1;
				3660	}
				3661	return PyInt_FromLong(cased);
				3662	}
				3663
				3664	static char istitle__doc__[] =
				3665	"S.istitle() -> int\n\
				3666	\n\
				3667	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3668	may only follow uncased characters and lowercase characters only cased\n\
				3669	ones. Return 0 otherwise.";
				3670
				3671	static PyObject*
				3672	unicode_istitle(PyUnicodeObject self, PyObject args)
				3673	{
				3674	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3675	register const Py_UNICODE *e;
				3676	int cased, previous_is_cased;
				3677
				3678	if (!PyArg_NoArgs(args))
				3679	return NULL;
				3680
				3681	/* Shortcut for single character strings */
				3682	if (PyUnicode_GET_SIZE(self) == 1)
				3683	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3684	(Py_UNICODE_ISUPPER(*p) != 0));
				3685
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3686	/* Special case for empty strings */
				3687	if (PyString_GET_SIZE(self) == 0)
				3688	return PyInt_FromLong(0);
				3689
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3690	e = p + PyUnicode_GET_SIZE(self);
				3691	cased = 0;
				3692	previous_is_cased = 0;
				3693	for (; p < e; p++) {
				3694	register const Py_UNICODE ch = *p;
				3695
				3696	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3697	if (previous_is_cased)
				3698	return PyInt_FromLong(0);
				3699	previous_is_cased = 1;
				3700	cased = 1;
				3701	}
				3702	else if (Py_UNICODE_ISLOWER(ch)) {
				3703	if (!previous_is_cased)
				3704	return PyInt_FromLong(0);
				3705	previous_is_cased = 1;
				3706	cased = 1;
				3707	}
				3708	else
				3709	previous_is_cased = 0;
				3710	}
				3711	return PyInt_FromLong(cased);
				3712	}
				3713
				3714	static char isspace__doc__[] =
				3715	"S.isspace() -> int\n\
				3716	\n\
				3717	Return 1 if there are only whitespace characters in S,\n\
				3718	0 otherwise.";
				3719
				3720	static PyObject*
				3721	unicode_isspace(PyUnicodeObject self, PyObject args)
				3722	{
				3723	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3724	register const Py_UNICODE *e;
				3725
				3726	if (!PyArg_NoArgs(args))
				3727	return NULL;
				3728
				3729	/* Shortcut for single character strings */
				3730	if (PyUnicode_GET_SIZE(self) == 1 &&
				3731	Py_UNICODE_ISSPACE(*p))
				3732	return PyInt_FromLong(1);
				3733
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3734	/* Special case for empty strings */
				3735	if (PyString_GET_SIZE(self) == 0)
				3736	return PyInt_FromLong(0);
				3737
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3738	e = p + PyUnicode_GET_SIZE(self);
				3739	for (; p < e; p++) {
				3740	if (!Py_UNICODE_ISSPACE(*p))
				3741	return PyInt_FromLong(0);
				3742	}
				3743	return PyInt_FromLong(1);
				3744	}
				3745
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3746	static char isalpha__doc__[] =
				3747	"S.isalpha() -> int\n\
				3748	\n\
				3749	Return 1 if all characters in S are alphabetic\n\
				3750	and there is at least one character in S, 0 otherwise.";
				3751
				3752	static PyObject*
				3753	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3754	{
				3755	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3756	register const Py_UNICODE *e;
				3757
				3758	if (!PyArg_NoArgs(args))
				3759	return NULL;
				3760
				3761	/* Shortcut for single character strings */
				3762	if (PyUnicode_GET_SIZE(self) == 1 &&
				3763	Py_UNICODE_ISALPHA(*p))
				3764	return PyInt_FromLong(1);
				3765
				3766	/* Special case for empty strings */
				3767	if (PyString_GET_SIZE(self) == 0)
				3768	return PyInt_FromLong(0);
				3769
				3770	e = p + PyUnicode_GET_SIZE(self);
				3771	for (; p < e; p++) {
				3772	if (!Py_UNICODE_ISALPHA(*p))
				3773	return PyInt_FromLong(0);
				3774	}
				3775	return PyInt_FromLong(1);
				3776	}
				3777
				3778	static char isalnum__doc__[] =
				3779	"S.isalnum() -> int\n\
				3780	\n\
				3781	Return 1 if all characters in S are alphanumeric\n\
				3782	and there is at least one character in S, 0 otherwise.";
				3783
				3784	static PyObject*
				3785	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3786	{
				3787	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3788	register const Py_UNICODE *e;
				3789
				3790	if (!PyArg_NoArgs(args))
				3791	return NULL;
				3792
				3793	/* Shortcut for single character strings */
				3794	if (PyUnicode_GET_SIZE(self) == 1 &&
				3795	Py_UNICODE_ISALNUM(*p))
				3796	return PyInt_FromLong(1);
				3797
				3798	/* Special case for empty strings */
				3799	if (PyString_GET_SIZE(self) == 0)
				3800	return PyInt_FromLong(0);
				3801
				3802	e = p + PyUnicode_GET_SIZE(self);
				3803	for (; p < e; p++) {
				3804	if (!Py_UNICODE_ISALNUM(*p))
				3805	return PyInt_FromLong(0);
				3806	}
				3807	return PyInt_FromLong(1);
				3808	}
				3809
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3810	static char isdecimal__doc__[] =
				3811	"S.isdecimal() -> int\n\
				3812	\n\
				3813	Return 1 if there are only decimal characters in S,\n\
				3814	0 otherwise.";
				3815
				3816	static PyObject*
				3817	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3818	{
				3819	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3820	register const Py_UNICODE *e;
				3821
				3822	if (!PyArg_NoArgs(args))
				3823	return NULL;
				3824
				3825	/* Shortcut for single character strings */
				3826	if (PyUnicode_GET_SIZE(self) == 1 &&
				3827	Py_UNICODE_ISDECIMAL(*p))
				3828	return PyInt_FromLong(1);
				3829
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3830	/* Special case for empty strings */
				3831	if (PyString_GET_SIZE(self) == 0)
				3832	return PyInt_FromLong(0);
				3833
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3834	e = p + PyUnicode_GET_SIZE(self);
				3835	for (; p < e; p++) {
				3836	if (!Py_UNICODE_ISDECIMAL(*p))
				3837	return PyInt_FromLong(0);
				3838	}
				3839	return PyInt_FromLong(1);
				3840	}
				3841
				3842	static char isdigit__doc__[] =
				3843	"S.isdigit() -> int\n\
				3844	\n\
				3845	Return 1 if there are only digit characters in S,\n\
				3846	0 otherwise.";
				3847
				3848	static PyObject*
				3849	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3850	{
				3851	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3852	register const Py_UNICODE *e;
				3853
				3854	if (!PyArg_NoArgs(args))
				3855	return NULL;
				3856
				3857	/* Shortcut for single character strings */
				3858	if (PyUnicode_GET_SIZE(self) == 1 &&
				3859	Py_UNICODE_ISDIGIT(*p))
				3860	return PyInt_FromLong(1);
				3861
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3862	/* Special case for empty strings */
				3863	if (PyString_GET_SIZE(self) == 0)
				3864	return PyInt_FromLong(0);
				3865
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3866	e = p + PyUnicode_GET_SIZE(self);
				3867	for (; p < e; p++) {
				3868	if (!Py_UNICODE_ISDIGIT(*p))
				3869	return PyInt_FromLong(0);
				3870	}
				3871	return PyInt_FromLong(1);
				3872	}
				3873
				3874	static char isnumeric__doc__[] =
				3875	"S.isnumeric() -> int\n\
				3876	\n\
				3877	Return 1 if there are only numeric characters in S,\n\
				3878	0 otherwise.";
				3879
				3880	static PyObject*
				3881	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3882	{
				3883	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3884	register const Py_UNICODE *e;
				3885
				3886	if (!PyArg_NoArgs(args))
				3887	return NULL;
				3888
				3889	/* Shortcut for single character strings */
				3890	if (PyUnicode_GET_SIZE(self) == 1 &&
				3891	Py_UNICODE_ISNUMERIC(*p))
				3892	return PyInt_FromLong(1);
				3893
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3894	/* Special case for empty strings */
				3895	if (PyString_GET_SIZE(self) == 0)
				3896	return PyInt_FromLong(0);
				3897
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3898	e = p + PyUnicode_GET_SIZE(self);
				3899	for (; p < e; p++) {
				3900	if (!Py_UNICODE_ISNUMERIC(*p))
				3901	return PyInt_FromLong(0);
				3902	}
				3903	return PyInt_FromLong(1);
				3904	}
				3905
				3906	static char join__doc__[] =
				3907	"S.join(sequence) -> unicode\n\
				3908	\n\
				3909	Return a string which is the concatenation of the strings in the\n\
				3910	sequence. The separator between elements is S.";
				3911
				3912	static PyObject*
				3913	unicode_join(PyUnicodeObject self, PyObject args)
				3914	{
				3915	PyObject *data;
				3916	if (!PyArg_ParseTuple(args, "O:join", &data))
				3917	return NULL;
				3918
				3919	return PyUnicode_Join((PyObject *)self, data);
				3920	}
				3921
				3922	static int
				3923	unicode_length(PyUnicodeObject *self)
				3924	{
				3925	return self->length;
				3926	}
				3927
				3928	static char ljust__doc__[] =
				3929	"S.ljust(width) -> unicode\n\
				3930	\n\
				3931	Return S left justified in a Unicode string of length width. Padding is\n\
				3932	done using spaces.";
				3933
				3934	static PyObject *
				3935	unicode_ljust(PyUnicodeObject self, PyObject args)
				3936	{
				3937	int width;
				3938	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3939	return NULL;
				3940
				3941	if (self->length >= width) {
				3942	Py_INCREF(self);
				3943	return (PyObject*) self;
				3944	}
				3945
				3946	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3947	}
				3948
				3949	static char lower__doc__[] =
				3950	"S.lower() -> unicode\n\
				3951	\n\
				3952	Return a copy of the string S converted to lowercase.";
				3953
				3954	static PyObject*
				3955	unicode_lower(PyUnicodeObject self, PyObject args)
				3956	{
				3957	if (!PyArg_NoArgs(args))
				3958	return NULL;
				3959	return fixup(self, fixlower);
				3960	}
				3961
				3962	static char lstrip__doc__[] =
				3963	"S.lstrip() -> unicode\n\
				3964	\n\
				3965	Return a copy of the string S with leading whitespace removed.";
				3966
				3967	static PyObject *
				3968	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3969	{
				3970	if (!PyArg_NoArgs(args))
				3971	return NULL;
				3972	return strip(self, 1, 0);
				3973	}
				3974
				3975	static PyObject*
				3976	unicode_repeat(PyUnicodeObject *str, int len)
				3977	{
				3978	PyUnicodeObject *u;
				3979	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3980	int nchars;
				3981	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3982
				3983	if (len < 0)
				3984	len = 0;
				3985
				3986	if (len == 1) {
				3987	/* no repeat, return original string */
				3988	Py_INCREF(str);
				3989	return (PyObject*) str;
				3990	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3991
				3992	/* ensure # of chars needed doesn't overflow int and # of bytes
				3993	* needed doesn't overflow size_t
				3994	*/
				3995	nchars = len * str->length;
				3996	if (len && nchars / len != str->length) {
				3997	PyErr_SetString(PyExc_OverflowError,
				3998	"repeated string is too long");
				3999	return NULL;
				4000	}
				4001	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4002	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4003	PyErr_SetString(PyExc_OverflowError,
				4004	"repeated string is too long");
				4005	return NULL;
				4006	}
				4007	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4008	if (!u)
				4009	return NULL;
				4010
				4011	p = u->str;
				4012
				4013	while (len-- > 0) {
				4014	Py_UNICODE_COPY(p, str->str, str->length);
				4015	p += str->length;
				4016	}
				4017
				4018	return (PyObject*) u;
				4019	}
				4020
				4021	PyObject PyUnicode_Replace(PyObject obj,
				4022	PyObject *subobj,
				4023	PyObject *replobj,
				4024	int maxcount)
				4025	{
				4026	PyObject *self;
				4027	PyObject *str1;
				4028	PyObject *str2;
				4029	PyObject *result;
				4030
				4031	self = PyUnicode_FromObject(obj);
				4032	if (self == NULL)
				4033	return NULL;
				4034	str1 = PyUnicode_FromObject(subobj);
				4035	if (str1 == NULL) {
				4036	Py_DECREF(self);
				4037	return NULL;
				4038	}
				4039	str2 = PyUnicode_FromObject(replobj);
				4040	if (str2 == NULL) {
				4041	Py_DECREF(self);
				4042	Py_DECREF(str1);
				4043	return NULL;
				4044	}
				4045	result = replace((PyUnicodeObject *)self,
				4046	(PyUnicodeObject *)str1,
				4047	(PyUnicodeObject *)str2,
				4048	maxcount);
				4049	Py_DECREF(self);
				4050	Py_DECREF(str1);
				4051	Py_DECREF(str2);
				4052	return result;
				4053	}
				4054
				4055	static char replace__doc__[] =
				4056	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4057	\n\
				4058	Return a copy of S with all occurrences of substring\n\
				4059	old replaced by new. If the optional argument maxsplit is\n\
				4060	given, only the first maxsplit occurrences are replaced.";
				4061
				4062	static PyObject*
				4063	unicode_replace(PyUnicodeObject self, PyObject args)
				4064	{
				4065	PyUnicodeObject *str1;
				4066	PyUnicodeObject *str2;
				4067	int maxcount = -1;
				4068	PyObject *result;
				4069
				4070	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4071	return NULL;
				4072	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4073	if (str1 == NULL)
				4074	return NULL;
				4075	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4076	if (str2 == NULL)
				4077	return NULL;
				4078
				4079	result = replace(self, str1, str2, maxcount);
				4080
				4081	Py_DECREF(str1);
				4082	Py_DECREF(str2);
				4083	return result;
				4084	}
				4085
				4086	static
				4087	PyObject unicode_repr(PyObject unicode)
				4088	{
				4089	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4090	PyUnicode_GET_SIZE(unicode),
				4091	1);
				4092	}
				4093
				4094	static char rfind__doc__[] =
				4095	"S.rfind(sub [,start [,end]]) -> int\n\
				4096	\n\
				4097	Return the highest index in S where substring sub is found,\n\
				4098	such that sub is contained within s[start,end]. Optional\n\
				4099	arguments start and end are interpreted as in slice notation.\n\
				4100	\n\
				4101	Return -1 on failure.";
				4102
				4103	static PyObject *
				4104	unicode_rfind(PyUnicodeObject self, PyObject args)
				4105	{
				4106	PyUnicodeObject *substring;
				4107	int start = 0;
				4108	int end = INT_MAX;
				4109	PyObject *result;
				4110
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4111	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4112	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4113	return NULL;
				4114	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4115	(PyObject *)substring);
				4116	if (substring == NULL)
				4117	return NULL;
				4118
				4119	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4120
				4121	Py_DECREF(substring);
				4122	return result;
				4123	}
				4124
				4125	static char rindex__doc__[] =
				4126	"S.rindex(sub [,start [,end]]) -> int\n\
				4127	\n\
				4128	Like S.rfind() but raise ValueError when the substring is not found.";
				4129
				4130	static PyObject *
				4131	unicode_rindex(PyUnicodeObject self, PyObject args)
				4132	{
				4133	int result;
				4134	PyUnicodeObject *substring;
				4135	int start = 0;
				4136	int end = INT_MAX;
				4137
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4138	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4139	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4140	return NULL;
				4141	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4142	(PyObject *)substring);
				4143	if (substring == NULL)
				4144	return NULL;
				4145
				4146	result = findstring(self, substring, start, end, -1);
				4147
				4148	Py_DECREF(substring);
				4149	if (result < 0) {
				4150	PyErr_SetString(PyExc_ValueError, "substring not found");
				4151	return NULL;
				4152	}
				4153	return PyInt_FromLong(result);
				4154	}
				4155
				4156	static char rjust__doc__[] =
				4157	"S.rjust(width) -> unicode\n\
				4158	\n\
				4159	Return S right justified in a Unicode string of length width. Padding is\n\
				4160	done using spaces.";
				4161
				4162	static PyObject *
				4163	unicode_rjust(PyUnicodeObject self, PyObject args)
				4164	{
				4165	int width;
				4166	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4167	return NULL;
				4168
				4169	if (self->length >= width) {
				4170	Py_INCREF(self);
				4171	return (PyObject*) self;
				4172	}
				4173
				4174	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4175	}
				4176
				4177	static char rstrip__doc__[] =
				4178	"S.rstrip() -> unicode\n\
				4179	\n\
				4180	Return a copy of the string S with trailing whitespace removed.";
				4181
				4182	static PyObject *
				4183	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4184	{
				4185	if (!PyArg_NoArgs(args))
				4186	return NULL;
				4187	return strip(self, 0, 1);
				4188	}
				4189
				4190	static PyObject*
				4191	unicode_slice(PyUnicodeObject *self, int start, int end)
				4192	{
				4193	/* standard clamping */
				4194	if (start < 0)
				4195	start = 0;
				4196	if (end < 0)
				4197	end = 0;
				4198	if (end > self->length)
				4199	end = self->length;
				4200	if (start == 0 && end == self->length) {
				4201	/* full slice, return original string */
				4202	Py_INCREF(self);
				4203	return (PyObject*) self;
				4204	}
				4205	if (start > end)
				4206	start = end;
				4207	/* copy slice */
				4208	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4209	end - start);
				4210	}
				4211
				4212	PyObject PyUnicode_Split(PyObject s,
				4213	PyObject *sep,
				4214	int maxsplit)
				4215	{
				4216	PyObject *result;
				4217
				4218	s = PyUnicode_FromObject(s);
				4219	if (s == NULL)
				4220	return NULL;
				4221	if (sep != NULL) {
				4222	sep = PyUnicode_FromObject(sep);
				4223	if (sep == NULL) {
				4224	Py_DECREF(s);
				4225	return NULL;
				4226	}
				4227	}
				4228
				4229	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4230
				4231	Py_DECREF(s);
				4232	Py_XDECREF(sep);
				4233	return result;
				4234	}
				4235
				4236	static char split__doc__[] =
				4237	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4238	\n\
				4239	Return a list of the words in S, using sep as the\n\
				4240	delimiter string. If maxsplit is given, at most maxsplit\n\
				4241	splits are done. If sep is not specified, any whitespace string\n\
				4242	is a separator.";
				4243
				4244	static PyObject*
				4245	unicode_split(PyUnicodeObject self, PyObject args)
				4246	{
				4247	PyObject *substring = Py_None;
				4248	int maxcount = -1;
				4249
				4250	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4251	return NULL;
				4252
				4253	if (substring == Py_None)
				4254	return split(self, NULL, maxcount);
				4255	else if (PyUnicode_Check(substring))
				4256	return split(self, (PyUnicodeObject *)substring, maxcount);
				4257	else
				4258	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4259	}
				4260
				4261	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4262	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4263	\n\
				4264	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4265	Line breaks are not included in the resulting list unless keepends\n\
				4266	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4267
				4268	static PyObject*
				4269	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4270	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4271	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4272
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4273	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4274	return NULL;
				4275
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4276	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4277	}
				4278
				4279	static
				4280	PyObject unicode_str(PyUnicodeObject self)
				4281	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4282	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4283	}
				4284
				4285	static char strip__doc__[] =
				4286	"S.strip() -> unicode\n\
				4287	\n\
				4288	Return a copy of S with leading and trailing whitespace removed.";
				4289
				4290	static PyObject *
				4291	unicode_strip(PyUnicodeObject self, PyObject args)
				4292	{
				4293	if (!PyArg_NoArgs(args))
				4294	return NULL;
				4295	return strip(self, 1, 1);
				4296	}
				4297
				4298	static char swapcase__doc__[] =
				4299	"S.swapcase() -> unicode\n\
				4300	\n\
				4301	Return a copy of S with uppercase characters converted to lowercase\n\
				4302	and vice versa.";
				4303
				4304	static PyObject*
				4305	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4306	{
				4307	if (!PyArg_NoArgs(args))
				4308	return NULL;
				4309	return fixup(self, fixswapcase);
				4310	}
				4311
				4312	static char translate__doc__[] =
				4313	"S.translate(table) -> unicode\n\
				4314	\n\
				4315	Return a copy of the string S, where all characters have been mapped\n\
				4316	through the given translation table, which must be a mapping of\n\
				4317	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4318	are left untouched. Characters mapped to None are deleted.";
				4319
				4320	static PyObject*
				4321	unicode_translate(PyUnicodeObject self, PyObject args)
				4322	{
				4323	PyObject *table;
				4324
				4325	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4326	return NULL;
				4327	return PyUnicode_TranslateCharmap(self->str,
				4328	self->length,
				4329	table,
				4330	"ignore");
				4331	}
				4332
				4333	static char upper__doc__[] =
				4334	"S.upper() -> unicode\n\
				4335	\n\
				4336	Return a copy of S converted to uppercase.";
				4337
				4338	static PyObject*
				4339	unicode_upper(PyUnicodeObject self, PyObject args)
				4340	{
				4341	if (!PyArg_NoArgs(args))
				4342	return NULL;
				4343	return fixup(self, fixupper);
				4344	}
				4345
				#if 0
				/* Docstring for the (currently compiled-out) "zfill" method. */
				static char zfill__doc__[] =
				    "S.zfill(width) -> unicode\n"
				    "\n"
				    "Pad a numeric string x with zeros on the left, to fill a field\n"
				    "of the specified width. The string x is never truncated.";

				/* Implement S.zfill(width).

				   Returns self (new reference) when it is already at least `width` long.
				   Otherwise left-pads with '0' via pad() and, if the first original
				   character is '+' or '-', moves that sign to the front.  Returns NULL
				   when argument parsing or pad() fails. */
				static PyObject *
				unicode_zfill(PyUnicodeObject *self, PyObject *args)
				{
				    int fill;
				    PyUnicodeObject *u;

				    int width;
				    if (!PyArg_ParseTuple(args, "i:zfill", &width))
				        return NULL;

				    if (self->length >= width) {
				        Py_INCREF(self);
				        return (PyObject*) self;
				    }

				    fill = width - self->length;

				    u = pad(self, fill, 0, '0');
				    /* u->str is read below, so a failed pad() must be caught first. */
				    if (u == NULL)
				        return NULL;

				    if (u->str[fill] == '+' || u->str[fill] == '-') {
				        /* move sign to beginning of string */
				        u->str[0] = u->str[fill];
				        u->str[fill] = '0';
				    }

				    return (PyObject*) u;
				}
				#endif
				4381
				#if 0
				/* Compiled-out helper: return unicode_freelist_size as a Python int.
				   Returns NULL when PyArg_NoArgs() rejects args. */
				static PyObject*
				unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
				{
				    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
				                              : NULL;
				}
				#endif
				4391
				4392	static char startswith__doc__[] =
				4393	"S.startswith(prefix[, start[, end]]) -> int\n\
				4394	\n\
				4395	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4396	optional start, test S beginning at that position. With optional end, stop\n\
				4397	comparing S at that position.";
				4398
				4399	static PyObject *
				4400	unicode_startswith(PyUnicodeObject *self,
				4401	PyObject *args)
				4402	{
				4403	PyUnicodeObject *substring;
				4404	int start = 0;
				4405	int end = INT_MAX;
				4406	PyObject *result;
				4407
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4408	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4409	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4410	return NULL;
				4411	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4412	(PyObject *)substring);
				4413	if (substring == NULL)
				4414	return NULL;
				4415
				4416	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4417
				4418	Py_DECREF(substring);
				4419	return result;
				4420	}
				4421
				4422
				4423	static char endswith__doc__[] =
				4424	"S.endswith(suffix[, start[, end]]) -> int\n\
				4425	\n\
				4426	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4427	optional start, test S beginning at that position. With optional end, stop\n\
				4428	comparing S at that position.";
				4429
				4430	static PyObject *
				4431	unicode_endswith(PyUnicodeObject *self,
				4432	PyObject *args)
				4433	{
				4434	PyUnicodeObject *substring;
				4435	int start = 0;
				4436	int end = INT_MAX;
				4437	PyObject *result;
				4438
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4439	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4440	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4441	return NULL;
				4442	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4443	(PyObject *)substring);
				4444	if (substring == NULL)
				4445	return NULL;
				4446
				4447	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4448
				4449	Py_DECREF(substring);
				4450	return result;
				4451	}
				4452
				4453
				4454	static PyMethodDef unicode_methods[] = {	/* method table; unicode_getattr() passes it to Py_FindMethod() */
				4455	
				4456	/* Order is according to common usage: often used methods should
				4457	appear first, since lookup is done sequentially. */
				4458	
				4459	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},	/* row layout: name, C function, flags, docstring */
				4460	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4461	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4462	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4463	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4464	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4465	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4466	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4467	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4468	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4469	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4470	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4471	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4472	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4473	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4474	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4475	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4476	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4477	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4478	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4479	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4480	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4481	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4482	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4483	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4484	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},	/* flag 1: unicode_endswith parses its args with PyArg_ParseTuple -- presumably METH_VARARGS, TODO confirm */
				4485	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4486	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4487	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4488	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4489	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4490	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4491	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4492	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4493	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4494	#if 0
				4495	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4496	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4497	#endif
				4498	
				4499	#if 0
				4500	/* This one is just used for debugging the implementation. */
				4501	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4502	#endif
				4503	
				4504	{NULL, NULL}	/* end-of-table sentinel */
				4505	};
				4506
				4507	static PyObject *
				4508	unicode_getattr(PyUnicodeObject self, char name)
				4509	{
				4510	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4511	}
				4512
				4513	static PySequenceMethods unicode_as_sequence = {
				4514	(inquiry) unicode_length, /* sq_length */
				4515	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4516	(intargfunc) unicode_repeat, /* sq_repeat */
				4517	(intargfunc) unicode_getitem, /* sq_item */
				4518	(intintargfunc) unicode_slice, /* sq_slice */
				4519	0, /* sq_ass_item */
				4520	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4521	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4522	};
				4523
				4524	static int
				4525	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4526	int index,
				4527	const void **ptr)
				4528	{
				4529	if (index != 0) {
				4530	PyErr_SetString(PyExc_SystemError,
				4531	"accessing non-existent unicode segment");
				4532	return -1;
				4533	}
				4534	ptr = (void ) self->str;
				4535	return PyUnicode_GET_DATA_SIZE(self);
				4536	}
				4537
				4538	static int
				4539	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4540	const void **ptr)
				4541	{
				4542	PyErr_SetString(PyExc_TypeError,
				4543	"cannot use unicode as modifyable buffer");
				4544	return -1;
				4545	}
				4546
				4547	static int
				4548	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4549	int *lenp)
				4550	{
				4551	if (lenp)
				4552	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4553	return 1;
				4554	}
				4555
				4556	static int
				4557	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4558	int index,
				4559	const void **ptr)
				4560	{
				4561	PyObject *str;
				4562
				4563	if (index != 0) {
				4564	PyErr_SetString(PyExc_SystemError,
				4565	"accessing non-existent unicode segment");
				4566	return -1;
				4567	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4568	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4569	if (str == NULL)
				4570	return -1;
				4571	ptr = (void ) PyString_AS_STRING(str);
				4572	return PyString_GET_SIZE(str);
				4573	}
				4574
				4575	/* Helpers for PyUnicode_Format() */
				4576
				4577	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4578	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4579	{
				4580	int argidx = *p_argidx;
				4581	if (argidx < arglen) {
				4582	(*p_argidx)++;
				4583	if (arglen < 0)
				4584	return args;
				4585	else
				4586	return PyTuple_GetItem(args, argidx);
				4587	}
				4588	PyErr_SetString(PyExc_TypeError,
				4589	"not enough arguments for format string");
				4590	return NULL;
				4591	}
				4592
				4593	#define F_LJUST (1<<0)	/* '-' flag or negative '*' width: skip left padding, pad on the right */
				4594	#define F_SIGN (1<<1)	/* '+' flag: emit '+' when the number carries no sign of its own */
				4595	#define F_BLANK (1<<2)	/* ' ' flag: emit ' ' in the sign position when F_SIGN is not set */
				4596	#define F_ALT (1<<3)	/* '#' flag: forwarded as "#" to the C format; 0x/0X prefix is handled separately when padding */
				4597	#define F_ZERO (1<<4)	/* '0' flag: numeric conversions pad with '0' instead of ' ' */
				4598
				4599	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4600	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4601	{
				4602	register int i;
				4603	int len;
				4604	va_list va;
				4605	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4606	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4607
				4608	/* First, format the string as char array, then expand to Py_UNICODE
				4609	array. */
				4610	charbuffer = (char *)buffer;
				4611	len = vsprintf(charbuffer, format, va);
				4612	for (i = len - 1; i >= 0; i--)
				4613	buffer[i] = (Py_UNICODE) charbuffer[i];
				4614
				4615	va_end(va);
				4616	return len;
				4617	}
				4618
				4619	static int
				4620	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4621	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4622	int flags,
				4623	int prec,
				4624	int type,
				4625	PyObject *v)
				4626	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4627	/* fmt = '%#.' + `prec` + `type`
				4628	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4629	char fmt[20];
				4630	double x;
				4631
				4632	x = PyFloat_AsDouble(v);
				4633	if (x == -1.0 && PyErr_Occurred())
				4634	return -1;
				4635	if (prec < 0)
				4636	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4637	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4638	type = 'g';
				4639	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4640	/* worst case length calc to ensure no buffer overrun:
				4641	fmt = %#.<prec>g
				4642	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4643	for any double rep.)
				4644	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4645	If prec=0 the effective precision is 1 (the leading digit is
				4646	always given), therefore increase by one to 10+prec. */
				4647	if (buflen <= (size_t)10 + (size_t)prec) {
				4648	PyErr_SetString(PyExc_OverflowError,
				4649	"formatted float is too long (precision too long?)");
				4650	return -1;
				4651	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4652	return usprintf(buf, fmt, x);
				4653	}
				4654
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4655	static PyObject*
				4656	formatlong(PyObject *val, int flags, int prec, int type)
				4657	{
				4658	char *buf;
				4659	int i, len;
				4660	PyObject str; / temporary string object. */
				4661	PyUnicodeObject *result;
				4662
				4663	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4664	if (!str)
				4665	return NULL;
				4666	result = _PyUnicode_New(len);
				4667	for (i = 0; i < len; i++)
				4668	result->str[i] = buf[i];
				4669	result->str[len] = 0;
				4670	Py_DECREF(str);
				4671	return (PyObject*)result;
				4672	}
				4673
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4674	static int
				4675	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4676	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4677	int flags,
				4678	int prec,
				4679	int type,
				4680	PyObject *v)
				4681	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4682	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4683	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4684	+ 1 + 1 = 24*/
				4685	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4686	long x;
				4687
				4688	x = PyInt_AsLong(v);
				4689	if (x == -1 && PyErr_Occurred())
				4690	return -1;
				4691	if (prec < 0)
				4692	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4693	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4694	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4695	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4696	PyErr_SetString(PyExc_OverflowError,
				4697	"formatted integer is too long (precision too long?)");
				4698	return -1;
				4699	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4700	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4701	return usprintf(buf, fmt, x);
				4702	}
				4703
				4704	static int
				4705	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4706	size_t buflen,
				4707	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4708	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4709	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4710	if (PyUnicode_Check(v)) {
				4711	if (PyUnicode_GET_SIZE(v) != 1)
				4712	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4713	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4714	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4715
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4716	else if (PyString_Check(v)) {
				4717	if (PyString_GET_SIZE(v) != 1)
				4718	goto onError;
				4719	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4720	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4721
				4722	else {
				4723	/* Integer input truncated to a character */
				4724	long x;
				4725	x = PyInt_AsLong(v);
				4726	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4727	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4728	buf[0] = (char) x;
				4729	}
				4730	buf[1] = '\0';
				4731	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4732
				4733	onError:
				4734	PyErr_SetString(PyExc_TypeError,
				4735	"%c requires int or char");
				4736	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4737	}
				4738
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4739	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4740
				4741	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4742	chars are formatted. XXX This is a magic number. Each formatting
				4743	routine does bounds checking to ensure no overflow, but a better
				4744	solution may be to malloc a buffer of appropriate size for each
				4745	format. For now, the current solution is sufficient.
				4746	*/
				4747	#define FORMATBUFLEN (size_t)120
				4748
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4749	PyObject PyUnicode_Format(PyObject format,
				4750	PyObject *args)
				4751	{
				4752	Py_UNICODE fmt, res;
				4753	int fmtcnt, rescnt, reslen, arglen, argidx;
				4754	int args_owned = 0;
				4755	PyUnicodeObject *result = NULL;
				4756	PyObject *dict = NULL;
				4757	PyObject *uformat;
				4758
				4759	if (format == NULL \|\| args == NULL) {
				4760	PyErr_BadInternalCall();
				4761	return NULL;
				4762	}
				4763	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4764	if (uformat == NULL)
				4765	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4766	fmt = PyUnicode_AS_UNICODE(uformat);
				4767	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4768
				4769	reslen = rescnt = fmtcnt + 100;
				4770	result = _PyUnicode_New(reslen);
				4771	if (result == NULL)
				4772	goto onError;
				4773	res = PyUnicode_AS_UNICODE(result);
				4774
				4775	if (PyTuple_Check(args)) {
				4776	arglen = PyTuple_Size(args);
				4777	argidx = 0;
				4778	}
				4779	else {
				4780	arglen = -1;
				4781	argidx = -2;
				4782	}
				4783	if (args->ob_type->tp_as_mapping)
				4784	dict = args;
				4785
				4786	while (--fmtcnt >= 0) {
				4787	if (*fmt != '%') {
				4788	if (--rescnt < 0) {
				4789	rescnt = fmtcnt + 100;
				4790	reslen += rescnt;
				4791	if (_PyUnicode_Resize(result, reslen) < 0)
				4792	return NULL;
				4793	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4794	--rescnt;
				4795	}
				4796	res++ = fmt++;
				4797	}
				4798	else {
				4799	/* Got a format specifier */
				4800	int flags = 0;
				4801	int width = -1;
				4802	int prec = -1;
				4803	int size = 0;
				4804	Py_UNICODE c = '\0';
				4805	Py_UNICODE fill;
				4806	PyObject *v = NULL;
				4807	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4808	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4809	Py_UNICODE sign;
				4810	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4811	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4812
				4813	fmt++;
				4814	if (*fmt == '(') {
				4815	Py_UNICODE *keystart;
				4816	int keylen;
				4817	PyObject *key;
				4818	int pcount = 1;
				4819
				4820	if (dict == NULL) {
				4821	PyErr_SetString(PyExc_TypeError,
				4822	"format requires a mapping");
				4823	goto onError;
				4824	}
				4825	++fmt;
				4826	--fmtcnt;
				4827	keystart = fmt;
				4828	/* Skip over balanced parentheses */
				4829	while (pcount > 0 && --fmtcnt >= 0) {
				4830	if (*fmt == ')')
				4831	--pcount;
				4832	else if (*fmt == '(')
				4833	++pcount;
				4834	fmt++;
				4835	}
				4836	keylen = fmt - keystart - 1;
				4837	if (fmtcnt < 0 \|\| pcount > 0) {
				4838	PyErr_SetString(PyExc_ValueError,
				4839	"incomplete format key");
				4840	goto onError;
				4841	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4842	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4843	then looked up since Python uses strings to hold
				4844	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4845	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4846	key = PyUnicode_EncodeUTF8(keystart,
				4847	keylen,
				4848	NULL);
				4849	if (key == NULL)
				4850	goto onError;
				4851	if (args_owned) {
				4852	Py_DECREF(args);
				4853	args_owned = 0;
				4854	}
				4855	args = PyObject_GetItem(dict, key);
				4856	Py_DECREF(key);
				4857	if (args == NULL) {
				4858	goto onError;
				4859	}
				4860	args_owned = 1;
				4861	arglen = -1;
				4862	argidx = -2;
				4863	}
				4864	while (--fmtcnt >= 0) {
				4865	switch (c = *fmt++) {
				4866	case '-': flags \|= F_LJUST; continue;
				4867	case '+': flags \|= F_SIGN; continue;
				4868	case ' ': flags \|= F_BLANK; continue;
				4869	case '#': flags \|= F_ALT; continue;
				4870	case '0': flags \|= F_ZERO; continue;
				4871	}
				4872	break;
				4873	}
				4874	if (c == '*') {
				4875	v = getnextarg(args, arglen, &argidx);
				4876	if (v == NULL)
				4877	goto onError;
				4878	if (!PyInt_Check(v)) {
				4879	PyErr_SetString(PyExc_TypeError,
				4880	"* wants int");
				4881	goto onError;
				4882	}
				4883	width = PyInt_AsLong(v);
				4884	if (width < 0) {
				4885	flags \|= F_LJUST;
				4886	width = -width;
				4887	}
				4888	if (--fmtcnt >= 0)
				4889	c = *fmt++;
				4890	}
				4891	else if (c >= '0' && c <= '9') {
				4892	width = c - '0';
				4893	while (--fmtcnt >= 0) {
				4894	c = *fmt++;
				4895	if (c < '0' \|\| c > '9')
				4896	break;
				4897	if ((width*10) / 10 != width) {
				4898	PyErr_SetString(PyExc_ValueError,
				4899	"width too big");
				4900	goto onError;
				4901	}
				4902	width = width*10 + (c - '0');
				4903	}
				4904	}
				4905	if (c == '.') {
				4906	prec = 0;
				4907	if (--fmtcnt >= 0)
				4908	c = *fmt++;
				4909	if (c == '*') {
				4910	v = getnextarg(args, arglen, &argidx);
				4911	if (v == NULL)
				4912	goto onError;
				4913	if (!PyInt_Check(v)) {
				4914	PyErr_SetString(PyExc_TypeError,
				4915	"* wants int");
				4916	goto onError;
				4917	}
				4918	prec = PyInt_AsLong(v);
				4919	if (prec < 0)
				4920	prec = 0;
				4921	if (--fmtcnt >= 0)
				4922	c = *fmt++;
				4923	}
				4924	else if (c >= '0' && c <= '9') {
				4925	prec = c - '0';
				4926	while (--fmtcnt >= 0) {
				4927	c = Py_CHARMASK(*fmt++);
				4928	if (c < '0' \|\| c > '9')
				4929	break;
				4930	if ((prec*10) / 10 != prec) {
				4931	PyErr_SetString(PyExc_ValueError,
				4932	"prec too big");
				4933	goto onError;
				4934	}
				4935	prec = prec*10 + (c - '0');
				4936	}
				4937	}
				4938	} /* prec */
				4939	if (fmtcnt >= 0) {
				4940	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4941	size = c;
				4942	if (--fmtcnt >= 0)
				4943	c = *fmt++;
				4944	}
				4945	}
				4946	if (fmtcnt < 0) {
				4947	PyErr_SetString(PyExc_ValueError,
				4948	"incomplete format");
				4949	goto onError;
				4950	}
				4951	if (c != '%') {
				4952	v = getnextarg(args, arglen, &argidx);
				4953	if (v == NULL)
				4954	goto onError;
				4955	}
				4956	sign = 0;
				4957	fill = ' ';
				4958	switch (c) {
				4959
				4960	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4961	pbuf = formatbuf;
				4962	/* presume that buffer length is at least 1 */
				4963	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4964	len = 1;
				4965	break;
				4966
				4967	case 's':
				4968	case 'r':
				4969	if (PyUnicode_Check(v) && c == 's') {
				4970	temp = v;
				4971	Py_INCREF(temp);
				4972	}
				4973	else {
				4974	PyObject *unicode;
				4975	if (c == 's')
				4976	temp = PyObject_Str(v);
				4977	else
				4978	temp = PyObject_Repr(v);
				4979	if (temp == NULL)
				4980	goto onError;
				4981	if (!PyString_Check(temp)) {
				4982	/* XXX Note: this should never happen, since
				4983	PyObject_Repr() and PyObject_Str() assure
				4984	this */
				4985	Py_DECREF(temp);
				4986	PyErr_SetString(PyExc_TypeError,
				4987	"%s argument has non-string str()");
				4988	goto onError;
				4989	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4990	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4991	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4992	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4993	"strict");
				4994	Py_DECREF(temp);
				4995	temp = unicode;
				4996	if (temp == NULL)
				4997	goto onError;
				4998	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4999	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5000	len = PyUnicode_GET_SIZE(temp);
				5001	if (prec >= 0 && len > prec)
				5002	len = prec;
				5003	break;
				5004
				5005	case 'i':
				5006	case 'd':
				5007	case 'u':
				5008	case 'o':
				5009	case 'x':
				5010	case 'X':
				5011	if (c == 'i')
				5012	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5013	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5014	temp = formatlong(v, flags, prec, c);
				5015	if (!temp)
				5016	goto onError;
				5017	pbuf = PyUnicode_AS_UNICODE(temp);
				5018	len = PyUnicode_GET_SIZE(temp);
				5019	/* unbounded ints can always produce
				5020	a sign character! */
				5021	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5022	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5023	else {
				5024	pbuf = formatbuf;
				5025	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5026	flags, prec, c, v);
				5027	if (len < 0)
				5028	goto onError;
				5029	/* only d conversion is signed */
				5030	sign = c == 'd';
				5031	}
				5032	if (flags & F_ZERO)
				5033	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5034	break;
				5035
				5036	case 'e':
				5037	case 'E':
				5038	case 'f':
				5039	case 'g':
				5040	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5041	pbuf = formatbuf;
				5042	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5043	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5044	if (len < 0)
				5045	goto onError;
				5046	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5047	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5048	fill = '0';
				5049	break;
				5050
				5051	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5052	pbuf = formatbuf;
				5053	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5054	if (len < 0)
				5055	goto onError;
				5056	break;
				5057
				5058	default:
				5059	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5060	"unsupported format character '%c' (0x%x) "
				5061	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5062	(31<=c && c<=126) ? c : '?',
				5063	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5064	goto onError;
				5065	}
				5066	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5067	if (pbuf == '-' \|\| pbuf == '+') {
				5068	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5069	len--;
				5070	}
				5071	else if (flags & F_SIGN)
				5072	sign = '+';
				5073	else if (flags & F_BLANK)
				5074	sign = ' ';
				5075	else
				5076	sign = 0;
				5077	}
				5078	if (width < len)
				5079	width = len;
				5080	if (rescnt < width + (sign != 0)) {
				5081	reslen -= rescnt;
				5082	rescnt = width + fmtcnt + 100;
				5083	reslen += rescnt;
				5084	if (_PyUnicode_Resize(result, reslen) < 0)
				5085	return NULL;
				5086	res = PyUnicode_AS_UNICODE(result)
				5087	+ reslen - rescnt;
				5088	}
				5089	if (sign) {
				5090	if (fill != ' ')
				5091	*res++ = sign;
				5092	rescnt--;
				5093	if (width > len)
				5094	width--;
				5095	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5096	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5097	assert(pbuf[0] == '0');
				5098	assert(pbuf[1] == c);
				5099	if (fill != ' ') {
				5100	res++ = pbuf++;
				5101	res++ = pbuf++;
				5102	}
				5103	rescnt -= 2;
				5104	width -= 2;
				5105	if (width < 0)
				5106	width = 0;
				5107	len -= 2;
				5108	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5109	if (width > len && !(flags & F_LJUST)) {
				5110	do {
				5111	--rescnt;
				5112	*res++ = fill;
				5113	} while (--width > len);
				5114	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5115	if (fill == ' ') {
				5116	if (sign)
				5117	*res++ = sign;
				5118	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5119	assert(pbuf[0] == '0');
				5120	assert(pbuf[1] == c);
				5121	res++ = pbuf++;
				5122	res++ = pbuf++;
				5123	}
				5124	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5125	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5126	res += len;
				5127	rescnt -= len;
				5128	while (--width >= len) {
				5129	--rescnt;
				5130	*res++ = ' ';
				5131	}
				5132	if (dict && (argidx < arglen) && c != '%') {
				5133	PyErr_SetString(PyExc_TypeError,
				5134	"not all arguments converted");
				5135	goto onError;
				5136	}
				5137	Py_XDECREF(temp);
				5138	} /* '%' */
				5139	} /* until end */
				5140	if (argidx < arglen && !dict) {
				5141	PyErr_SetString(PyExc_TypeError,
				5142	"not all arguments converted");
				5143	goto onError;
				5144	}
				5145
				5146	if (args_owned) {
				5147	Py_DECREF(args);
				5148	}
				5149	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5150	if (_PyUnicode_Resize(result, reslen - rescnt))
				5151	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5152	return (PyObject *)result;
				5153
				5154	onError:
				5155	Py_XDECREF(result);
				5156	Py_DECREF(uformat);
				5157	if (args_owned) {
				5158	Py_DECREF(args);
				5159	}
				5160	return NULL;
				5161	}
				5162
				5163	static PyBufferProcs unicode_as_buffer = {	/* buffer protocol slots, referenced by tp_as_buffer in PyUnicode_Type */
				5164	(getreadbufferproc) unicode_buffer_getreadbuf,	/* read access to the raw Py_UNICODE data */
				5165	(getwritebufferproc) unicode_buffer_getwritebuf,	/* always fails with TypeError */
				5166	(getsegcountproc) unicode_buffer_getsegcount,	/* always reports one segment */
				5167	(getcharbufferproc) unicode_buffer_getcharbuf,	/* data in the default encoding */
				5168	};
				5169
				5170	PyTypeObject PyUnicode_Type = {	/* type object describing unicode objects */
				5171	PyObject_HEAD_INIT(&PyType_Type)
				5172	0, /* ob_size */
				5173	"unicode", /* tp_name */
				5174	sizeof(PyUnicodeObject), /* tp_basicsize */
				5175	0, /* tp_itemsize */
				5176	/* Slots */
				5177	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5178	0, /* tp_print */
				5179	(getattrfunc)unicode_getattr, /* tp_getattr: method lookup in unicode_methods */
				5180	0, /* tp_setattr */
				5181	(cmpfunc) unicode_compare, /* tp_compare */
				5182	(reprfunc) unicode_repr, /* tp_repr */
				5183	0, /* tp_as_number */
				5184	&unicode_as_sequence, /* tp_as_sequence */
				5185	0, /* tp_as_mapping */
				5186	(hashfunc) unicode_hash, /* tp_hash*/
				5187	0, /* tp_call*/
				5188	(reprfunc) unicode_str, /* tp_str */
				5189	(getattrofunc) NULL, /* tp_getattro */
				5190	(setattrofunc) NULL, /* tp_setattro */
				5191	&unicode_as_buffer, /* tp_as_buffer */
				5192	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5193	};
				5194
				5195	/* Initialize the Unicode implementation */
				5196
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5197	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5198	{
				5199	/* Doublecheck the configuration... */
				5200	if (sizeof(Py_UNICODE) != 2)
				5201	Py_FatalError("Unicode configuration error: "
				5202	"sizeof(Py_UNICODE) != 2 bytes");
				5203
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5204	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5205	unicode_freelist = NULL;
				5206	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5207	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5208	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5209	}
				5210
				5211	/* Finalize the Unicode implementation */
				5212
				5213	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5214	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5215	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5216	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5217
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5218	Py_XDECREF(unicode_empty);
				5219	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5220
				5221	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5222	PyUnicodeObject *v = u;
				5223	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5224	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5225	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5226	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5227	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5228	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5229	unicode_freelist = NULL;
				5230	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5231	}