Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: c237789a79edfef77e4ecb224b24e7781dc2f263 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Limit for the Unicode object free list: _PyUnicode_Free() only parks
   an object on the list while fewer than this many are already there. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   The limit is compared against the object's length, i.e. it is
   counted in Py_UNICODE elements, not in bytes.

   At worst this will result in roughly MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian unless the build
   configuration defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
/* The empty Unicode object.  Once set, _PyUnicode_New() returns a new
   reference to it for every zero-length request. */
static PyUnicodeObject *unicode_empty;

/* Free list for Unicode objects: head of the singly linked list and
   the number of objects currently parked on it. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

   The buffer is a fixed 100 bytes, which bounds the length of the
   encoding name that can be stored.

*/

static char unicode_default_encoding[100];
				102
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103	/* --- Unicode Object ----------------------------------------------------- */
				104
				105	static
				106	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				107	int length)
				108	{
				109	void *oldstr;
				110
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	111	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	112	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	113	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Resizing unicode_empty is not allowed. */
				116	if (unicode == unicode_empty) {
				117	PyErr_SetString(PyExc_SystemError,
				118	"can't resize empty unicode object");
				119	return -1;
				120	}
				121
				122	/* We allocate one more byte to make sure the string is
				123	Ux0000 terminated -- XXX is this needed ? */
				124	oldstr = unicode->str;
				125	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				126	if (!unicode->str) {
				127	unicode->str = oldstr;
				128	PyErr_NoMemory();
				129	return -1;
				130	}
				131	unicode->str[length] = 0;
				132	unicode->length = length;
				133
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	134	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	136	if (unicode->defenc) {
				137	Py_DECREF(unicode->defenc);
				138	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	}
				140	unicode->hash = -1;
				141
				142	return 0;
				143	}
				144
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	int PyUnicode_Resize(PyObject **unicode,
				146	int length)
				147	{
				148	PyUnicodeObject *v;
				149
				150	if (unicode == NULL) {
				151	PyErr_BadInternalCall();
				152	return -1;
				153	}
				154	v = (PyUnicodeObject )unicode;
				155	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				156	PyErr_BadInternalCall();
				157	return -1;
				158	}
				159	return _PyUnicode_Resize(v, length);
				160	}
				161
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	162	/* We allocate one more byte to make sure the string is
				163	Ux0000 terminated -- XXX is this needed ?
				164
				165	XXX This allocator could further be enhanced by assuring that the
				166	free list never reduces its size below 1.
				167
				168	*/
				169
				170	static
				171	PyUnicodeObject *_PyUnicode_New(int length)
				172	{
				173	register PyUnicodeObject *unicode;
				174
				175	/* Optimization for empty strings */
				176	if (length == 0 && unicode_empty != NULL) {
				177	Py_INCREF(unicode_empty);
				178	return unicode_empty;
				179	}
				180
				181	/* Unicode freelist & memory allocation */
				182	if (unicode_freelist) {
				183	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	184	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	185	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	186	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	187	/* Keep-Alive optimization: we only upsize the buffer,
				188	never downsize it. */
				189	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	190	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	191	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	192	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	193	}
				194	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	195	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	197	}
				198	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	}
				200	else {
				201	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				202	if (unicode == NULL)
				203	return NULL;
				204	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				205	}
				206
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	207	if (!unicode->str) {
				208	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	209	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode->str[length] = 0;
				212	unicode->length = length;
				213	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	214	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	215	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	216
				217	onError:
				218	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	219	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	220	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	}
				222
				223	static
				224	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				225	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	227	/* Keep-Alive optimization */
				228	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	229	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	230	unicode->str = NULL;
				231	unicode->length = 0;
				232	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	233	if (unicode->defenc) {
				234	Py_DECREF(unicode->defenc);
				235	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	236	}
				237	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	(PyUnicodeObject *)unicode = unicode_freelist;
				239	unicode_freelist = unicode;
				240	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	}
				242	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	243	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	244	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	}
				247	}
				248
				249	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				250	int size)
				251	{
				252	PyUnicodeObject *unicode;
				253
				254	unicode = _PyUnicode_New(size);
				255	if (!unicode)
				256	return NULL;
				257
				258	/* Copy the Unicode data into the new object */
				259	if (u != NULL)
				260	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				261
				262	return (PyObject *)unicode;
				263	}
				264
				265	#ifdef HAVE_WCHAR_H
				266
				267	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				268	int size)
				269	{
				270	PyUnicodeObject *unicode;
				271
				272	if (w == NULL) {
				273	PyErr_BadInternalCall();
				274	return NULL;
				275	}
				276
				277	unicode = _PyUnicode_New(size);
				278	if (!unicode)
				279	return NULL;
				280
				281	/* Copy the wchar_t data into the new object */
				282	#ifdef HAVE_USABLE_WCHAR_T
				283	memcpy(unicode->str, w, size * sizeof(wchar_t));
				284	#else
				285	{
				286	register Py_UNICODE *u;
				287	register int i;
				288	u = PyUnicode_AS_UNICODE(unicode);
				289	for (i = size; i >= 0; i--)
				290	u++ = w++;
				291	}
				292	#endif
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				298	register wchar_t *w,
				299	int size)
				300	{
				301	if (unicode == NULL) {
				302	PyErr_BadInternalCall();
				303	return -1;
				304	}
				305	if (size > PyUnicode_GET_SIZE(unicode))
				306	size = PyUnicode_GET_SIZE(unicode);
				307	#ifdef HAVE_USABLE_WCHAR_T
				308	memcpy(w, unicode->str, size * sizeof(wchar_t));
				309	#else
				310	{
				311	register Py_UNICODE *u;
				312	register int i;
				313	u = PyUnicode_AS_UNICODE(unicode);
				314	for (i = size; i >= 0; i--)
				315	w++ = u++;
				316	}
				317	#endif
				318
				319	return size;
				320	}
				321
				322	#endif
				323
				324	PyObject PyUnicode_FromObject(register PyObject obj)
				325	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	326	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				327	}
				328
				329	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				330	const char *encoding,
				331	const char *errors)
				332	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333	const char *s;
				334	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	335	int owned = 0;
				336	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	337
				338	if (obj == NULL) {
				339	PyErr_BadInternalCall();
				340	return NULL;
				341	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	342
				343	/* Coerce object */
				344	if (PyInstance_Check(obj)) {
				345	PyObject *func;
				346	func = PyObject_GetAttrString(obj, "__str__");
				347	if (func == NULL) {
				348	PyErr_SetString(PyExc_TypeError,
				349	"coercing to Unicode: instance doesn't define __str__");
				350	return NULL;
				351	}
				352	obj = PyEval_CallObject(func, NULL);
				353	Py_DECREF(func);
				354	if (obj == NULL)
				355	return NULL;
				356	owned = 1;
				357	}
				358	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	360	v = obj;
				361	if (encoding) {
				362	PyErr_SetString(PyExc_TypeError,
				363	"decoding Unicode is not supported");
				364	return NULL;
				365	}
				366	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	376	PyErr_Format(PyExc_TypeError,
				377	"coercing to Unicode: need string or buffer, "
				378	"%.80s found",
				379	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	380	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	381	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	382
				383	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	384	if (len == 0) {
				385	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	387	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	388	else
				389	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	390
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	391	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	392	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	394	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	395	return v;
				396
				397	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	398	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	399	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	400	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	401	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	}
				403
				404	PyObject PyUnicode_Decode(const char s,
				405	int size,
				406	const char *encoding,
				407	const char *errors)
				408	{
				409	PyObject buffer = NULL, unicode;
				410
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	411	if (encoding == NULL)
				412	encoding = PyUnicode_GetDefaultEncoding();
				413
				414	/* Shortcuts for common default encodings */
				415	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	417	else if (strcmp(encoding, "latin-1") == 0)
				418	return PyUnicode_DecodeLatin1(s, size, errors);
				419	else if (strcmp(encoding, "ascii") == 0)
				420	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	421
				422	/* Decode via the codec registry */
				423	buffer = PyBuffer_FromMemory((void *)s, size);
				424	if (buffer == NULL)
				425	goto onError;
				426	unicode = PyCodec_Decode(buffer, encoding, errors);
				427	if (unicode == NULL)
				428	goto onError;
				429	if (!PyUnicode_Check(unicode)) {
				430	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	431	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	432	unicode->ob_type->tp_name);
				433	Py_DECREF(unicode);
				434	goto onError;
				435	}
				436	Py_DECREF(buffer);
				437	return unicode;
				438
				439	onError:
				440	Py_XDECREF(buffer);
				441	return NULL;
				442	}
				443
				444	PyObject PyUnicode_Encode(const Py_UNICODE s,
				445	int size,
				446	const char *encoding,
				447	const char *errors)
				448	{
				449	PyObject v, unicode;
				450
				451	unicode = PyUnicode_FromUnicode(s, size);
				452	if (unicode == NULL)
				453	return NULL;
				454	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				455	Py_DECREF(unicode);
				456	return v;
				457	}
				458
				459	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				460	const char *encoding,
				461	const char *errors)
				462	{
				463	PyObject *v;
				464
				465	if (!PyUnicode_Check(unicode)) {
				466	PyErr_BadArgument();
				467	goto onError;
				468	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	469
				470	if (encoding == NULL)
				471	encoding = PyUnicode_GetDefaultEncoding();
				472
				473	/* Shortcuts for common default encodings */
				474	if (errors == NULL) {
				475	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	477	else if (strcmp(encoding, "latin-1") == 0)
				478	return PyUnicode_AsLatin1String(unicode);
				479	else if (strcmp(encoding, "ascii") == 0)
				480	return PyUnicode_AsASCIIString(unicode);
				481	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	482
				483	/* Encode via the codec registry */
				484	v = PyCodec_Encode(unicode, encoding, errors);
				485	if (v == NULL)
				486	goto onError;
				487	/* XXX Should we really enforce this ? */
				488	if (!PyString_Check(v)) {
				489	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	490	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	v->ob_type->tp_name);
				492	Py_DECREF(v);
				493	goto onError;
				494	}
				495	return v;
				496
				497	onError:
				498	return NULL;
				499	}
				500
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	501	/* Return a Python string holding the default encoded value of the
				502	Unicode object.
				503
				504	The resulting string is cached in the Unicode object for subsequent
				505	usage by this function. The cached version is needed to implement
				506	the character buffer interface and will live (at least) as long as
				507	the Unicode object itself.
				508
				509	The refcount of the string is not incremented.
				510
				511	* Exported for internal use by the interpreter only !!! *
				512
				513	*/
				514
				515	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				516	const char *errors)
				517	{
				518	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				519
				520	if (v)
				521	return v;
				522	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				523	if (v && errors == NULL)
				524	((PyUnicodeObject *)unicode)->defenc = v;
				525	return v;
				526	}
				527
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	528	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				529	{
				530	if (!PyUnicode_Check(unicode)) {
				531	PyErr_BadArgument();
				532	goto onError;
				533	}
				534	return PyUnicode_AS_UNICODE(unicode);
				535
				536	onError:
				537	return NULL;
				538	}
				539
				540	int PyUnicode_GetSize(PyObject *unicode)
				541	{
				542	if (!PyUnicode_Check(unicode)) {
				543	PyErr_BadArgument();
				544	goto onError;
				545	}
				546	return PyUnicode_GET_SIZE(unicode);
				547
				548	onError:
				549	return -1;
				550	}
				551
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	552	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	{
				554	return unicode_default_encoding;
				555	}
				556
				557	int PyUnicode_SetDefaultEncoding(const char *encoding)
				558	{
				559	PyObject *v;
				560
				561	/* Make sure the encoding is valid. As side effect, this also
				562	loads the encoding into the codec registry cache. */
				563	v = _PyCodec_Lookup(encoding);
				564	if (v == NULL)
				565	goto onError;
				566	Py_DECREF(v);
				567	strncpy(unicode_default_encoding,
				568	encoding,
				569	sizeof(unicode_default_encoding));
				570	return 0;
				571
				572	onError:
				573	return -1;
				574	}
				575
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	576	/* --- UTF-8 Codec -------------------------------------------------------- */
				577
/* Lookup table indexed by the first byte of a UTF-8 sequence.

   Rows are 16 entries wide, so row k covers bytes 0x{k}0..0x{k}F:
   0x00-0x7F map to 1, 0x80-0xBF (continuation bytes) map to 0,
   0xC0-0xDF to 2, 0xE0-0xEF to 3, 0xF0-0xF7 to 4, 0xF8-0xFB to 5,
   0xFC-0xFD to 6 and 0xFE-0xFF to 0. */
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
				599
				600	static
				601	int utf8_decoding_error(const char **source,
				602	Py_UNICODE **dest,
				603	const char *errors,
				604	const char *details)
				605	{
				606	if ((errors == NULL) \|\|
				607	(strcmp(errors,"strict") == 0)) {
				608	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	609	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	610	details);
				611	return -1;
				612	}
				613	else if (strcmp(errors,"ignore") == 0) {
				614	(*source)++;
				615	return 0;
				616	}
				617	else if (strcmp(errors,"replace") == 0) {
				618	(*source)++;
				619	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				620	(*dest)++;
				621	return 0;
				622	}
				623	else {
				624	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	625	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	626	errors);
				627	return -1;
				628	}
				629	}
				630
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	631	PyObject PyUnicode_DecodeUTF8(const char s,
				632	int size,
				633	const char *errors)
				634	{
				635	int n;
				636	const char *e;
				637	PyUnicodeObject *unicode;
				638	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	639	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	640
				641	/* Note: size will always be longer than the resulting Unicode
				642	character count */
				643	unicode = _PyUnicode_New(size);
				644	if (!unicode)
				645	return NULL;
				646	if (size == 0)
				647	return (PyObject *)unicode;
				648
				649	/* Unpack UTF-8 encoded data */
				650	p = unicode->str;
				651	e = s + size;
				652
				653	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	654	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	655
				656	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	657	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	658	s++;
				659	continue;
				660	}
				661
				662	n = utf8_code_length[ch];
				663
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	if (s + n > e) {
				665	errmsg = "unexpected end of data";
				666	goto utf8Error;
				667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	668
				669	switch (n) {
				670
				671	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	672	errmsg = "unexpected code byte";
				673	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	674	break;
				675
				676	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	677	errmsg = "internal error";
				678	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	679	break;
				680
				681	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	682	if ((s[1] & 0xc0) != 0x80) {
				683	errmsg = "invalid data";
				684	goto utf8Error;
				685	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	687	if (ch < 0x80) {
				688	errmsg = "illegal encoding";
				689	goto utf8Error;
				690	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	692	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693	break;
				694
				695	case 3:
				696	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	(s[2] & 0xc0) != 0x80) {
				698	errmsg = "invalid data";
				699	goto utf8Error;
				700	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	701	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				703	errmsg = "illegal encoding";
				704	goto utf8Error;
				705	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	706	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	707	*p++ = (Py_UNICODE)ch;
				708	break;
				709
				710	case 4:
				711	if ((s[1] & 0xc0) != 0x80 \|\|
				712	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	(s[3] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	717	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				718	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				719	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	720	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				721	byte encoding */
				722	(ch > 0x10ffff)) { /* maximum value allowed for
				723	UTF-16 */
				724	errmsg = "illegal encoding";
				725	goto utf8Error;
				726	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	727	/* compute and append the two surrogates: */
				728
				729	/* translate from 10000..10FFFF to 0..FFFF */
				730	ch -= 0x10000;
				731
				732	/* high surrogate = top 10 bits added to D800 */
				733	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				734
				735	/* low surrogate = bottom 10 bits added to DC00 */
				736	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	break;
				738
				739	default:
				740	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	741	errmsg = "unsupported Unicode code range";
				742	goto utf8Error;
				743	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	744	}
				745	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	continue;
				747
				748	utf8Error:
				749	if (utf8_decoding_error(&s, &p, errors, errmsg))
				750	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	751	}
				752
				753	/* Adjust length */
				754	if (_PyUnicode_Resize(unicode, p - unicode->str))
				755	goto onError;
				756
				757	return (PyObject *)unicode;
				758
				759	onError:
				760	Py_DECREF(unicode);
				761	return NULL;
				762	}
				763
/* Disabled: the UTF-8 encoder copes with UTF-16 surrogates itself, so
   nothing reports encoding errors through this helper any more. */
#if 0
/* Apply the `errors` policy after a UTF-8 encoding failure described by
   `details`.

   Returns 0 when encoding may continue and -1 (with an exception set)
   when it must stop:
     - NULL or "strict": raise UnicodeError;
     - "ignore":         do nothing;
     - "replace":        write '?' at *dest and advance *dest;
     - anything else:    raise ValueError.
   `source` is accepted but not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* No policy at all means "strict". */
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	797
				798	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				799	int size,
				800	const char *errors)
				801	{
				802	PyObject *v;
				803	char *p;
				804	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	805	Py_UCS4 ch2;
				806	unsigned int cbAllocated = 3 * size;
				807	unsigned int cbWritten = 0;
				808	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	809
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	810	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	811	if (v == NULL)
				812	return NULL;
				813	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	814	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	815
				816	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	817	while (i < size) {
				818	Py_UCS4 ch = s[i++];
				819	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	820	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	821	cbWritten++;
				822	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	823	else if (ch < 0x0800) {
				824	*p++ = 0xc0 \| (ch >> 6);
				825	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	826	cbWritten += 2;
				827	}
				828	else {
				829	/* Check for high surrogate */
				830	if (0xD800 <= ch && ch <= 0xDBFF) {
				831	if (i != size) {
				832	ch2 = s[i];
				833	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				834
				835	if (cbWritten >= (cbAllocated - 4)) {
				836	/* Provide enough room for some more
				837	surrogates */
				838	cbAllocated += 4*10;
				839	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	840	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	}
				842
				843	/* combine the two values */
				844	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				845
				846	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	847	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	i++;
				849	cbWritten += 4;
				850	}
				851	}
				852	}
				853	else {
				854	*p++ = (char)(0xe0 \| (ch >> 12));
				855	cbWritten += 3;
				856	}
				857	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				858	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	859	}
				860	}
				861	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	862	if (_PyString_Resize(&v, p - q))
				863	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	864	return v;
				865
				866	onError:
				867	Py_DECREF(v);
				868	return NULL;
				869	}
				870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	871	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				872	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	873	if (!PyUnicode_Check(unicode)) {
				874	PyErr_BadArgument();
				875	return NULL;
				876	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	877	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				878	PyUnicode_GET_SIZE(unicode),
				879	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	880	}
				881
				882	/* --- UTF-16 Codec ------------------------------------------------------- */
				883
				884	static
				885	int utf16_decoding_error(const Py_UNICODE **source,
				886	Py_UNICODE **dest,
				887	const char *errors,
				888	const char *details)
				889	{
				890	if ((errors == NULL) \|\|
				891	(strcmp(errors,"strict") == 0)) {
				892	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	893	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	894	details);
				895	return -1;
				896	}
				897	else if (strcmp(errors,"ignore") == 0) {
				898	return 0;
				899	}
				900	else if (strcmp(errors,"replace") == 0) {
				901	if (dest) {
				902	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				903	(*dest)++;
				904	}
				905	return 0;
				906	}
				907	else {
				908	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	909	"UTF-16 decoding error; "
				910	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	911	errors);
				912	return -1;
				913	}
				914	}
				915
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	916	PyObject PyUnicode_DecodeUTF16(const char s,
				917	int size,
				918	const char *errors,
				919	int *byteorder)
				920	{
				921	PyUnicodeObject *unicode;
				922	Py_UNICODE *p;
				923	const Py_UNICODE q, e;
				924	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	925	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	926
				927	/* size should be an even number */
				928	if (size % sizeof(Py_UNICODE) != 0) {
				929	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				930	return NULL;
				931	/* The remaining input chars are ignored if we fall through
				932	here... */
				933	}
				934
				935	/* Note: size will always be longer than the resulting Unicode
				936	character count */
				937	unicode = _PyUnicode_New(size);
				938	if (!unicode)
				939	return NULL;
				940	if (size == 0)
				941	return (PyObject *)unicode;
				942
				943	/* Unpack UTF-16 encoded data */
				944	p = unicode->str;
				945	q = (Py_UNICODE *)s;
				946	e = q + (size / sizeof(Py_UNICODE));
				947
				948	if (byteorder)
				949	bo = *byteorder;
				950
				951	while (q < e) {
				952	register Py_UNICODE ch = *q++;
				953
				954	/* Check for BOM marks (U+FEFF) in the input and adjust
				955	current byte order setting accordingly. Swap input
				956	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				957	!) */
				958	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				959	if (ch == 0xFEFF) {
				960	bo = -1;
				961	continue;
				962	} else if (ch == 0xFFFE) {
				963	bo = 1;
				964	continue;
				965	}
				966	if (bo == 1)
				967	ch = (ch >> 8) \| (ch << 8);
				968	#else
				969	if (ch == 0xFEFF) {
				970	bo = 1;
				971	continue;
				972	} else if (ch == 0xFFFE) {
				973	bo = -1;
				974	continue;
				975	}
				976	if (bo == -1)
				977	ch = (ch >> 8) \| (ch << 8);
				978	#endif
				979	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				980	*p++ = ch;
				981	continue;
				982	}
				983
				984	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	985	if (q >= e) {
				986	errmsg = "unexpected end of data";
				987	goto utf16Error;
				988	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	989	if (0xDC00 <= q && q <= 0xDFFF) {
				990	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	991	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	/* This is valid data (a UTF-16 surrogate pair), but
				993	we are not able to store this information since our
				994	Py_UNICODE type only has 16 bits... this might
				995	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	996	errmsg = "code pairs are not supported";
				997	goto utf16Error;
				998	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	else
				1000	continue;
				1001	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1002	errmsg = "illegal encoding";
				1003	/* Fall through to report the error */
				1004
				1005	utf16Error:
				1006	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1007	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008	}
				1009
				1010	if (byteorder)
				1011	*byteorder = bo;
				1012
				1013	/* Adjust length */
				1014	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1015	goto onError;
				1016
				1017	return (PyObject *)unicode;
				1018
				1019	onError:
				1020	Py_DECREF(unicode);
				1021	return NULL;
				1022	}
				1023
				1024	#undef UTF16_ERROR
				1025
				1026	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1027	int size,
				1028	const char *errors,
				1029	int byteorder)
				1030	{
				1031	PyObject *v;
				1032	Py_UNICODE *p;
				1033	char *q;
				1034
				1035	/* We don't create UTF-16 pairs... */
				1036	v = PyString_FromStringAndSize(NULL,
				1037	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1038	if (v == NULL)
				1039	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1040
				1041	q = PyString_AS_STRING(v);
				1042	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043	if (byteorder == 0)
				1044	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1045	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1046	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1047	if (byteorder == 0 \|\|
				1048	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1049	byteorder == -1
				1050	#else
				1051	byteorder == 1
				1052	#endif
				1053	)
				1054	memcpy(p, s, size * sizeof(Py_UNICODE));
				1055	else
				1056	while (size-- > 0) {
				1057	Py_UNICODE ch = *s++;
				1058	*p++ = (ch >> 8) \| (ch << 8);
				1059	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	return v;
				1061	}
				1062
				1063	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1064	{
				1065	if (!PyUnicode_Check(unicode)) {
				1066	PyErr_BadArgument();
				1067	return NULL;
				1068	}
				1069	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1070	PyUnicode_GET_SIZE(unicode),
				1071	NULL,
				1072	0);
				1073	}
				1074
				1075	/* --- Unicode Escape Codec ----------------------------------------------- */
				1076
				1077	static
				1078	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1079	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	const char *errors,
				1081	const char *details)
				1082	{
				1083	if ((errors == NULL) \|\|
				1084	(strcmp(errors,"strict") == 0)) {
				1085	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1086	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	details);
				1088	return -1;
				1089	}
				1090	else if (strcmp(errors,"ignore") == 0) {
				1091	return 0;
				1092	}
				1093	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1094	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1095	return 0;
				1096	}
				1097	else {
				1098	PyErr_Format(PyExc_ValueError,
				1099	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1100	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1101	errors);
				1102	return -1;
				1103	}
				1104	}
				1105
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1106	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1107
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1109	int size,
				1110	const char *errors)
				1111	{
				1112	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1113	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1114	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1115	char* message;
				1116	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1118	/* Escaped strings will always be longer than the resulting
				1119	Unicode string, so we start with size here and then reduce the
				1120	length after conversion to the true value. */
				1121	v = _PyUnicode_New(size);
				1122	if (v == NULL)
				1123	goto onError;
				1124	if (size == 0)
				1125	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1126
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1127	p = buf = PyUnicode_AS_UNICODE(v);
				1128	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1129
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130	while (s < end) {
				1131	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1132	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1133	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1134
				1135	/* Non-escape characters are interpreted as Unicode ordinals */
				1136	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1137	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1138	continue;
				1139	}
				1140
				1141	/* \ - Escapes */
				1142	s++;
				1143	switch (*s++) {
				1144
				1145	/* \x escapes */
				1146	case '\n': break;
				1147	case '\\': *p++ = '\\'; break;
				1148	case '\'': *p++ = '\''; break;
				1149	case '\"': *p++ = '\"'; break;
				1150	case 'b': *p++ = '\b'; break;
				1151	case 'f': p++ = '\014'; break; / FF */
				1152	case 't': *p++ = '\t'; break;
				1153	case 'n': *p++ = '\n'; break;
				1154	case 'r': *p++ = '\r'; break;
				1155	case 'v': p++ = '\013'; break; / VT */
				1156	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1157
				1158	/* \OOO (octal) escapes */
				1159	case '0': case '1': case '2': case '3':
				1160	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1161	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1162	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1163	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1164	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1165	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1166	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1167	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1168	break;
				1169
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1170	/* hex escapes */
				1171	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1172	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1173	digits = 2;
				1174	message = "truncated \\xXX escape";
				1175	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1176
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1177	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1178	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1179	digits = 4;
				1180	message = "truncated \\uXXXX escape";
				1181	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1182
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1183	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1184	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1185	digits = 8;
				1186	message = "truncated \\UXXXXXXXX escape";
				1187	hexescape:
				1188	chr = 0;
				1189	for (i = 0; i < digits; i++) {
				1190	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1191	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1192	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1193	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1194	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1195	i++;
				1196	break;
				1197	}
				1198	chr = (chr<<4) & ~0xF;
				1199	if (c >= '0' && c <= '9')
				1200	chr += c - '0';
				1201	else if (c >= 'a' && c <= 'f')
				1202	chr += 10 + c - 'a';
				1203	else
				1204	chr += 10 + c - 'A';
				1205	}
				1206	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1207	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1208	/* when we get here, chr is a 32-bit unicode character */
				1209	if (chr <= 0xffff)
				1210	/* UCS-2 character */
				1211	*p++ = (Py_UNICODE) chr;
				1212	else if (chr <= 0x10ffff) {
				1213	/* UCS-4 character. store as two surrogate characters */
				1214	chr -= 0x10000L;
				1215	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1216	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1217	} else {
				1218	if (unicodeescape_decoding_error(
				1219	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1220	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1221	)
				1222	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1223	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1224	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1225	break;
				1226
				1227	/* \N{name} */
				1228	case 'N':
				1229	message = "malformed \\N character escape";
				1230	if (ucnhash_CAPI == NULL) {
				1231	/* load the unicode data module */
				1232	PyObject m, v;
				1233	m = PyImport_ImportModule("unicodedata");
				1234	if (m == NULL)
				1235	goto ucnhashError;
				1236	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1237	Py_DECREF(m);
				1238	if (v == NULL)
				1239	goto ucnhashError;
				1240	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1241	Py_DECREF(v);
				1242	if (ucnhash_CAPI == NULL)
				1243	goto ucnhashError;
				1244	}
				1245	if (*s == '{') {
				1246	const char *start = s+1;
				1247	/* look for the closing brace */
				1248	while (*s != '}' && s < end)
				1249	s++;
				1250	if (s > start && s < end && *s == '}') {
				1251	/* found a name. look it up in the unicode database */
				1252	message = "unknown Unicode character name";
				1253	s++;
				1254	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1255	goto store;
				1256	}
				1257	}
				1258	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1259	goto onError;
				1260	*p++ = x;
				1261	break;
				1262
				1263	default:
				1264	*p++ = '\\';
				1265	*p++ = (unsigned char)s[-1];
				1266	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1267	}
				1268	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1269	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1270	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1271	return (PyObject *)v;
				1272
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1273	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1274	PyErr_SetString(
				1275	PyExc_UnicodeError,
				1276	"\\N escapes not supported (can't load unicodedata module)"
				1277	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1278	return NULL;
				1279
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1280	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1281	Py_XDECREF(v);
				1282	return NULL;
				1283	}
				1284
				1285	/* Return a Unicode-Escape string version of the Unicode object.
				1286
				1287	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1288	appropriate.
				1289
				1290	*/
				1291
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1292	static const Py_UNICODE findchar(const Py_UNICODE s,
				1293	int size,
				1294	Py_UNICODE ch);
				1295
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1296	static
				1297	PyObject unicodeescape_string(const Py_UNICODE s,
				1298	int size,
				1299	int quotes)
				1300	{
				1301	PyObject *repr;
				1302	char *p;
				1303	char *q;
				1304
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1305	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1306
				1307	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1308	if (repr == NULL)
				1309	return NULL;
				1310
				1311	p = q = PyString_AS_STRING(repr);
				1312
				1313	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1314	*p++ = 'u';
				1315	*p++ = (findchar(s, size, '\'') &&
				1316	!findchar(s, size, '"')) ? '"' : '\'';
				1317	}
				1318	while (size-- > 0) {
				1319	Py_UNICODE ch = *s++;
				1320	/* Escape quotes */
				1321	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1322	*p++ = '\\';
				1323	*p++ = (char) ch;
				1324	}
				1325	/* Map 16-bit characters to '\uxxxx' */
				1326	else if (ch >= 256) {
				1327	*p++ = '\\';
				1328	*p++ = 'u';
				1329	*p++ = hexdigit[(ch >> 12) & 0xf];
				1330	*p++ = hexdigit[(ch >> 8) & 0xf];
				1331	*p++ = hexdigit[(ch >> 4) & 0xf];
				1332	*p++ = hexdigit[ch & 15];
				1333	}
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1334	/* Map special whitespace to '\t', \n', '\r' */
				1335	else if (ch == '\t') {
				1336	*p++ = '\\';
				1337	*p++ = 't';
				1338	}
				1339	else if (ch == '\n') {
				1340	*p++ = '\\';
				1341	*p++ = 'n';
				1342	}
				1343	else if (ch == '\r') {
				1344	*p++ = '\\';
				1345	*p++ = 'r';
				1346	}
				1347	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1348	else if (ch < ' ' \|\| ch >= 128) {
				1349	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1350	*p++ = 'x';
				1351	*p++ = hexdigit[(ch >> 4) & 0xf];
				1352	*p++ = hexdigit[ch & 15];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1353	}
				1354	/* Copy everything else as-is */
				1355	else
				1356	*p++ = (char) ch;
				1357	}
				1358	if (quotes)
				1359	*p++ = q[1];
				1360
				1361	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1362	if (_PyString_Resize(&repr, p - q))
				1363	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1364
				1365	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1366
				1367	onError:
				1368	Py_DECREF(repr);
				1369	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1370	}
				1371
				1372	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1373	int size)
				1374	{
				1375	return unicodeescape_string(s, size, 0);
				1376	}
				1377
				1378	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1379	{
				1380	if (!PyUnicode_Check(unicode)) {
				1381	PyErr_BadArgument();
				1382	return NULL;
				1383	}
				1384	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1385	PyUnicode_GET_SIZE(unicode));
				1386	}
				1387
				1388	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1389
				1390	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1391	int size,
				1392	const char *errors)
				1393	{
				1394	PyUnicodeObject *v;
				1395	Py_UNICODE p, buf;
				1396	const char *end;
				1397	const char *bs;
				1398
				1399	/* Escaped strings will always be longer than the resulting
				1400	Unicode string, so we start with size here and then reduce the
				1401	length after conversion to the true value. */
				1402	v = _PyUnicode_New(size);
				1403	if (v == NULL)
				1404	goto onError;
				1405	if (size == 0)
				1406	return (PyObject *)v;
				1407	p = buf = PyUnicode_AS_UNICODE(v);
				1408	end = s + size;
				1409	while (s < end) {
				1410	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1411	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1412	int i;
				1413
				1414	/* Non-escape characters are interpreted as Unicode ordinals */
				1415	if (*s != '\\') {
				1416	p++ = (unsigned char)s++;
				1417	continue;
				1418	}
				1419
				1420	/* \u-escapes are only interpreted iff the number of leading
				1421	backslashes if odd */
				1422	bs = s;
				1423	for (;s < end;) {
				1424	if (*s != '\\')
				1425	break;
				1426	p++ = (unsigned char)s++;
				1427	}
				1428	if (((s - bs) & 1) == 0 \|\|
				1429	s >= end \|\|
				1430	*s != 'u') {
				1431	continue;
				1432	}
				1433	p--;
				1434	s++;
				1435
				1436	/* \uXXXX with 4 hex digits */
				1437	for (x = 0, i = 0; i < 4; i++) {
				1438	c = (unsigned char)s[i];
				1439	if (!isxdigit(c)) {
				1440	if (unicodeescape_decoding_error(&s, &x, errors,
				1441	"truncated \\uXXXX"))
				1442	goto onError;
				1443	i++;
				1444	break;
				1445	}
				1446	x = (x<<4) & ~0xF;
				1447	if (c >= '0' && c <= '9')
				1448	x += c - '0';
				1449	else if (c >= 'a' && c <= 'f')
				1450	x += 10 + c - 'a';
				1451	else
				1452	x += 10 + c - 'A';
				1453	}
				1454	s += i;
				1455	*p++ = x;
				1456	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1457	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1458	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1459	return (PyObject *)v;
				1460
				1461	onError:
				1462	Py_XDECREF(v);
				1463	return NULL;
				1464	}
				1465
				1466	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1467	int size)
				1468	{
				1469	PyObject *repr;
				1470	char *p;
				1471	char *q;
				1472
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1473	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1474
				1475	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1476	if (repr == NULL)
				1477	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1478	if (size == 0)
				1479	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1480
				1481	p = q = PyString_AS_STRING(repr);
				1482	while (size-- > 0) {
				1483	Py_UNICODE ch = *s++;
				1484	/* Map 16-bit characters to '\uxxxx' */
				1485	if (ch >= 256) {
				1486	*p++ = '\\';
				1487	*p++ = 'u';
				1488	*p++ = hexdigit[(ch >> 12) & 0xf];
				1489	*p++ = hexdigit[(ch >> 8) & 0xf];
				1490	*p++ = hexdigit[(ch >> 4) & 0xf];
				1491	*p++ = hexdigit[ch & 15];
				1492	}
				1493	/* Copy everything else as-is */
				1494	else
				1495	*p++ = (char) ch;
				1496	}
				1497	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1498	if (_PyString_Resize(&repr, p - q))
				1499	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1500
				1501	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1502
				1503	onError:
				1504	Py_DECREF(repr);
				1505	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1506	}
				1507
				1508	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1509	{
				1510	if (!PyUnicode_Check(unicode)) {
				1511	PyErr_BadArgument();
				1512	return NULL;
				1513	}
				1514	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1515	PyUnicode_GET_SIZE(unicode));
				1516	}
				1517
				1518	/* --- Latin-1 Codec ------------------------------------------------------ */
				1519
				1520	PyObject PyUnicode_DecodeLatin1(const char s,
				1521	int size,
				1522	const char *errors)
				1523	{
				1524	PyUnicodeObject *v;
				1525	Py_UNICODE *p;
				1526
				1527	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1528	v = _PyUnicode_New(size);
				1529	if (v == NULL)
				1530	goto onError;
				1531	if (size == 0)
				1532	return (PyObject *)v;
				1533	p = PyUnicode_AS_UNICODE(v);
				1534	while (size-- > 0)
				1535	p++ = (unsigned char)s++;
				1536	return (PyObject *)v;
				1537
				1538	onError:
				1539	Py_XDECREF(v);
				1540	return NULL;
				1541	}
				1542
				1543	static
				1544	int latin1_encoding_error(const Py_UNICODE **source,
				1545	char **dest,
				1546	const char *errors,
				1547	const char *details)
				1548	{
				1549	if ((errors == NULL) \|\|
				1550	(strcmp(errors,"strict") == 0)) {
				1551	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1552	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1553	details);
				1554	return -1;
				1555	}
				1556	else if (strcmp(errors,"ignore") == 0) {
				1557	return 0;
				1558	}
				1559	else if (strcmp(errors,"replace") == 0) {
				1560	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1561	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1562	return 0;
				1563	}
				1564	else {
				1565	PyErr_Format(PyExc_ValueError,
				1566	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1567	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1568	errors);
				1569	return -1;
				1570	}
				1571	}
				1572
				/* Encode `size` Py_UNICODE units at `p` as Latin-1 into a new string object.
				   Ordinals >= 256 are handed to latin1_encoding_error() together with
				   `errors`; that helper may write a replacement byte or nothing at all,
				   so the result is shrunk afterwards when fewer than `size` bytes were
				   produced.  Returns the string, or NULL with an exception set. */
				1573	PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
				1574	int size,
				1575	const char *errors)
				1576	{
				1577	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1578	char *s, *start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1579
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1580	repr = PyString_FromStringAndSize(NULL, size);
				1581	if (repr == NULL)
				1582	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1583	if (size == 0)
				1584	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1585
				1586	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1587	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1588	while (size-- > 0) {
				1589	Py_UNICODE ch = *p++;
				1590	if (ch >= 256) {
				1591	if (latin1_encoding_error(&p, &s, errors,
				1592	"ordinal not in range(256)"))
				1593	goto onError;
				1594	}
				1595	else
				1596	*s++ = (char)ch;
				1597	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1598	/* Resize if error handling skipped some characters */
				1599	if (s - start < PyString_GET_SIZE(repr))
				1600	if (_PyString_Resize(&repr, s - start))
				1601	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1602	return repr;
				1603
				1604	onError:
				1605	Py_XDECREF(repr); /* a failed _PyString_Resize() leaves repr == NULL */
				1606	return NULL;
				1607	}
				1608
				/* Encode a Unicode object as Latin-1 using the default (NULL) error
				   policy.  Reports PyErr_BadArgument() and returns NULL when `unicode`
				   fails PyUnicode_Check(). */
				1609	PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
				1610	{
				1611	if (!PyUnicode_Check(unicode)) {
				1612	PyErr_BadArgument();
				1613	return NULL;
				1614	}
				1615	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1616	PyUnicode_GET_SIZE(unicode),
				1617	NULL);
				1618	}
				1619
				1620	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1621
				/* Apply the `errors` policy for a byte that is not valid ASCII.
				   NULL or "strict": set UnicodeError (message includes `details`), return -1.
				   "ignore": return 0 and leave *dest untouched.
				   "replace": store Py_UNICODE_REPLACEMENT_CHARACTER at *dest, advance
				   *dest, return 0.
				   Anything else: set ValueError naming the unknown policy, return -1.
				   `source` is not used by this helper. */
				1622	static
				1623	int ascii_decoding_error(const char **source,
				1624	Py_UNICODE **dest,
				1625	const char *errors,
				1626	const char *details)
				1627	{
				1628	if ((errors == NULL) ||
				1629	(strcmp(errors,"strict") == 0)) {
				1630	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1631	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1632	details);
				1633	return -1;
				1634	}
				1635	else if (strcmp(errors,"ignore") == 0) {
				1636	return 0;
				1637	}
				1638	else if (strcmp(errors,"replace") == 0) {
				1639	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1640	(*dest)++;
				1641	return 0;
				1642	}
				1643	else {
				1644	PyErr_Format(PyExc_ValueError,
				1645	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1646	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1647	errors);
				1648	return -1;
				1649	}
				1650	}
				1651
				/* Decode `size` bytes at `s` as 7-bit ASCII into a new Unicode object.
				   Bytes >= 128 are handed to ascii_decoding_error() together with
				   `errors`; because that helper may emit nothing, the result is shrunk
				   afterwards when fewer than the allocated units were written.
				   Returns the new object, or NULL with an exception set. */
				1652	PyObject *PyUnicode_DecodeASCII(const char *s,
				1653	int size,
				1654	const char *errors)
				1655	{
				1656	PyUnicodeObject *v;
				1657	Py_UNICODE *p;
				1658
				1659	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1660	v = _PyUnicode_New(size);
				1661	if (v == NULL)
				1662	goto onError;
				1663	if (size == 0)
				1664	return (PyObject *)v;
				1665	p = PyUnicode_AS_UNICODE(v);
				1666	while (size-- > 0) {
				1667	register unsigned char c;
				1668
				1669	c = (unsigned char)*s++;
				1670	if (c < 128)
				1671	*p++ = c;
				1672	else if (ascii_decoding_error(&s, &p, errors,
				1673	"ordinal not in range(128)"))
				1674	goto onError;
				1675	}
				/* v is a Unicode object, so its length must be read with the
				   Unicode accessor rather than the string one. */
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1676	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1677	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1678	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1679	return (PyObject *)v;
				1680
				1681	onError:
				1682	Py_XDECREF(v);
				1683	return NULL;
				1684	}
				1685
				/* Apply the `errors` policy for a character that does not fit ASCII.
				   NULL or "strict": set UnicodeError (message includes `details`), return -1.
				   "ignore": return 0 and leave *dest untouched.
				   "replace": store '?' at *dest, advance *dest, return 0.
				   Anything else: set ValueError naming the unknown policy, return -1.
				   `source` is not used by this helper. */
				1686	static
				1687	int ascii_encoding_error(const Py_UNICODE **source,
				1688	char **dest,
				1689	const char *errors,
				1690	const char *details)
				1691	{
				1692	if ((errors == NULL) ||
				1693	(strcmp(errors,"strict") == 0)) {
				1694	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1695	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1696	details);
				1697	return -1;
				1698	}
				1699	else if (strcmp(errors,"ignore") == 0) {
				1700	return 0;
				1701	}
				1702	else if (strcmp(errors,"replace") == 0) {
				1703	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1704	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1705	return 0;
				1706	}
				1707	else {
				1708	PyErr_Format(PyExc_ValueError,
				1709	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1710	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1711	errors);
				1712	return -1;
				1713	}
				1714	}
				1715
				/* Encode `size` Py_UNICODE units at `p` as 7-bit ASCII into a new string
				   object.  Ordinals >= 128 are handed to ascii_encoding_error() together
				   with `errors`; the result is shrunk afterwards when fewer than `size`
				   bytes were produced.  Returns the string, or NULL with an exception
				   set. */
				1716	PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
				1717	int size,
				1718	const char *errors)
				1719	{
				1720	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1721	char *s, *start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1722
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1723	repr = PyString_FromStringAndSize(NULL, size);
				1724	if (repr == NULL)
				1725	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1726	if (size == 0)
				1727	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1728
				1729	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1730	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	while (size-- > 0) {
				1732	Py_UNICODE ch = *p++;
				1733	if (ch >= 128) {
				1734	if (ascii_encoding_error(&p, &s, errors,
				1735	"ordinal not in range(128)"))
				1736	goto onError;
				1737	}
				1738	else
				1739	*s++ = (char)ch;
				1740	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1741	/* Resize if error handling skipped some characters */
				1742	if (s - start < PyString_GET_SIZE(repr))
				1743	if (_PyString_Resize(&repr, s - start))
				1744	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1745	return repr;
				1746
				1747	onError:
				1748	Py_XDECREF(repr); /* a failed _PyString_Resize() leaves repr == NULL */
				1749	return NULL;
				1750	}
				1751
				/* Encode a Unicode object as ASCII using the default (NULL) error
				   policy.  Reports PyErr_BadArgument() and returns NULL when `unicode`
				   fails PyUnicode_Check(). */
				1752	PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
				1753	{
				1754	if (!PyUnicode_Check(unicode)) {
				1755	PyErr_BadArgument();
				1756	return NULL;
				1757	}
				1758	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1759	PyUnicode_GET_SIZE(unicode),
				1760	NULL);
				1761	}
				1762
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1763	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1764
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1765	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1766
/* Decode `size` bytes at `s` with the Windows ANSI code page (CP_ACP).
   MultiByteToWideChar() is called twice: once with a NULL buffer to learn
   the output length, then again to fill the new Unicode object.
   `errors` is accepted but never consulted here.
   Returns the new object, or NULL with a Windows error set. */
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1767	PyObject *PyUnicode_DecodeMBCS(const char *s,
				1768	int size,
				1769	const char *errors)
				1770	{
				1771	PyUnicodeObject *v;
				1772	Py_UNICODE *p;
				1773
				1774	/* First get the size of the result */
				1775	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1776	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1777	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1778
				1779	v = _PyUnicode_New(usize);
				1780	if (v == NULL)
				1781	return NULL;
				1782	if (usize == 0)
				1783	return (PyObject *)v;
				1784	p = PyUnicode_AS_UNICODE(v);
				1785	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1786	Py_DECREF(v);
				1787	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1788	}
				1789
				1790	return (PyObject *)v;
				1791	}
				1792
				/* Encode `size` Py_UNICODE units at `p` with the Windows ANSI code page
				   (CP_ACP).  WideCharToMultiByte() is called twice: once with a NULL
				   buffer to learn the output length, then again to fill the new string.
				   `errors` is accepted but never consulted here.
				   Returns the string, or NULL with a Windows error set. */
				1793	PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
				1794	int size,
				1795	const char *errors)
				1796	{
				1797	PyObject *repr;
				1798	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1799	DWORD mbcssize;
				1800
				1801	/* If there are no characters, bail now! */
				1802	if (size==0)
				1803	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1804
				1805	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1806	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1807	if (mbcssize==0)
				1808	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1809
				1810	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1811	if (repr == NULL)
				1812	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1813	if (mbcssize == 0) /* cannot be true here: zero already returned above */
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1814	return repr;
				1815
				1816	/* Do the conversion */
				1817	s = PyString_AS_STRING(repr);
				1818	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1819	Py_DECREF(repr);
				1820	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1821	}
				1822	return repr;
				1823	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1824
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1825	#endif /* MS_WIN32 */
				1826
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1827	/* --- Character Mapping Codec -------------------------------------------- */
				1828
				/* Apply the `errors` policy for a byte whose charmap entry is undefined.
				   NULL or "strict": set UnicodeError (message includes `details`), return -1.
				   "ignore": return 0 and leave *dest untouched.
				   "replace": store Py_UNICODE_REPLACEMENT_CHARACTER at *dest, advance
				   *dest, return 0.
				   Anything else: set ValueError naming the unknown policy, return -1.
				   `source` is not used by this helper. */
				1829	static
				1830	int charmap_decoding_error(const char **source,
				1831	Py_UNICODE **dest,
				1832	const char *errors,
				1833	const char *details)
				1834	{
				1835	if ((errors == NULL) ||
				1836	(strcmp(errors,"strict") == 0)) {
				1837	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1838	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1839	details);
				1840	return -1;
				1841	}
				1842	else if (strcmp(errors,"ignore") == 0) {
				1843	return 0;
				1844	}
				1845	else if (strcmp(errors,"replace") == 0) {
				1846	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1847	(*dest)++;
				1848	return 0;
				1849	}
				1850	else {
				1851	PyErr_Format(PyExc_ValueError,
				1852	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1853	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1854	errors);
				1855	return -1;
				1856	}
				1857	}
				1858
				/* Decode `size` bytes at `s` through `mapping` into a new Unicode object.
				   With mapping == NULL this falls back to PyUnicode_DecodeLatin1().
				   For each byte, mapping[byte] is looked up with PyObject_GetItem():
				     - an integer in range(65536) is stored as one code unit;
				     - None, or a LookupError from the lookup, means "undefined" and is
				       handled by charmap_decoding_error() according to `errors`;
				     - a Unicode object is copied in (length 0 drops the byte, length > 1
				       grows the output buffer on demand);
				     - anything else raises TypeError.
				   Returns the new object, or NULL with an exception set. */
				1859	PyObject *PyUnicode_DecodeCharmap(const char *s,
				1860	int size,
				1861	PyObject *mapping,
				1862	const char *errors)
				1863	{
				1864	PyUnicodeObject *v;
				1865	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1866	int extrachars = 0; /* spare units already allocated for 1-n mappings */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1867
				1868	/* Default to Latin-1 */
				1869	if (mapping == NULL)
				1870	return PyUnicode_DecodeLatin1(s, size, errors);
				1871
				1872	v = _PyUnicode_New(size);
				1873	if (v == NULL)
				1874	goto onError;
				1875	if (size == 0)
				1876	return (PyObject *)v;
				1877	p = PyUnicode_AS_UNICODE(v);
				1878	while (size-- > 0) {
				1879	unsigned char ch = *s++;
				1880	PyObject *w, *x;
				1881
				1882	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1883	w = PyInt_FromLong((long)ch);
				1884	if (w == NULL)
				1885	goto onError;
				1886	x = PyObject_GetItem(mapping, w);
				1887	Py_DECREF(w);
				1888	if (x == NULL) {
				1889	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1890	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1891	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1892	x = Py_None;
				1893	Py_INCREF(x);
				1894	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1895	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1896	}
				1897
				1898	/* Apply mapping */
				1899	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1900	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1901	if (value < 0 || value > 65535) {
				1902	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1903	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1904	Py_DECREF(x);
				1905	goto onError;
				1906	}
				1907	*p++ = (Py_UNICODE)value;
				1908	}
				1909	else if (x == Py_None) {
				1910	/* undefined mapping */
				1911	if (charmap_decoding_error(&s, &p, errors,
				1912	"character maps to <undefined>")) {
				1913	Py_DECREF(x);
				1914	goto onError;
				1915	}
				1916	}
				1917	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1918	int targetsize = PyUnicode_GET_SIZE(x);
				1919
				1920	if (targetsize == 1)
				1921	/* 1-1 mapping */
				1922	*p++ = *PyUnicode_AS_UNICODE(x);
				1923
				1924	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1925	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1926	if (targetsize > extrachars) {
				1927	/* resize first */
				1928	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				1929	int needed = (targetsize - extrachars) + \
				1930	(targetsize << 2);
				1931	extrachars += needed;
				1932	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1933	Py_DECREF(x);
				1934	goto onError;
				1935	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1936	p = PyUnicode_AS_UNICODE(v) + oldpos; /* buffer may have moved */
				1937	}
				1938	Py_UNICODE_COPY(p,
				1939	PyUnicode_AS_UNICODE(x),
				1940	targetsize);
				1941	p += targetsize;
				1942	extrachars -= targetsize;
				1943	}
				1944	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1945	}
				1946	else {
				1947	/* wrong return value */
				1948	PyErr_SetString(PyExc_TypeError,
				1949	"character mapping must return integer, None or unicode");
				1950	Py_DECREF(x);
				1951	goto onError;
				1952	}
				1953	Py_DECREF(x);
				1954	}
				1955	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1956	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1957	goto onError;
				1958	return (PyObject *)v;
				1959
				1960	onError:
				1961	Py_XDECREF(v);
				1962	return NULL;
				1963	}
				1964
				/* Apply the `errors` policy for a character whose charmap entry is
				   undefined.
				   NULL or "strict": set UnicodeError (message includes `details`), return -1.
				   "ignore": return 0 and leave *dest untouched.
				   "replace": store '?' at *dest, advance *dest, return 0.
				   Anything else: set ValueError naming the unknown policy, return -1.
				   `source` is not used by this helper. */
				1965	static
				1966	int charmap_encoding_error(const Py_UNICODE **source,
				1967	char **dest,
				1968	const char *errors,
				1969	const char *details)
				1970	{
				1971	if ((errors == NULL) ||
				1972	(strcmp(errors,"strict") == 0)) {
				1973	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1974	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1975	details);
				1976	return -1;
				1977	}
				1978	else if (strcmp(errors,"ignore") == 0) {
				1979	return 0;
				1980	}
				1981	else if (strcmp(errors,"replace") == 0) {
				1982	**dest = '?';
				1983	(*dest)++;
				1984	return 0;
				1985	}
				1986	else {
				1987	PyErr_Format(PyExc_ValueError,
				1988	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1989	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1990	errors);
				1991	return -1;
				1992	}
				1993	}
				1994
				/* Encode `size` Py_UNICODE units at `p` through `mapping` into a new
				   string object.  With mapping == NULL this falls back to
				   PyUnicode_EncodeLatin1().
				   For each unit, mapping[ordinal] is looked up with PyObject_GetItem():
				     - an integer in range(256) is stored as one byte;
				     - None, or a LookupError from the lookup, means "undefined" and is
				       handled by charmap_encoding_error() according to `errors`;
				     - a string object is copied in (length 0 drops the character,
				       length > 1 grows the output buffer on demand);
				     - anything else raises TypeError.
				   Returns the string, or NULL with an exception set. */
				1995	PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
				1996	int size,
				1997	PyObject *mapping,
				1998	const char *errors)
				1999	{
				2000	PyObject *v;
				2001	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2002	int extrachars = 0; /* spare bytes already allocated for 1-n mappings */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2003
				2004	/* Default to Latin-1 */
				2005	if (mapping == NULL)
				2006	return PyUnicode_EncodeLatin1(p, size, errors);
				2007
				2008	v = PyString_FromStringAndSize(NULL, size);
				2009	if (v == NULL)
				2010	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2011	if (size == 0)
				2012	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2013	s = PyString_AS_STRING(v);
				2014	while (size-- > 0) {
				2015	Py_UNICODE ch = *p++;
				2016	PyObject *w, *x;
				2017
				2018	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2019	w = PyInt_FromLong((long)ch);
				2020	if (w == NULL)
				2021	goto onError;
				2022	x = PyObject_GetItem(mapping, w);
				2023	Py_DECREF(w);
				2024	if (x == NULL) {
				2025	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2026	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2027	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2028	x = Py_None;
				2029	Py_INCREF(x);
				2030	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2031	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2032	}
				2033
				2034	/* Apply mapping */
				2035	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2036	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2037	if (value < 0 || value > 255) {
				2038	PyErr_SetString(PyExc_TypeError,
				2039	"character mapping must be in range(256)");
				2040	Py_DECREF(x);
				2041	goto onError;
				2042	}
				2043	*s++ = (char)value;
				2044	}
				2045	else if (x == Py_None) {
				2046	/* undefined mapping */
				2047	if (charmap_encoding_error(&p, &s, errors,
				2048	"character maps to <undefined>")) {
				2049	Py_DECREF(x);
				2050	goto onError;
				2051	}
				2052	}
				2053	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2054	int targetsize = PyString_GET_SIZE(x);
				2055
				2056	if (targetsize == 1)
				2057	/* 1-1 mapping */
				2058	*s++ = *PyString_AS_STRING(x);
				2059
				2060	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2061	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2062	if (targetsize > extrachars) {
				2063	/* resize first */
				2064	int oldpos = (int)(s - PyString_AS_STRING(v));
				2065	int needed = (targetsize - extrachars) + \
				2066	(targetsize << 2);
				2067	extrachars += needed;
				2068	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2069	Py_DECREF(x);
				2070	goto onError;
				2071	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2072	s = PyString_AS_STRING(v) + oldpos; /* buffer may have moved */
				2073	}
				2074	memcpy(s,
				2075	PyString_AS_STRING(x),
				2076	targetsize);
				2077	s += targetsize;
				2078	extrachars -= targetsize;
				2079	}
				2080	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2081	}
				2082	else {
				2083	/* wrong return value (the accepted container type here is a
				   string object, see PyString_Check above) */
				2084	PyErr_SetString(PyExc_TypeError,
				2085	"character mapping must return integer, None or string");
				2086	Py_DECREF(x);
				2087	goto onError;
				2088	}
				2089	Py_DECREF(x);
				2090	}
				2091	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2092	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2093	goto onError;
				2094	return v;
				2095
				2096	onError:
				2097	Py_XDECREF(v); /* a failed _PyString_Resize() leaves v == NULL */
				2098	return NULL;
				2099	}
				2100
				/* Encode a Unicode object through `mapping` using the default (NULL)
				   error policy.  Reports PyErr_BadArgument() and returns NULL when
				   `unicode` fails PyUnicode_Check() or `mapping` is NULL. */
				2101	PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
				2102	PyObject *mapping)
				2103	{
				2104	if (!PyUnicode_Check(unicode) || mapping == NULL) {
				2105	PyErr_BadArgument();
				2106	return NULL;
				2107	}
				2108	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2109	PyUnicode_GET_SIZE(unicode),
				2110	mapping,
				2111	NULL);
				2112	}
				2113
				/* Apply the `errors` policy for a character whose translate-table entry
				   is None.
				   NULL or "strict": set UnicodeError (message includes `details`), return -1.
				   "ignore": return 0 and leave *dest untouched.
				   "replace": store '?' at *dest, advance *dest, return 0.
				   Anything else: set ValueError naming the unknown policy, return -1.
				   `source` is not used by this helper. */
				2114	static
				2115	int translate_error(const Py_UNICODE **source,
				2116	Py_UNICODE **dest,
				2117	const char *errors,
				2118	const char *details)
				2119	{
				2120	if ((errors == NULL) ||
				2121	(strcmp(errors,"strict") == 0)) {
				2122	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2123	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2124	details);
				2125	return -1;
				2126	}
				2127	else if (strcmp(errors,"ignore") == 0) {
				2128	return 0;
				2129	}
				2130	else if (strcmp(errors,"replace") == 0) {
				2131	**dest = '?';
				2132	(*dest)++;
				2133	return 0;
				2134	}
				2135	else {
				2136	PyErr_Format(PyExc_ValueError,
				2137	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2138	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2139	errors);
				2140	return -1;
				2141	}
				2142	}
				2143
				/* Translate `size` Py_UNICODE units at `s` through `mapping` into a new
				   Unicode object.  `mapping` must not be NULL (PyErr_BadArgument()).
				   For each unit, mapping[ordinal] is looked up with PyObject_GetItem():
				     - a LookupError copies the unit through unchanged;
				     - an integer is stored as one code unit;
				     - None is handled by translate_error() according to `errors`;
				     - a Unicode object must have length exactly 1, otherwise
				       NotImplementedError is raised;
				     - anything else raises TypeError.
				   Returns the new object, or NULL with an exception set. */
				2144	PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
				2145	int size,
				2146	PyObject *mapping,
				2147	const char *errors)
				2148	{
				2149	PyUnicodeObject *v;
				2150	Py_UNICODE *p;
				2151
				2152	if (mapping == NULL) {
				2153	PyErr_BadArgument();
				2154	return NULL;
				2155	}
				2156
				2157	/* Output will never be longer than input */
				2158	v = _PyUnicode_New(size);
				2159	if (v == NULL)
				2160	goto onError;
				2161	if (size == 0)
				2162	goto done;
				2163	p = PyUnicode_AS_UNICODE(v);
				2164	while (size-- > 0) {
				2165	Py_UNICODE ch = *s++;
				2166	PyObject *w, *x;
				2167
				2168	/* Get mapping */
				2169	w = PyInt_FromLong(ch);
				2170	if (w == NULL)
				2171	goto onError;
				2172	x = PyObject_GetItem(mapping, w);
				2173	Py_DECREF(w);
				2174	if (x == NULL) {
				2175	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2176	/* No mapping found: default to 1-1 mapping */
				2177	PyErr_Clear();
				2178	*p++ = ch;
				2179	continue;
				2180	}
				2181	goto onError;
				2182	}
				2183
				2184	/* Apply mapping */
				/* NOTE(review): the integer is narrowed to Py_UNICODE without a
				   range check, unlike PyUnicode_DecodeCharmap -- confirm intended. */
				2185	if (PyInt_Check(x))
				2186	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2187	else if (x == Py_None) {
				2188	/* undefined mapping */
				2189	if (translate_error(&s, &p, errors,
				2190	"character maps to <undefined>")) {
				2191	Py_DECREF(x);
				2192	goto onError;
				2193	}
				2194	}
				2195	else if (PyUnicode_Check(x)) {
				2196	if (PyUnicode_GET_SIZE(x) != 1) {
				2197	/* 1-n mapping */
				2198	PyErr_SetString(PyExc_NotImplementedError,
				2199	"1-n mappings are currently not implemented");
				2200	Py_DECREF(x);
				2201	goto onError;
				2202	}
				2203	*p++ = *PyUnicode_AS_UNICODE(x);
				2204	}
				2205	else {
				2206	/* wrong return value */
				2207	PyErr_SetString(PyExc_TypeError,
				2208	"translate mapping must return integer, None or unicode");
				2209	Py_DECREF(x);
				2210	goto onError;
				2211	}
				2212	Py_DECREF(x);
				2213	}
				2214	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2215	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2216	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2217
				2218	done:
				2219	return (PyObject *)v;
				2220
				2221	onError:
				2222	Py_XDECREF(v);
				2223	return NULL;
				2224	}
				2225
				/* Object-level wrapper around PyUnicode_TranslateCharmap(): coerce `str`
				   with PyUnicode_FromObject(), translate its buffer through `mapping`
				   with the given `errors` policy, and release the coerced reference.
				   Returns the translated object, or NULL with an exception set. */
				2226	PyObject *PyUnicode_Translate(PyObject *str,
				2227	PyObject *mapping,
				2228	const char *errors)
				2229	{
				2230	PyObject *result;
				2231
				2232	str = PyUnicode_FromObject(str);
				2233	if (str == NULL)
				2234	goto onError;
				2235	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2236	PyUnicode_GET_SIZE(str),
				2237	mapping,
				2238	errors);
				2239	Py_DECREF(str);
				2240	return result;
				2241
				2242	onError:
				2243	Py_XDECREF(str);
				2244	return NULL;
				2245	}
				2246
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2247	/* --- Decimal Encoder ---------------------------------------------------- */
				2248
				/* Convert `length` Py_UNICODE units at `s` into a NUL-terminated byte
				   string written to `output`.
				     - whitespace (Py_UNICODE_ISSPACE) becomes ' ';
				     - characters with a decimal value (Py_UNICODE_TODECIMAL >= 0) become
				       the ASCII digit '0' + value;
				     - other ordinals in 1..255 are copied as a single byte;
				     - everything else is invalid: NULL/"strict" raises ValueError,
				       "ignore" drops it, "replace" writes '?'.  Any other `errors`
				       value matches no branch, so the character is dropped as well.
				   At most one byte is written per input unit plus the terminator; no
				   buffer size is passed in, so the caller must provide length + 1 bytes.
				   Returns 0 on success, -1 with an exception set on failure. */
				2249	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2250	int length,
				2251	char *output,
				2252	const char *errors)
				2253	{
				2254	Py_UNICODE *p, *end;
				2255
				2256	if (output == NULL) {
				2257	PyErr_BadArgument();
				2258	return -1;
				2259	}
				2260
				2261	p = s;
				2262	end = s + length;
				2263	while (p < end) {
				2264	register Py_UNICODE ch = *p++;
				2265	int decimal;
				2266
				2267	if (Py_UNICODE_ISSPACE(ch)) {
				2268	*output++ = ' ';
				2269	continue;
				2270	}
				2271	decimal = Py_UNICODE_TODECIMAL(ch);
				2272	if (decimal >= 0) {
				2273	*output++ = '0' + decimal;
				2274	continue;
				2275	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2276	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2277	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2278	continue;
				2279	}
				2280	/* All other characters are considered invalid */
				2281	if (errors == NULL || strcmp(errors, "strict") == 0) {
				2282	PyErr_SetString(PyExc_ValueError,
				2283	"invalid decimal Unicode string");
				2284	goto onError;
				2285	}
				2286	else if (strcmp(errors, "ignore") == 0)
				2287	continue;
				2288	else if (strcmp(errors, "replace") == 0) {
				2289	*output++ = '?';
				2290	continue;
				2291	}
				2292	}
				2293	/* 0-terminate the output string */
				2294	*output++ = '\0';
				2295	return 0;
				2296
				2297	onError:
				2298	return -1;
				2299	}
				2300
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2301	/* --- Helpers ------------------------------------------------------------ */
				2302
				/* Count occurrences of `substring` in `self` between `start` and `end`.
				   A negative `start` or `end` has self->length added and is then clamped
				   to 0; `end` is capped at self->length.  After a match the scan jumps
				   past it, so matches do not overlap.  An empty `substring` returns
				   end - start + 1.
				   NOTE(review): that expression goes negative when start > end + 1 --
				   confirm whether callers rely on or guard against this.
				   Py_UNICODE_MATCH is defined outside this view; presumably it tests for
				   `substring` at the given offset of `self`. */
				2303	static
				2304	int count(PyUnicodeObject *self,
				2305	int start,
				2306	int end,
				2307	PyUnicodeObject *substring)
				2308	{
				2309	int count = 0;
				2310
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2311	if (start < 0)
				2312	start += self->length;
				2313	if (start < 0)
				2314	start = 0;
				2315	if (end > self->length)
				2316	end = self->length;
				2317	if (end < 0)
				2318	end += self->length;
				2319	if (end < 0)
				2320	end = 0;
				2321
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2322	if (substring->length == 0)
				2323	return (end - start + 1);
				2324
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2325	end -= substring->length; /* last offset at which a match could start */
				2326
				2327	while (start <= end)
				2328	if (Py_UNICODE_MATCH(self, start, substring)) {
				2329	count++;
				2330	start += substring->length;
				2331	} else
				2332	start++;
				2333
				2334	return count;
				2335	}
				2336
				/* Object-level wrapper around count(): coerce both arguments with
				   PyUnicode_FromObject(), count `substr` in `str` over start..end, and
				   release the coerced references.
				   Returns the count, or -1 if either coercion fails. */
				2337	int PyUnicode_Count(PyObject *str,
				2338	PyObject *substr,
				2339	int start,
				2340	int end)
				2341	{
				2342	int result;
				2343
				2344	str = PyUnicode_FromObject(str);
				2345	if (str == NULL)
				2346	return -1;
				2347	substr = PyUnicode_FromObject(substr);
				2348	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2349	Py_DECREF(str); /* drop the reference obtained for str above */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2350	return -1;
				2351	}
				2352
				2353	result = count((PyUnicodeObject *)str,
				2354	start, end,
				2355	(PyUnicodeObject *)substr);
				2356
				2357	Py_DECREF(str);
				2358	Py_DECREF(substr);
				2359	return result;
				2360	}
				2361
				/* Find `substring` in `self` between `start` and `end`.
				   A negative `start` or `end` has self->length added and is then clamped
				   to 0; `end` is capped at self->length.  An empty `substring` returns
				   the normalized `start` before `end` is even examined.
				   direction < 0 scans from the highest candidate offset downwards,
				   otherwise from `start` upwards.
				   Returns the offset of the first match found in scan order, or -1.
				   Py_UNICODE_MATCH is defined outside this view; presumably it tests for
				   `substring` at the given offset of `self`. */
				2362	static
				2363	int findstring(PyUnicodeObject *self,
				2364	PyUnicodeObject *substring,
				2365	int start,
				2366	int end,
				2367	int direction)
				2368	{
				2369	if (start < 0)
				2370	start += self->length;
				2371	if (start < 0)
				2372	start = 0;
				2373
				2374	if (substring->length == 0)
				2375	return start;
				2376
				2377	if (end > self->length)
				2378	end = self->length;
				2379	if (end < 0)
				2380	end += self->length;
				2381	if (end < 0)
				2382	end = 0;
				2383
				2384	end -= substring->length; /* last offset at which a match could start */
				2385
				2386	if (direction < 0) {
				2387	for (; end >= start; end--)
				2388	if (Py_UNICODE_MATCH(self, end, substring))
				2389	return end;
				2390	} else {
				2391	for (; start <= end; start++)
				2392	if (Py_UNICODE_MATCH(self, start, substring))
				2393	return start;
				2394	}
				2395
				2396	return -1;
				2397	}
				2398
				/* Object-level wrapper around findstring(): coerce both arguments with
				   PyUnicode_FromObject(), search for `substr` in `str` over start..end
				   in the given `direction`, and release the coerced references.
				   Returns the match offset or -1 from findstring(); -1 is also returned
				   when either coercion fails. */
				2399	int PyUnicode_Find(PyObject *str,
				2400	PyObject *substr,
				2401	int start,
				2402	int end,
				2403	int direction)
				2404	{
				2405	int result;
				2406
				2407	str = PyUnicode_FromObject(str);
				2408	if (str == NULL)
				2409	return -1;
				2410	substr = PyUnicode_FromObject(substr);
				2411	if (substr == NULL) {
				2412	Py_DECREF(str); /* substr is NULL here; release the str reference instead */
				2413	return -1;
				2414	}
				2415
				2416	result = findstring((PyUnicodeObject *)str,
				2417	(PyUnicodeObject *)substr,
				2418	start, end, direction);
				2419	Py_DECREF(str);
				2420	Py_DECREF(substr);
				2421	return result;
				2422	}
				2423
				/* Test whether `substring` sits at one end of self[start:end].
				   A negative `start` or `end` has self->length added and is then clamped
				   to 0; `end` is capped at self->length.  An empty `substring` always
				   matches.  direction > 0 checks the tail of the range (offset
				   end - substring->length), otherwise the head (offset `start`).
				   Returns 1 on a match, 0 otherwise (including when the range is shorter
				   than `substring`).
				   Py_UNICODE_MATCH is defined outside this view; presumably it tests for
				   `substring` at the given offset of `self`. */
				2424	static
				2425	int tailmatch(PyUnicodeObject *self,
				2426	PyUnicodeObject *substring,
				2427	int start,
				2428	int end,
				2429	int direction)
				2430	{
				2431	if (start < 0)
				2432	start += self->length;
				2433	if (start < 0)
				2434	start = 0;
				2435
				2436	if (substring->length == 0)
				2437	return 1;
				2438
				2439	if (end > self->length)
				2440	end = self->length;
				2441	if (end < 0)
				2442	end += self->length;
				2443	if (end < 0)
				2444	end = 0;
				2445
				2446	end -= substring->length;
				2447	if (end < start)
				2448	return 0;
				2449
				2450	if (direction > 0) {
				2451	if (Py_UNICODE_MATCH(self, end, substring))
				2452	return 1;
				2453	} else {
				2454	if (Py_UNICODE_MATCH(self, start, substring))
				2455	return 1;
				2456	}
				2457
				2458	return 0;
				2459	}
				2460
				/* Object-level wrapper around tailmatch(): coerce both arguments with
				   PyUnicode_FromObject(), test `substr` against the head or tail of
				   str[start:end] according to `direction`, and release the coerced
				   references.
				   Returns 1 or 0 from tailmatch(), or -1 if either coercion fails. */
				2461	int PyUnicode_Tailmatch(PyObject *str,
				2462	PyObject *substr,
				2463	int start,
				2464	int end,
				2465	int direction)
				2466	{
				2467	int result;
				2468
				2469	str = PyUnicode_FromObject(str);
				2470	if (str == NULL)
				2471	return -1;
				2472	substr = PyUnicode_FromObject(substr);
				2473	if (substr == NULL) {
				2474	Py_DECREF(str); /* substr is NULL here; release the str reference instead */
				2475	return -1;
				2476	}
				2477
				2478	result = tailmatch((PyUnicodeObject *)str,
				2479	(PyUnicodeObject *)substr,
				2480	start, end, direction);
				2481	Py_DECREF(str);
				2482	Py_DECREF(substr);
				2483	return result;
				2484	}
				2485
				/* Return a pointer to the first unit equal to `ch` among the first
				   `size` units at `s`, or NULL when there is none. */
				2486	static
				2487	const Py_UNICODE *findchar(const Py_UNICODE *s,
				2488	int size,
				2489	Py_UNICODE ch)
				2490	{
				2491	/* like wcschr, but doesn't stop at NULL characters */
				2492
				2493	while (size-- > 0) {
				2494	if (*s == ch)
				2495	return s;
				2496	s++;
				2497	}
				2498
				2499	return NULL;
				2500	}
				2501
				2502	/* Apply fixfct filter to the Unicode object self and return a
				2503	reference to the modified object */
				/* fixfct is run on a fresh copy of self, never on self itself.
				   Returns a new reference (to the copy, or to self when fixfct reports
				   no change), or NULL if the copy could not be created. */
				2504
				2505	static
				2506	PyObject *fixup(PyUnicodeObject *self,
				2507	int (*fixfct)(PyUnicodeObject *s))
				2508	{
				2509
				2510	PyUnicodeObject *u;
				2511
				2512	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2513	self->length);
				2514	if (u == NULL)
				2515	return NULL;
				2516	if (!fixfct(u)) {
				2517	/* fixfct should return TRUE if it modified the buffer. If
				2518	FALSE, return a reference to the original buffer instead
				2519	(to save space, not time) */
				2520	Py_INCREF(self);
				2521	Py_DECREF(u);
				2522	return (PyObject*) self;
				2523	}
				2524	return (PyObject*) u;
				2525	}
				2526
				2527	static
				2528	int fixupper(PyUnicodeObject *self)
				2529	{
				2530	int len = self->length;
				2531	Py_UNICODE *s = self->str;
				2532	int status = 0;
				2533
				2534	while (len-- > 0) {
				2535	register Py_UNICODE ch;
				2536
				2537	ch = Py_UNICODE_TOUPPER(*s);
				2538	if (ch != *s) {
				2539	status = 1;
				2540	*s = ch;
				2541	}
				2542	s++;
				2543	}
				2544
				2545	return status;
				2546	}
				2547
				2548	static
				2549	int fixlower(PyUnicodeObject *self)
				2550	{
				2551	int len = self->length;
				2552	Py_UNICODE *s = self->str;
				2553	int status = 0;
				2554
				2555	while (len-- > 0) {
				2556	register Py_UNICODE ch;
				2557
				2558	ch = Py_UNICODE_TOLOWER(*s);
				2559	if (ch != *s) {
				2560	status = 1;
				2561	*s = ch;
				2562	}
				2563	s++;
				2564	}
				2565
				2566	return status;
				2567	}
				2568
				2569	static
				2570	int fixswapcase(PyUnicodeObject *self)
				2571	{
				2572	int len = self->length;
				2573	Py_UNICODE *s = self->str;
				2574	int status = 0;
				2575
				2576	while (len-- > 0) {
				2577	if (Py_UNICODE_ISUPPER(*s)) {
				2578	s = Py_UNICODE_TOLOWER(s);
				2579	status = 1;
				2580	} else if (Py_UNICODE_ISLOWER(*s)) {
				2581	s = Py_UNICODE_TOUPPER(s);
				2582	status = 1;
				2583	}
				2584	s++;
				2585	}
				2586
				2587	return status;
				2588	}
				2589
				2590	static
				2591	int fixcapitalize(PyUnicodeObject *self)
				2592	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2593	int len = self->length;
				2594	Py_UNICODE *s = self->str;
				2595	int status = 0;
				2596
				2597	if (len == 0)
				2598	return 0;
				2599	if (Py_UNICODE_ISLOWER(*s)) {
				2600	s = Py_UNICODE_TOUPPER(s);
				2601	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2602	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2603	s++;
				2604	while (--len > 0) {
				2605	if (Py_UNICODE_ISUPPER(*s)) {
				2606	s = Py_UNICODE_TOLOWER(s);
				2607	status = 1;
				2608	}
				2609	s++;
				2610	}
				2611	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2612	}
				2613
				2614	static
				2615	int fixtitle(PyUnicodeObject *self)
				2616	{
				2617	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2618	register Py_UNICODE *e;
				2619	int previous_is_cased;
				2620
				2621	/* Shortcut for single character strings */
				2622	if (PyUnicode_GET_SIZE(self) == 1) {
				2623	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2624	if (*p != ch) {
				2625	*p = ch;
				2626	return 1;
				2627	}
				2628	else
				2629	return 0;
				2630	}
				2631
				2632	e = p + PyUnicode_GET_SIZE(self);
				2633	previous_is_cased = 0;
				2634	for (; p < e; p++) {
				2635	register const Py_UNICODE ch = *p;
				2636
				2637	if (previous_is_cased)
				2638	*p = Py_UNICODE_TOLOWER(ch);
				2639	else
				2640	*p = Py_UNICODE_TOTITLE(ch);
				2641
				2642	if (Py_UNICODE_ISLOWER(ch) \|\|
				2643	Py_UNICODE_ISUPPER(ch) \|\|
				2644	Py_UNICODE_ISTITLE(ch))
				2645	previous_is_cased = 1;
				2646	else
				2647	previous_is_cased = 0;
				2648	}
				2649	return 1;
				2650	}
				2651
				2652	PyObject PyUnicode_Join(PyObject separator,
				2653	PyObject *seq)
				2654	{
				2655	Py_UNICODE *sep;
				2656	int seplen;
				2657	PyUnicodeObject *res = NULL;
				2658	int reslen = 0;
				2659	Py_UNICODE *p;
				2660	int seqlen = 0;
				2661	int sz = 100;
				2662	int i;
				2663
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2664	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2665	if (seqlen < 0 && PyErr_Occurred())
				2666	return NULL;
				2667
				2668	if (separator == NULL) {
				2669	Py_UNICODE blank = ' ';
				2670	sep = &blank;
				2671	seplen = 1;
				2672	}
				2673	else {
				2674	separator = PyUnicode_FromObject(separator);
				2675	if (separator == NULL)
				2676	return NULL;
				2677	sep = PyUnicode_AS_UNICODE(separator);
				2678	seplen = PyUnicode_GET_SIZE(separator);
				2679	}
				2680
				2681	res = _PyUnicode_New(sz);
				2682	if (res == NULL)
				2683	goto onError;
				2684	p = PyUnicode_AS_UNICODE(res);
				2685	reslen = 0;
				2686
				2687	for (i = 0; i < seqlen; i++) {
				2688	int itemlen;
				2689	PyObject *item;
				2690
				2691	item = PySequence_GetItem(seq, i);
				2692	if (item == NULL)
				2693	goto onError;
				2694	if (!PyUnicode_Check(item)) {
				2695	PyObject *v;
				2696	v = PyUnicode_FromObject(item);
				2697	Py_DECREF(item);
				2698	item = v;
				2699	if (item == NULL)
				2700	goto onError;
				2701	}
				2702	itemlen = PyUnicode_GET_SIZE(item);
				2703	while (reslen + itemlen + seplen >= sz) {
				2704	if (_PyUnicode_Resize(res, sz*2))
				2705	goto onError;
				2706	sz *= 2;
				2707	p = PyUnicode_AS_UNICODE(res) + reslen;
				2708	}
				2709	if (i > 0) {
				2710	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2711	p += seplen;
				2712	reslen += seplen;
				2713	}
				2714	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2715	p += itemlen;
				2716	reslen += itemlen;
				2717	Py_DECREF(item);
				2718	}
				2719	if (_PyUnicode_Resize(res, reslen))
				2720	goto onError;
				2721
				2722	Py_XDECREF(separator);
				2723	return (PyObject *)res;
				2724
				2725	onError:
				2726	Py_XDECREF(separator);
				2727	Py_DECREF(res);
				2728	return NULL;
				2729	}
				2730
				2731	static
				2732	PyUnicodeObject pad(PyUnicodeObject self,
				2733	int left,
				2734	int right,
				2735	Py_UNICODE fill)
				2736	{
				2737	PyUnicodeObject *u;
				2738
				2739	if (left < 0)
				2740	left = 0;
				2741	if (right < 0)
				2742	right = 0;
				2743
				2744	if (left == 0 && right == 0) {
				2745	Py_INCREF(self);
				2746	return self;
				2747	}
				2748
				2749	u = _PyUnicode_New(left + self->length + right);
				2750	if (u) {
				2751	if (left)
				2752	Py_UNICODE_FILL(u->str, fill, left);
				2753	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2754	if (right)
				2755	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2756	}
				2757
				2758	return u;
				2759	}
				2760
				2761	#define SPLIT_APPEND(data, left, right) \
				2762	str = PyUnicode_FromUnicode(data + left, right - left); \
				2763	if (!str) \
				2764	goto onError; \
				2765	if (PyList_Append(list, str)) { \
				2766	Py_DECREF(str); \
				2767	goto onError; \
				2768	} \
				2769	else \
				2770	Py_DECREF(str);
				2771
				2772	static
				2773	PyObject split_whitespace(PyUnicodeObject self,
				2774	PyObject *list,
				2775	int maxcount)
				2776	{
				2777	register int i;
				2778	register int j;
				2779	int len = self->length;
				2780	PyObject *str;
				2781
				2782	for (i = j = 0; i < len; ) {
				2783	/* find a token */
				2784	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2785	i++;
				2786	j = i;
				2787	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2788	i++;
				2789	if (j < i) {
				2790	if (maxcount-- <= 0)
				2791	break;
				2792	SPLIT_APPEND(self->str, j, i);
				2793	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2794	i++;
				2795	j = i;
				2796	}
				2797	}
				2798	if (j < len) {
				2799	SPLIT_APPEND(self->str, j, len);
				2800	}
				2801	return list;
				2802
				2803	onError:
				2804	Py_DECREF(list);
				2805	return NULL;
				2806	}
				2807
				2808	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2809	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2810	{
				2811	register int i;
				2812	register int j;
				2813	int len;
				2814	PyObject *list;
				2815	PyObject *str;
				2816	Py_UNICODE *data;
				2817
				2818	string = PyUnicode_FromObject(string);
				2819	if (string == NULL)
				2820	return NULL;
				2821	data = PyUnicode_AS_UNICODE(string);
				2822	len = PyUnicode_GET_SIZE(string);
				2823
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2824	list = PyList_New(0);
				2825	if (!list)
				2826	goto onError;
				2827
				2828	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2829	int eol;
				2830
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2831	/* Find a line and append it */
				2832	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2833	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2834
				2835	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2836	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2837	if (i < len) {
				2838	if (data[i] == '\r' && i + 1 < len &&
				2839	data[i+1] == '\n')
				2840	i += 2;
				2841	else
				2842	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2843	if (keepends)
				2844	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2845	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2846	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2847	j = i;
				2848	}
				2849	if (j < len) {
				2850	SPLIT_APPEND(data, j, len);
				2851	}
				2852
				2853	Py_DECREF(string);
				2854	return list;
				2855
				2856	onError:
				2857	Py_DECREF(list);
				2858	Py_DECREF(string);
				2859	return NULL;
				2860	}
				2861
				2862	static
				2863	PyObject split_char(PyUnicodeObject self,
				2864	PyObject *list,
				2865	Py_UNICODE ch,
				2866	int maxcount)
				2867	{
				2868	register int i;
				2869	register int j;
				2870	int len = self->length;
				2871	PyObject *str;
				2872
				2873	for (i = j = 0; i < len; ) {
				2874	if (self->str[i] == ch) {
				2875	if (maxcount-- <= 0)
				2876	break;
				2877	SPLIT_APPEND(self->str, j, i);
				2878	i = j = i + 1;
				2879	} else
				2880	i++;
				2881	}
				2882	if (j <= len) {
				2883	SPLIT_APPEND(self->str, j, len);
				2884	}
				2885	return list;
				2886
				2887	onError:
				2888	Py_DECREF(list);
				2889	return NULL;
				2890	}
				2891
				2892	static
				2893	PyObject split_substring(PyUnicodeObject self,
				2894	PyObject *list,
				2895	PyUnicodeObject *substring,
				2896	int maxcount)
				2897	{
				2898	register int i;
				2899	register int j;
				2900	int len = self->length;
				2901	int sublen = substring->length;
				2902	PyObject *str;
				2903
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2904	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2905	if (Py_UNICODE_MATCH(self, i, substring)) {
				2906	if (maxcount-- <= 0)
				2907	break;
				2908	SPLIT_APPEND(self->str, j, i);
				2909	i = j = i + sublen;
				2910	} else
				2911	i++;
				2912	}
				2913	if (j <= len) {
				2914	SPLIT_APPEND(self->str, j, len);
				2915	}
				2916	return list;
				2917
				2918	onError:
				2919	Py_DECREF(list);
				2920	return NULL;
				2921	}
				2922
				2923	#undef SPLIT_APPEND
				2924
				2925	static
				2926	PyObject split(PyUnicodeObject self,
				2927	PyUnicodeObject *substring,
				2928	int maxcount)
				2929	{
				2930	PyObject *list;
				2931
				2932	if (maxcount < 0)
				2933	maxcount = INT_MAX;
				2934
				2935	list = PyList_New(0);
				2936	if (!list)
				2937	return NULL;
				2938
				2939	if (substring == NULL)
				2940	return split_whitespace(self,list,maxcount);
				2941
				2942	else if (substring->length == 1)
				2943	return split_char(self,list,substring->str[0],maxcount);
				2944
				2945	else if (substring->length == 0) {
				2946	Py_DECREF(list);
				2947	PyErr_SetString(PyExc_ValueError, "empty separator");
				2948	return NULL;
				2949	}
				2950	else
				2951	return split_substring(self,list,substring,maxcount);
				2952	}
				2953
				2954	static
				2955	PyObject strip(PyUnicodeObject self,
				2956	int left,
				2957	int right)
				2958	{
				2959	Py_UNICODE *p = self->str;
				2960	int start = 0;
				2961	int end = self->length;
				2962
				2963	if (left)
				2964	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2965	start++;
				2966
				2967	if (right)
				2968	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2969	end--;
				2970
				2971	if (start == 0 && end == self->length) {
				2972	/* couldn't strip anything off, return original string */
				2973	Py_INCREF(self);
				2974	return (PyObject*) self;
				2975	}
				2976
				2977	return (PyObject*) PyUnicode_FromUnicode(
				2978	self->str + start,
				2979	end - start
				2980	);
				2981	}
				2982
				2983	static
				2984	PyObject replace(PyUnicodeObject self,
				2985	PyUnicodeObject *str1,
				2986	PyUnicodeObject *str2,
				2987	int maxcount)
				2988	{
				2989	PyUnicodeObject *u;
				2990
				2991	if (maxcount < 0)
				2992	maxcount = INT_MAX;
				2993
				2994	if (str1->length == 1 && str2->length == 1) {
				2995	int i;
				2996
				2997	/* replace characters */
				2998	if (!findchar(self->str, self->length, str1->str[0])) {
				2999	/* nothing to replace, return original string */
				3000	Py_INCREF(self);
				3001	u = self;
				3002	} else {
				3003	Py_UNICODE u1 = str1->str[0];
				3004	Py_UNICODE u2 = str2->str[0];
				3005
				3006	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3007	self->str,
				3008	self->length
				3009	);
				3010	if (u)
				3011	for (i = 0; i < u->length; i++)
				3012	if (u->str[i] == u1) {
				3013	if (--maxcount < 0)
				3014	break;
				3015	u->str[i] = u2;
				3016	}
				3017	}
				3018
				3019	} else {
				3020	int n, i;
				3021	Py_UNICODE *p;
				3022
				3023	/* replace strings */
				3024	n = count(self, 0, self->length, str1);
				3025	if (n > maxcount)
				3026	n = maxcount;
				3027	if (n == 0) {
				3028	/* nothing to replace, return original string */
				3029	Py_INCREF(self);
				3030	u = self;
				3031	} else {
				3032	u = _PyUnicode_New(
				3033	self->length + n * (str2->length - str1->length));
				3034	if (u) {
				3035	i = 0;
				3036	p = u->str;
				3037	while (i <= self->length - str1->length)
				3038	if (Py_UNICODE_MATCH(self, i, str1)) {
				3039	/* replace string segment */
				3040	Py_UNICODE_COPY(p, str2->str, str2->length);
				3041	p += str2->length;
				3042	i += str1->length;
				3043	if (--n <= 0) {
				3044	/* copy remaining part */
				3045	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3046	break;
				3047	}
				3048	} else
				3049	*p++ = self->str[i++];
				3050	}
				3051	}
				3052	}
				3053
				3054	return (PyObject *) u;
				3055	}
				3056
				3057	/* --- Unicode Object Methods --------------------------------------------- */
				3058
				3059	static char title__doc__[] =
				3060	"S.title() -> unicode\n\
				3061	\n\
				3062	Return a titlecased version of S, i.e. words start with title case\n\
				3063	characters, all remaining cased characters have lower case.";
				3064
				3065	static PyObject*
				3066	unicode_title(PyUnicodeObject self, PyObject args)
				3067	{
				3068	if (!PyArg_NoArgs(args))
				3069	return NULL;
				3070	return fixup(self, fixtitle);
				3071	}
				3072
				3073	static char capitalize__doc__[] =
				3074	"S.capitalize() -> unicode\n\
				3075	\n\
				3076	Return a capitalized version of S, i.e. make the first character\n\
				3077	have upper case.";
				3078
				3079	static PyObject*
				3080	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3081	{
				3082	if (!PyArg_NoArgs(args))
				3083	return NULL;
				3084	return fixup(self, fixcapitalize);
				3085	}
				3086
				3087	#if 0
				3088	static char capwords__doc__[] =
				3089	"S.capwords() -> unicode\n\
				3090	\n\
				3091	Apply .capitalize() to all words in S and return the result with\n\
				3092	normalized whitespace (all whitespace strings are replaced by ' ').";
				3093
				3094	static PyObject*
				3095	unicode_capwords(PyUnicodeObject self, PyObject args)
				3096	{
				3097	PyObject *list;
				3098	PyObject *item;
				3099	int i;
				3100
				3101	if (!PyArg_NoArgs(args))
				3102	return NULL;
				3103
				3104	/* Split into words */
				3105	list = split(self, NULL, -1);
				3106	if (!list)
				3107	return NULL;
				3108
				3109	/* Capitalize each word */
				3110	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3111	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3112	fixcapitalize);
				3113	if (item == NULL)
				3114	goto onError;
				3115	Py_DECREF(PyList_GET_ITEM(list, i));
				3116	PyList_SET_ITEM(list, i, item);
				3117	}
				3118
				3119	/* Join the words to form a new string */
				3120	item = PyUnicode_Join(NULL, list);
				3121
				3122	onError:
				3123	Py_DECREF(list);
				3124	return (PyObject *)item;
				3125	}
				3126	#endif
				3127
				3128	static char center__doc__[] =
				3129	"S.center(width) -> unicode\n\
				3130	\n\
				3131	Return S centered in a Unicode string of length width. Padding is done\n\
				3132	using spaces.";
				3133
				3134	static PyObject *
				3135	unicode_center(PyUnicodeObject self, PyObject args)
				3136	{
				3137	int marg, left;
				3138	int width;
				3139
				3140	if (!PyArg_ParseTuple(args, "i:center", &width))
				3141	return NULL;
				3142
				3143	if (self->length >= width) {
				3144	Py_INCREF(self);
				3145	return (PyObject*) self;
				3146	}
				3147
				3148	marg = width - self->length;
				3149	left = marg / 2 + (marg & width & 1);
				3150
				3151	return (PyObject*) pad(self, left, marg - left, ' ');
				3152	}
				3153
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3154	#if 0
				3155
				3156	/* This code should go into some future Unicode collation support
				3157	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3158	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3159
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3160	/* speedy UTF-16 code point order comparison */
				3161	/* gleaned from: */
				3162	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3163
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3164	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3165	{
				3166	0, 0, 0, 0, 0, 0, 0, 0,
				3167	0, 0, 0, 0, 0, 0, 0, 0,
				3168	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3169	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3170	};
				3171
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3172	static int
				3173	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3174	{
				3175	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3176
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3177	Py_UNICODE *s1 = str1->str;
				3178	Py_UNICODE *s2 = str2->str;
				3179
				3180	len1 = str1->length;
				3181	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3182
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3183	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3184	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3185	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3186
				3187	c1 = *s1++;
				3188	c2 = *s2++;
				3189	if (c1 > (1<<11) * 26)
				3190	c1 += utf16Fixup[c1>>11];
				3191	if (c2 > (1<<11) * 26)
				3192	c2 += utf16Fixup[c2>>11];
				3193
				3194	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3195	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3196	if (diff)
				3197	return (diff < 0) ? -1 : (diff != 0);
				3198	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3199	}
				3200
				3201	return (len1 < len2) ? -1 : (len1 != len2);
				3202	}
				3203
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3204	#else
				3205
				3206	static int
				3207	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3208	{
				3209	register int len1, len2;
				3210
				3211	Py_UNICODE *s1 = str1->str;
				3212	Py_UNICODE *s2 = str2->str;
				3213
				3214	len1 = str1->length;
				3215	len2 = str2->length;
				3216
				3217	while (len1 > 0 && len2 > 0) {
				3218	register long diff;
				3219
				3220	diff = (long)s1++ - (long)s2++;
				3221	if (diff)
				3222	return (diff < 0) ? -1 : (diff != 0);
				3223	len1--; len2--;
				3224	}
				3225
				3226	return (len1 < len2) ? -1 : (len1 != len2);
				3227	}
				3228
				3229	#endif
				3230
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3231	int PyUnicode_Compare(PyObject *left,
				3232	PyObject *right)
				3233	{
				3234	PyUnicodeObject u = NULL, v = NULL;
				3235	int result;
				3236
				3237	/* Coerce the two arguments */
				3238	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3239	if (u == NULL)
				3240	goto onError;
				3241	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3242	if (v == NULL)
				3243	goto onError;
				3244
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3245	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3246	if (v == u) {
				3247	Py_DECREF(u);
				3248	Py_DECREF(v);
				3249	return 0;
				3250	}
				3251
				3252	result = unicode_compare(u, v);
				3253
				3254	Py_DECREF(u);
				3255	Py_DECREF(v);
				3256	return result;
				3257
				3258	onError:
				3259	Py_XDECREF(u);
				3260	Py_XDECREF(v);
				3261	return -1;
				3262	}
				3263
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3264	int PyUnicode_Contains(PyObject *container,
				3265	PyObject *element)
				3266	{
				3267	PyUnicodeObject u = NULL, v = NULL;
				3268	int result;
				3269	register const Py_UNICODE p, e;
				3270	register Py_UNICODE ch;
				3271
				3272	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3273	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3274	if (v == NULL) {
				3275	PyErr_SetString(PyExc_TypeError,
				3276	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3277	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3278	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3279	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3280	if (u == NULL) {
				3281	Py_DECREF(v);
				3282	goto onError;
				3283	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3284
				3285	/* Check v in u */
				3286	if (PyUnicode_GET_SIZE(v) != 1) {
				3287	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3288	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3289	goto onError;
				3290	}
				3291	ch = *PyUnicode_AS_UNICODE(v);
				3292	p = PyUnicode_AS_UNICODE(u);
				3293	e = p + PyUnicode_GET_SIZE(u);
				3294	result = 0;
				3295	while (p < e) {
				3296	if (*p++ == ch) {
				3297	result = 1;
				3298	break;
				3299	}
				3300	}
				3301
				3302	Py_DECREF(u);
				3303	Py_DECREF(v);
				3304	return result;
				3305
				3306	onError:
				3307	Py_XDECREF(u);
				3308	Py_XDECREF(v);
				3309	return -1;
				3310	}
				3311
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3312	/* Concat to string or Unicode object giving a new Unicode object. */
				3313
				3314	PyObject PyUnicode_Concat(PyObject left,
				3315	PyObject *right)
				3316	{
				3317	PyUnicodeObject u = NULL, v = NULL, *w;
				3318
				3319	/* Coerce the two arguments */
				3320	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3321	if (u == NULL)
				3322	goto onError;
				3323	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3324	if (v == NULL)
				3325	goto onError;
				3326
				3327	/* Shortcuts */
				3328	if (v == unicode_empty) {
				3329	Py_DECREF(v);
				3330	return (PyObject *)u;
				3331	}
				3332	if (u == unicode_empty) {
				3333	Py_DECREF(u);
				3334	return (PyObject *)v;
				3335	}
				3336
				3337	/* Concat the two Unicode strings */
				3338	w = _PyUnicode_New(u->length + v->length);
				3339	if (w == NULL)
				3340	goto onError;
				3341	Py_UNICODE_COPY(w->str, u->str, u->length);
				3342	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3343
				3344	Py_DECREF(u);
				3345	Py_DECREF(v);
				3346	return (PyObject *)w;
				3347
				3348	onError:
				3349	Py_XDECREF(u);
				3350	Py_XDECREF(v);
				3351	return NULL;
				3352	}
				3353
				3354	static char count__doc__[] =
				3355	"S.count(sub[, start[, end]]) -> int\n\
				3356	\n\
				3357	Return the number of occurrences of substring sub in Unicode string\n\
				3358	S[start:end]. Optional arguments start and end are\n\
				3359	interpreted as in slice notation.";
				3360
				3361	static PyObject *
				3362	unicode_count(PyUnicodeObject self, PyObject args)
				3363	{
				3364	PyUnicodeObject *substring;
				3365	int start = 0;
				3366	int end = INT_MAX;
				3367	PyObject *result;
				3368
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3369	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3370	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3371	return NULL;
				3372
				3373	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3374	(PyObject *)substring);
				3375	if (substring == NULL)
				3376	return NULL;
				3377
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3378	if (start < 0)
				3379	start += self->length;
				3380	if (start < 0)
				3381	start = 0;
				3382	if (end > self->length)
				3383	end = self->length;
				3384	if (end < 0)
				3385	end += self->length;
				3386	if (end < 0)
				3387	end = 0;
				3388
				3389	result = PyInt_FromLong((long) count(self, start, end, substring));
				3390
				3391	Py_DECREF(substring);
				3392	return result;
				3393	}
				3394
				3395	static char encode__doc__[] =
				3396	"S.encode([encoding[,errors]]) -> string\n\
				3397	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3398	Return an encoded string version of S. Default encoding is the current\n\
				3399	default string encoding. errors may be given to set a different error\n\
				3400	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3401	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3402
				3403	static PyObject *
				3404	unicode_encode(PyUnicodeObject self, PyObject args)
				3405	{
				3406	char *encoding = NULL;
				3407	char *errors = NULL;
				3408	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3409	return NULL;
				3410	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3411	}
				3412
				3413	static char expandtabs__doc__[] =
				3414	"S.expandtabs([tabsize]) -> unicode\n\
				3415	\n\
				3416	Return a copy of S where all tab characters are expanded using spaces.\n\
				3417	If tabsize is not given, a tab size of 8 characters is assumed.";
				3418
				3419	static PyObject*
				3420	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3421	{
				3422	Py_UNICODE *e;
				3423	Py_UNICODE *p;
				3424	Py_UNICODE *q;
				3425	int i, j;
				3426	PyUnicodeObject *u;
				3427	int tabsize = 8;
				3428
				3429	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3430	return NULL;
				3431
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3432	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3433	i = j = 0;
				3434	e = self->str + self->length;
				3435	for (p = self->str; p < e; p++)
				3436	if (*p == '\t') {
				3437	if (tabsize > 0)
				3438	j += tabsize - (j % tabsize);
				3439	}
				3440	else {
				3441	j++;
				3442	if (p == '\n' \|\| p == '\r') {
				3443	i += j;
				3444	j = 0;
				3445	}
				3446	}
				3447
				3448	/* Second pass: create output string and fill it */
				3449	u = _PyUnicode_New(i + j);
				3450	if (!u)
				3451	return NULL;
				3452
				3453	j = 0;
				3454	q = u->str;
				3455
				3456	for (p = self->str; p < e; p++)
				3457	if (*p == '\t') {
				3458	if (tabsize > 0) {
				3459	i = tabsize - (j % tabsize);
				3460	j += i;
				3461	while (i--)
				3462	*q++ = ' ';
				3463	}
				3464	}
				3465	else {
				3466	j++;
				3467	q++ = p;
				3468	if (p == '\n' \|\| p == '\r')
				3469	j = 0;
				3470	}
				3471
				3472	return (PyObject*) u;
				3473	}
				3474
				3475	static char find__doc__[] =
				3476	"S.find(sub [,start [,end]]) -> int\n\
				3477	\n\
				3478	Return the lowest index in S where substring sub is found,\n\
				3479	such that sub is contained within s[start,end]. Optional\n\
				3480	arguments start and end are interpreted as in slice notation.\n\
				3481	\n\
				3482	Return -1 on failure.";
				3483
				3484	static PyObject *
				3485	unicode_find(PyUnicodeObject self, PyObject args)
				3486	{
				3487	PyUnicodeObject *substring;
				3488	int start = 0;
				3489	int end = INT_MAX;
				3490	PyObject *result;
				3491
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3492	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3493	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3494	return NULL;
				3495	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3496	(PyObject *)substring);
				3497	if (substring == NULL)
				3498	return NULL;
				3499
				3500	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3501
				3502	Py_DECREF(substring);
				3503	return result;
				3504	}
				3505
				3506	static PyObject *
				3507	unicode_getitem(PyUnicodeObject *self, int index)
				3508	{
				3509	if (index < 0 \|\| index >= self->length) {
				3510	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3511	return NULL;
				3512	}
				3513
				3514	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3515	}
				3516
				3517	static long
				3518	unicode_hash(PyUnicodeObject *self)
				3519	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3520	/* Since Unicode objects compare equal to their ASCII string
				3521	counterparts, they should use the individual character values
				3522	as basis for their hash value. This is needed to assure that
				3523	strings and Unicode objects behave in the same way as
				3524	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3525
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3526	register int len;
				3527	register Py_UNICODE *p;
				3528	register long x;
				3529
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3530	if (self->hash != -1)
				3531	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3532	len = PyUnicode_GET_SIZE(self);
				3533	p = PyUnicode_AS_UNICODE(self);
				3534	x = *p << 7;
				3535	while (--len >= 0)
				3536	x = (1000003x) ^ p++;
				3537	x ^= PyUnicode_GET_SIZE(self);
				3538	if (x == -1)
				3539	x = -2;
				3540	self->hash = x;
				3541	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3542	}
				3543
				3544	static char index__doc__[] =
				3545	"S.index(sub [,start [,end]]) -> int\n\
				3546	\n\
				3547	Like S.find() but raise ValueError when the substring is not found.";
				3548
				3549	static PyObject *
				3550	unicode_index(PyUnicodeObject self, PyObject args)
				3551	{
				3552	int result;
				3553	PyUnicodeObject *substring;
				3554	int start = 0;
				3555	int end = INT_MAX;
				3556
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3557	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3558	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3559	return NULL;
				3560
				3561	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3562	(PyObject *)substring);
				3563	if (substring == NULL)
				3564	return NULL;
				3565
				3566	result = findstring(self, substring, start, end, 1);
				3567
				3568	Py_DECREF(substring);
				3569	if (result < 0) {
				3570	PyErr_SetString(PyExc_ValueError, "substring not found");
				3571	return NULL;
				3572	}
				3573	return PyInt_FromLong(result);
				3574	}
				3575
				3576	static char islower__doc__[] =
				3577	"S.islower() -> int\n\
				3578	\n\
				3579	Return 1 if all cased characters in S are lowercase and there is\n\
				3580	at least one cased character in S, 0 otherwise.";
				3581
				3582	static PyObject*
				3583	unicode_islower(PyUnicodeObject self, PyObject args)
				3584	{
				3585	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3586	register const Py_UNICODE *e;
				3587	int cased;
				3588
				3589	if (!PyArg_NoArgs(args))
				3590	return NULL;
				3591
				3592	/* Shortcut for single character strings */
				3593	if (PyUnicode_GET_SIZE(self) == 1)
				3594	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3595
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3596	/* Special case for empty strings */
				3597	if (PyString_GET_SIZE(self) == 0)
				3598	return PyInt_FromLong(0);
				3599
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3600	e = p + PyUnicode_GET_SIZE(self);
				3601	cased = 0;
				3602	for (; p < e; p++) {
				3603	register const Py_UNICODE ch = *p;
				3604
				3605	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3606	return PyInt_FromLong(0);
				3607	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3608	cased = 1;
				3609	}
				3610	return PyInt_FromLong(cased);
				3611	}
				3612
				3613	static char isupper__doc__[] =
				3614	"S.isupper() -> int\n\
				3615	\n\
				3616	Return 1 if all cased characters in S are uppercase and there is\n\
				3617	at least one cased character in S, 0 otherwise.";
				3618
				3619	static PyObject*
				3620	unicode_isupper(PyUnicodeObject self, PyObject args)
				3621	{
				3622	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3623	register const Py_UNICODE *e;
				3624	int cased;
				3625
				3626	if (!PyArg_NoArgs(args))
				3627	return NULL;
				3628
				3629	/* Shortcut for single character strings */
				3630	if (PyUnicode_GET_SIZE(self) == 1)
				3631	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3632
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3633	/* Special case for empty strings */
				3634	if (PyString_GET_SIZE(self) == 0)
				3635	return PyInt_FromLong(0);
				3636
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3637	e = p + PyUnicode_GET_SIZE(self);
				3638	cased = 0;
				3639	for (; p < e; p++) {
				3640	register const Py_UNICODE ch = *p;
				3641
				3642	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3643	return PyInt_FromLong(0);
				3644	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3645	cased = 1;
				3646	}
				3647	return PyInt_FromLong(cased);
				3648	}
				3649
				3650	static char istitle__doc__[] =
				3651	"S.istitle() -> int\n\
				3652	\n\
				3653	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3654	may only follow uncased characters and lowercase characters only cased\n\
				3655	ones. Return 0 otherwise.";
				3656
				3657	static PyObject*
				3658	unicode_istitle(PyUnicodeObject self, PyObject args)
				3659	{
				3660	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3661	register const Py_UNICODE *e;
				3662	int cased, previous_is_cased;
				3663
				3664	if (!PyArg_NoArgs(args))
				3665	return NULL;
				3666
				3667	/* Shortcut for single character strings */
				3668	if (PyUnicode_GET_SIZE(self) == 1)
				3669	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3670	(Py_UNICODE_ISUPPER(*p) != 0));
				3671
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3672	/* Special case for empty strings */
				3673	if (PyString_GET_SIZE(self) == 0)
				3674	return PyInt_FromLong(0);
				3675
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3676	e = p + PyUnicode_GET_SIZE(self);
				3677	cased = 0;
				3678	previous_is_cased = 0;
				3679	for (; p < e; p++) {
				3680	register const Py_UNICODE ch = *p;
				3681
				3682	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3683	if (previous_is_cased)
				3684	return PyInt_FromLong(0);
				3685	previous_is_cased = 1;
				3686	cased = 1;
				3687	}
				3688	else if (Py_UNICODE_ISLOWER(ch)) {
				3689	if (!previous_is_cased)
				3690	return PyInt_FromLong(0);
				3691	previous_is_cased = 1;
				3692	cased = 1;
				3693	}
				3694	else
				3695	previous_is_cased = 0;
				3696	}
				3697	return PyInt_FromLong(cased);
				3698	}
				3699
				3700	static char isspace__doc__[] =
				3701	"S.isspace() -> int\n\
				3702	\n\
				3703	Return 1 if there are only whitespace characters in S,\n\
				3704	0 otherwise.";
				3705
				3706	static PyObject*
				3707	unicode_isspace(PyUnicodeObject self, PyObject args)
				3708	{
				3709	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3710	register const Py_UNICODE *e;
				3711
				3712	if (!PyArg_NoArgs(args))
				3713	return NULL;
				3714
				3715	/* Shortcut for single character strings */
				3716	if (PyUnicode_GET_SIZE(self) == 1 &&
				3717	Py_UNICODE_ISSPACE(*p))
				3718	return PyInt_FromLong(1);
				3719
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3720	/* Special case for empty strings */
				3721	if (PyString_GET_SIZE(self) == 0)
				3722	return PyInt_FromLong(0);
				3723
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3724	e = p + PyUnicode_GET_SIZE(self);
				3725	for (; p < e; p++) {
				3726	if (!Py_UNICODE_ISSPACE(*p))
				3727	return PyInt_FromLong(0);
				3728	}
				3729	return PyInt_FromLong(1);
				3730	}
				3731
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3732	static char isalpha__doc__[] =
				3733	"S.isalpha() -> int\n\
				3734	\n\
				3735	Return 1 if all characters in S are alphabetic\n\
				3736	and there is at least one character in S, 0 otherwise.";
				3737
				3738	static PyObject*
				3739	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3740	{
				3741	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3742	register const Py_UNICODE *e;
				3743
				3744	if (!PyArg_NoArgs(args))
				3745	return NULL;
				3746
				3747	/* Shortcut for single character strings */
				3748	if (PyUnicode_GET_SIZE(self) == 1 &&
				3749	Py_UNICODE_ISALPHA(*p))
				3750	return PyInt_FromLong(1);
				3751
				3752	/* Special case for empty strings */
				3753	if (PyString_GET_SIZE(self) == 0)
				3754	return PyInt_FromLong(0);
				3755
				3756	e = p + PyUnicode_GET_SIZE(self);
				3757	for (; p < e; p++) {
				3758	if (!Py_UNICODE_ISALPHA(*p))
				3759	return PyInt_FromLong(0);
				3760	}
				3761	return PyInt_FromLong(1);
				3762	}
				3763
				3764	static char isalnum__doc__[] =
				3765	"S.isalnum() -> int\n\
				3766	\n\
				3767	Return 1 if all characters in S are alphanumeric\n\
				3768	and there is at least one character in S, 0 otherwise.";
				3769
				3770	static PyObject*
				3771	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3772	{
				3773	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3774	register const Py_UNICODE *e;
				3775
				3776	if (!PyArg_NoArgs(args))
				3777	return NULL;
				3778
				3779	/* Shortcut for single character strings */
				3780	if (PyUnicode_GET_SIZE(self) == 1 &&
				3781	Py_UNICODE_ISALNUM(*p))
				3782	return PyInt_FromLong(1);
				3783
				3784	/* Special case for empty strings */
				3785	if (PyString_GET_SIZE(self) == 0)
				3786	return PyInt_FromLong(0);
				3787
				3788	e = p + PyUnicode_GET_SIZE(self);
				3789	for (; p < e; p++) {
				3790	if (!Py_UNICODE_ISALNUM(*p))
				3791	return PyInt_FromLong(0);
				3792	}
				3793	return PyInt_FromLong(1);
				3794	}
				3795
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3796	static char isdecimal__doc__[] =
				3797	"S.isdecimal() -> int\n\
				3798	\n\
				3799	Return 1 if there are only decimal characters in S,\n\
				3800	0 otherwise.";
				3801
				3802	static PyObject*
				3803	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3804	{
				3805	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3806	register const Py_UNICODE *e;
				3807
				3808	if (!PyArg_NoArgs(args))
				3809	return NULL;
				3810
				3811	/* Shortcut for single character strings */
				3812	if (PyUnicode_GET_SIZE(self) == 1 &&
				3813	Py_UNICODE_ISDECIMAL(*p))
				3814	return PyInt_FromLong(1);
				3815
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3816	/* Special case for empty strings */
				3817	if (PyString_GET_SIZE(self) == 0)
				3818	return PyInt_FromLong(0);
				3819
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3820	e = p + PyUnicode_GET_SIZE(self);
				3821	for (; p < e; p++) {
				3822	if (!Py_UNICODE_ISDECIMAL(*p))
				3823	return PyInt_FromLong(0);
				3824	}
				3825	return PyInt_FromLong(1);
				3826	}
				3827
				3828	static char isdigit__doc__[] =
				3829	"S.isdigit() -> int\n\
				3830	\n\
				3831	Return 1 if there are only digit characters in S,\n\
				3832	0 otherwise.";
				3833
				3834	static PyObject*
				3835	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3836	{
				3837	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3838	register const Py_UNICODE *e;
				3839
				3840	if (!PyArg_NoArgs(args))
				3841	return NULL;
				3842
				3843	/* Shortcut for single character strings */
				3844	if (PyUnicode_GET_SIZE(self) == 1 &&
				3845	Py_UNICODE_ISDIGIT(*p))
				3846	return PyInt_FromLong(1);
				3847
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3848	/* Special case for empty strings */
				3849	if (PyString_GET_SIZE(self) == 0)
				3850	return PyInt_FromLong(0);
				3851
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3852	e = p + PyUnicode_GET_SIZE(self);
				3853	for (; p < e; p++) {
				3854	if (!Py_UNICODE_ISDIGIT(*p))
				3855	return PyInt_FromLong(0);
				3856	}
				3857	return PyInt_FromLong(1);
				3858	}
				3859
				3860	static char isnumeric__doc__[] =
				3861	"S.isnumeric() -> int\n\
				3862	\n\
				3863	Return 1 if there are only numeric characters in S,\n\
				3864	0 otherwise.";
				3865
				3866	static PyObject*
				3867	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3868	{
				3869	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3870	register const Py_UNICODE *e;
				3871
				3872	if (!PyArg_NoArgs(args))
				3873	return NULL;
				3874
				3875	/* Shortcut for single character strings */
				3876	if (PyUnicode_GET_SIZE(self) == 1 &&
				3877	Py_UNICODE_ISNUMERIC(*p))
				3878	return PyInt_FromLong(1);
				3879
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3880	/* Special case for empty strings */
				3881	if (PyString_GET_SIZE(self) == 0)
				3882	return PyInt_FromLong(0);
				3883
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3884	e = p + PyUnicode_GET_SIZE(self);
				3885	for (; p < e; p++) {
				3886	if (!Py_UNICODE_ISNUMERIC(*p))
				3887	return PyInt_FromLong(0);
				3888	}
				3889	return PyInt_FromLong(1);
				3890	}
				3891
				3892	static char join__doc__[] =
				3893	"S.join(sequence) -> unicode\n\
				3894	\n\
				3895	Return a string which is the concatenation of the strings in the\n\
				3896	sequence. The separator between elements is S.";
				3897
				3898	static PyObject*
				3899	unicode_join(PyUnicodeObject self, PyObject args)
				3900	{
				3901	PyObject *data;
				3902	if (!PyArg_ParseTuple(args, "O:join", &data))
				3903	return NULL;
				3904
				3905	return PyUnicode_Join((PyObject *)self, data);
				3906	}
				3907
				3908	static int
				3909	unicode_length(PyUnicodeObject *self)
				3910	{
				3911	return self->length;
				3912	}
				3913
				3914	static char ljust__doc__[] =
				3915	"S.ljust(width) -> unicode\n\
				3916	\n\
				3917	Return S left justified in a Unicode string of length width. Padding is\n\
				3918	done using spaces.";
				3919
				3920	static PyObject *
				3921	unicode_ljust(PyUnicodeObject self, PyObject args)
				3922	{
				3923	int width;
				3924	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3925	return NULL;
				3926
				3927	if (self->length >= width) {
				3928	Py_INCREF(self);
				3929	return (PyObject*) self;
				3930	}
				3931
				3932	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3933	}
				3934
				3935	static char lower__doc__[] =
				3936	"S.lower() -> unicode\n\
				3937	\n\
				3938	Return a copy of the string S converted to lowercase.";
				3939
				3940	static PyObject*
				3941	unicode_lower(PyUnicodeObject self, PyObject args)
				3942	{
				3943	if (!PyArg_NoArgs(args))
				3944	return NULL;
				3945	return fixup(self, fixlower);
				3946	}
				3947
				3948	static char lstrip__doc__[] =
				3949	"S.lstrip() -> unicode\n\
				3950	\n\
				3951	Return a copy of the string S with leading whitespace removed.";
				3952
				3953	static PyObject *
				3954	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3955	{
				3956	if (!PyArg_NoArgs(args))
				3957	return NULL;
				3958	return strip(self, 1, 0);
				3959	}
				3960
				3961	static PyObject*
				3962	unicode_repeat(PyUnicodeObject *str, int len)
				3963	{
				3964	PyUnicodeObject *u;
				3965	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3966	int nchars;
				3967	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3968
				3969	if (len < 0)
				3970	len = 0;
				3971
				3972	if (len == 1) {
				3973	/* no repeat, return original string */
				3974	Py_INCREF(str);
				3975	return (PyObject*) str;
				3976	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3977
				3978	/* ensure # of chars needed doesn't overflow int and # of bytes
				3979	* needed doesn't overflow size_t
				3980	*/
				3981	nchars = len * str->length;
				3982	if (len && nchars / len != str->length) {
				3983	PyErr_SetString(PyExc_OverflowError,
				3984	"repeated string is too long");
				3985	return NULL;
				3986	}
				3987	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				3988	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				3989	PyErr_SetString(PyExc_OverflowError,
				3990	"repeated string is too long");
				3991	return NULL;
				3992	}
				3993	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3994	if (!u)
				3995	return NULL;
				3996
				3997	p = u->str;
				3998
				3999	while (len-- > 0) {
				4000	Py_UNICODE_COPY(p, str->str, str->length);
				4001	p += str->length;
				4002	}
				4003
				4004	return (PyObject*) u;
				4005	}
				4006
				4007	PyObject PyUnicode_Replace(PyObject obj,
				4008	PyObject *subobj,
				4009	PyObject *replobj,
				4010	int maxcount)
				4011	{
				4012	PyObject *self;
				4013	PyObject *str1;
				4014	PyObject *str2;
				4015	PyObject *result;
				4016
				4017	self = PyUnicode_FromObject(obj);
				4018	if (self == NULL)
				4019	return NULL;
				4020	str1 = PyUnicode_FromObject(subobj);
				4021	if (str1 == NULL) {
				4022	Py_DECREF(self);
				4023	return NULL;
				4024	}
				4025	str2 = PyUnicode_FromObject(replobj);
				4026	if (str2 == NULL) {
				4027	Py_DECREF(self);
				4028	Py_DECREF(str1);
				4029	return NULL;
				4030	}
				4031	result = replace((PyUnicodeObject *)self,
				4032	(PyUnicodeObject *)str1,
				4033	(PyUnicodeObject *)str2,
				4034	maxcount);
				4035	Py_DECREF(self);
				4036	Py_DECREF(str1);
				4037	Py_DECREF(str2);
				4038	return result;
				4039	}
				4040
				4041	static char replace__doc__[] =
				4042	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4043	\n\
				4044	Return a copy of S with all occurrences of substring\n\
				4045	old replaced by new. If the optional argument maxsplit is\n\
				4046	given, only the first maxsplit occurrences are replaced.";
				4047
				4048	static PyObject*
				4049	unicode_replace(PyUnicodeObject self, PyObject args)
				4050	{
				4051	PyUnicodeObject *str1;
				4052	PyUnicodeObject *str2;
				4053	int maxcount = -1;
				4054	PyObject *result;
				4055
				4056	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4057	return NULL;
				4058	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4059	if (str1 == NULL)
				4060	return NULL;
				4061	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4062	if (str2 == NULL)
				4063	return NULL;
				4064
				4065	result = replace(self, str1, str2, maxcount);
				4066
				4067	Py_DECREF(str1);
				4068	Py_DECREF(str2);
				4069	return result;
				4070	}
				4071
				4072	static
				4073	PyObject unicode_repr(PyObject unicode)
				4074	{
				4075	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4076	PyUnicode_GET_SIZE(unicode),
				4077	1);
				4078	}
				4079
				4080	static char rfind__doc__[] =
				4081	"S.rfind(sub [,start [,end]]) -> int\n\
				4082	\n\
				4083	Return the highest index in S where substring sub is found,\n\
				4084	such that sub is contained within s[start,end]. Optional\n\
				4085	arguments start and end are interpreted as in slice notation.\n\
				4086	\n\
				4087	Return -1 on failure.";
				4088
				4089	static PyObject *
				4090	unicode_rfind(PyUnicodeObject self, PyObject args)
				4091	{
				4092	PyUnicodeObject *substring;
				4093	int start = 0;
				4094	int end = INT_MAX;
				4095	PyObject *result;
				4096
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4097	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4098	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4099	return NULL;
				4100	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4101	(PyObject *)substring);
				4102	if (substring == NULL)
				4103	return NULL;
				4104
				4105	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4106
				4107	Py_DECREF(substring);
				4108	return result;
				4109	}
				4110
				4111	static char rindex__doc__[] =
				4112	"S.rindex(sub [,start [,end]]) -> int\n\
				4113	\n\
				4114	Like S.rfind() but raise ValueError when the substring is not found.";
				4115
				4116	static PyObject *
				4117	unicode_rindex(PyUnicodeObject self, PyObject args)
				4118	{
				4119	int result;
				4120	PyUnicodeObject *substring;
				4121	int start = 0;
				4122	int end = INT_MAX;
				4123
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4124	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4125	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4126	return NULL;
				4127	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4128	(PyObject *)substring);
				4129	if (substring == NULL)
				4130	return NULL;
				4131
				4132	result = findstring(self, substring, start, end, -1);
				4133
				4134	Py_DECREF(substring);
				4135	if (result < 0) {
				4136	PyErr_SetString(PyExc_ValueError, "substring not found");
				4137	return NULL;
				4138	}
				4139	return PyInt_FromLong(result);
				4140	}
				4141
				4142	static char rjust__doc__[] =
				4143	"S.rjust(width) -> unicode\n\
				4144	\n\
				4145	Return S right justified in a Unicode string of length width. Padding is\n\
				4146	done using spaces.";
				4147
				4148	static PyObject *
				4149	unicode_rjust(PyUnicodeObject self, PyObject args)
				4150	{
				4151	int width;
				4152	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4153	return NULL;
				4154
				4155	if (self->length >= width) {
				4156	Py_INCREF(self);
				4157	return (PyObject*) self;
				4158	}
				4159
				4160	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4161	}
				4162
				4163	static char rstrip__doc__[] =
				4164	"S.rstrip() -> unicode\n\
				4165	\n\
				4166	Return a copy of the string S with trailing whitespace removed.";
				4167
				4168	static PyObject *
				4169	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4170	{
				4171	if (!PyArg_NoArgs(args))
				4172	return NULL;
				4173	return strip(self, 0, 1);
				4174	}
				4175
				4176	static PyObject*
				4177	unicode_slice(PyUnicodeObject *self, int start, int end)
				4178	{
				4179	/* standard clamping */
				4180	if (start < 0)
				4181	start = 0;
				4182	if (end < 0)
				4183	end = 0;
				4184	if (end > self->length)
				4185	end = self->length;
				4186	if (start == 0 && end == self->length) {
				4187	/* full slice, return original string */
				4188	Py_INCREF(self);
				4189	return (PyObject*) self;
				4190	}
				4191	if (start > end)
				4192	start = end;
				4193	/* copy slice */
				4194	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4195	end - start);
				4196	}
				4197
				4198	PyObject PyUnicode_Split(PyObject s,
				4199	PyObject *sep,
				4200	int maxsplit)
				4201	{
				4202	PyObject *result;
				4203
				4204	s = PyUnicode_FromObject(s);
				4205	if (s == NULL)
				4206	return NULL;
				4207	if (sep != NULL) {
				4208	sep = PyUnicode_FromObject(sep);
				4209	if (sep == NULL) {
				4210	Py_DECREF(s);
				4211	return NULL;
				4212	}
				4213	}
				4214
				4215	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4216
				4217	Py_DECREF(s);
				4218	Py_XDECREF(sep);
				4219	return result;
				4220	}
				4221
				4222	static char split__doc__[] =
				4223	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4224	\n\
				4225	Return a list of the words in S, using sep as the\n\
				4226	delimiter string. If maxsplit is given, at most maxsplit\n\
				4227	splits are done. If sep is not specified, any whitespace string\n\
				4228	is a separator.";
				4229
				4230	static PyObject*
				4231	unicode_split(PyUnicodeObject self, PyObject args)
				4232	{
				4233	PyObject *substring = Py_None;
				4234	int maxcount = -1;
				4235
				4236	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4237	return NULL;
				4238
				4239	if (substring == Py_None)
				4240	return split(self, NULL, maxcount);
				4241	else if (PyUnicode_Check(substring))
				4242	return split(self, (PyUnicodeObject *)substring, maxcount);
				4243	else
				4244	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4245	}
				4246
				4247	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4248	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4249	\n\
				4250	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4251	Line breaks are not included in the resulting list unless keepends\n\
				4252	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4253
				4254	static PyObject*
				4255	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4256	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4257	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4258
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4259	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4260	return NULL;
				4261
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4262	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4263	}
				4264
				4265	static
				4266	PyObject unicode_str(PyUnicodeObject self)
				4267	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4268	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4269	}
				4270
				4271	static char strip__doc__[] =
				4272	"S.strip() -> unicode\n\
				4273	\n\
				4274	Return a copy of S with leading and trailing whitespace removed.";
				4275
				4276	static PyObject *
				4277	unicode_strip(PyUnicodeObject self, PyObject args)
				4278	{
				4279	if (!PyArg_NoArgs(args))
				4280	return NULL;
				4281	return strip(self, 1, 1);
				4282	}
				4283
				4284	static char swapcase__doc__[] =
				4285	"S.swapcase() -> unicode\n\
				4286	\n\
				4287	Return a copy of S with uppercase characters converted to lowercase\n\
				4288	and vice versa.";
				4289
				4290	static PyObject*
				4291	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4292	{
				4293	if (!PyArg_NoArgs(args))
				4294	return NULL;
				4295	return fixup(self, fixswapcase);
				4296	}
				4297
				4298	static char translate__doc__[] =
				4299	"S.translate(table) -> unicode\n\
				4300	\n\
				4301	Return a copy of the string S, where all characters have been mapped\n\
				4302	through the given translation table, which must be a mapping of\n\
				4303	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4304	are left untouched. Characters mapped to None are deleted.";
				4305
				4306	static PyObject*
				4307	unicode_translate(PyUnicodeObject self, PyObject args)
				4308	{
				4309	PyObject *table;
				4310
				4311	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4312	return NULL;
				4313	return PyUnicode_TranslateCharmap(self->str,
				4314	self->length,
				4315	table,
				4316	"ignore");
				4317	}
				4318
				4319	static char upper__doc__[] =
				4320	"S.upper() -> unicode\n\
				4321	\n\
				4322	Return a copy of S converted to uppercase.";
				4323
				4324	static PyObject*
				4325	unicode_upper(PyUnicodeObject self, PyObject args)
				4326	{
				4327	if (!PyArg_NoArgs(args))
				4328	return NULL;
				4329	return fixup(self, fixupper);
				4330	}
				4331
				4332	#if 0
				4333	static char zfill__doc__[] =
				4334	"S.zfill(width) -> unicode\n\
				4335	\n\
				4336	Pad a numeric string x with zeros on the left, to fill a field\n\
				4337	of the specified width. The string x is never truncated.";
				4338
				4339	static PyObject *
				4340	unicode_zfill(PyUnicodeObject self, PyObject args)
				4341	{
				4342	int fill;
				4343	PyUnicodeObject *u;
				4344
				4345	int width;
				4346	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4347	return NULL;
				4348
				4349	if (self->length >= width) {
				4350	Py_INCREF(self);
				4351	return (PyObject*) self;
				4352	}
				4353
				4354	fill = width - self->length;
				4355
				4356	u = pad(self, fill, 0, '0');
				4357
				4358	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4359	/* move sign to beginning of string */
				4360	u->str[0] = u->str[fill];
				4361	u->str[fill] = '0';
				4362	}
				4363
				4364	return (PyObject*) u;
				4365	}
				4366	#endif
				4367
				4368	#if 0
				4369	static PyObject*
				4370	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4371	{
				4372	if (!PyArg_NoArgs(args))
				4373	return NULL;
				4374	return PyInt_FromLong(unicode_freelist_size);
				4375	}
				4376	#endif
				4377
				4378	static char startswith__doc__[] =
				4379	"S.startswith(prefix[, start[, end]]) -> int\n\
				4380	\n\
				4381	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4382	optional start, test S beginning at that position. With optional end, stop\n\
				4383	comparing S at that position.";
				4384
				4385	static PyObject *
				4386	unicode_startswith(PyUnicodeObject *self,
				4387	PyObject *args)
				4388	{
				4389	PyUnicodeObject *substring;
				4390	int start = 0;
				4391	int end = INT_MAX;
				4392	PyObject *result;
				4393
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4394	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4395	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4396	return NULL;
				4397	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4398	(PyObject *)substring);
				4399	if (substring == NULL)
				4400	return NULL;
				4401
				4402	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4403
				4404	Py_DECREF(substring);
				4405	return result;
				4406	}
				4407
				4408
				4409	static char endswith__doc__[] =
				4410	"S.endswith(suffix[, start[, end]]) -> int\n\
				4411	\n\
				4412	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4413	optional start, test S beginning at that position. With optional end, stop\n\
				4414	comparing S at that position.";
				4415
				4416	static PyObject *
				4417	unicode_endswith(PyUnicodeObject *self,
				4418	PyObject *args)
				4419	{
				4420	PyUnicodeObject *substring;
				4421	int start = 0;
				4422	int end = INT_MAX;
				4423	PyObject *result;
				4424
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4425	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4426	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4427	return NULL;
				4428	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4429	(PyObject *)substring);
				4430	if (substring == NULL)
				4431	return NULL;
				4432
				4433	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4434
				4435	Py_DECREF(substring);
				4436	return result;
				4437	}
				4438
				4439
				4440	static PyMethodDef unicode_methods[] = {
				4441
				4442	/* Order is according to common usage: often used methods should
				4443	appear first, since lookup is done sequentially. */
				4444
				4445	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4446	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4447	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4448	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4449	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4450	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4451	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4452	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4453	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4454	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4455	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4456	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4457	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4458	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4459	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4460	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4461	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4462	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4463	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4464	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4465	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4466	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4467	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4468	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4469	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4470	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4471	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4472	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4473	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4474	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4475	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4476	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4477	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4478	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4479	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4480	#if 0
				4481	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4482	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4483	#endif
				4484
				4485	#if 0
				4486	/* This one is just used for debugging the implementation. */
				4487	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4488	#endif
				4489
				4490	{NULL, NULL}
				4491	};
				4492
				4493	static PyObject *
				4494	unicode_getattr(PyUnicodeObject self, char name)
				4495	{
				4496	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4497	}
				4498
				4499	static PySequenceMethods unicode_as_sequence = {
				4500	(inquiry) unicode_length, /* sq_length */
				4501	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4502	(intargfunc) unicode_repeat, /* sq_repeat */
				4503	(intargfunc) unicode_getitem, /* sq_item */
				4504	(intintargfunc) unicode_slice, /* sq_slice */
				4505	0, /* sq_ass_item */
				4506	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4507	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4508	};
				4509
				4510	static int
				4511	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4512	int index,
				4513	const void **ptr)
				4514	{
				4515	if (index != 0) {
				4516	PyErr_SetString(PyExc_SystemError,
				4517	"accessing non-existent unicode segment");
				4518	return -1;
				4519	}
				4520	ptr = (void ) self->str;
				4521	return PyUnicode_GET_DATA_SIZE(self);
				4522	}
				4523
				4524	static int
				4525	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4526	const void **ptr)
				4527	{
				4528	PyErr_SetString(PyExc_TypeError,
				4529	"cannot use unicode as modifyable buffer");
				4530	return -1;
				4531	}
				4532
				4533	static int
				4534	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4535	int *lenp)
				4536	{
				4537	if (lenp)
				4538	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4539	return 1;
				4540	}
				4541
				4542	static int
				4543	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4544	int index,
				4545	const void **ptr)
				4546	{
				4547	PyObject *str;
				4548
				4549	if (index != 0) {
				4550	PyErr_SetString(PyExc_SystemError,
				4551	"accessing non-existent unicode segment");
				4552	return -1;
				4553	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4554	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4555	if (str == NULL)
				4556	return -1;
				4557	ptr = (void ) PyString_AS_STRING(str);
				4558	return PyString_GET_SIZE(str);
				4559	}
				4560
				4561	/* Helpers for PyUnicode_Format() */
				4562
				4563	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4564	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4565	{
				4566	int argidx = *p_argidx;
				4567	if (argidx < arglen) {
				4568	(*p_argidx)++;
				4569	if (arglen < 0)
				4570	return args;
				4571	else
				4572	return PyTuple_GetItem(args, argidx);
				4573	}
				4574	PyErr_SetString(PyExc_TypeError,
				4575	"not enough arguments for format string");
				4576	return NULL;
				4577	}
				4578
				4579	#define F_LJUST (1<<0)
				4580	#define F_SIGN (1<<1)
				4581	#define F_BLANK (1<<2)
				4582	#define F_ALT (1<<3)
				4583	#define F_ZERO (1<<4)
				4584
				4585	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4586	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4587	{
				4588	register int i;
				4589	int len;
				4590	va_list va;
				4591	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4592	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4593
				4594	/* First, format the string as char array, then expand to Py_UNICODE
				4595	array. */
				4596	charbuffer = (char *)buffer;
				4597	len = vsprintf(charbuffer, format, va);
				4598	for (i = len - 1; i >= 0; i--)
				4599	buffer[i] = (Py_UNICODE) charbuffer[i];
				4600
				4601	va_end(va);
				4602	return len;
				4603	}
				4604
				4605	static int
				4606	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4607	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4608	int flags,
				4609	int prec,
				4610	int type,
				4611	PyObject *v)
				4612	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4613	/* fmt = '%#.' + `prec` + `type`
				4614	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4615	char fmt[20];
				4616	double x;
				4617
				4618	x = PyFloat_AsDouble(v);
				4619	if (x == -1.0 && PyErr_Occurred())
				4620	return -1;
				4621	if (prec < 0)
				4622	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4623	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4624	type = 'g';
				4625	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4626	/* worst case length calc to ensure no buffer overrun:
				4627	fmt = %#.<prec>g
				4628	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4629	for any double rep.)
				4630	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4631	If prec=0 the effective precision is 1 (the leading digit is
				4632	always given), therefore increase by one to 10+prec. */
				4633	if (buflen <= (size_t)10 + (size_t)prec) {
				4634	PyErr_SetString(PyExc_OverflowError,
				4635	"formatted float is too long (precision too long?)");
				4636	return -1;
				4637	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4638	return usprintf(buf, fmt, x);
				4639	}
				4640
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4641	static PyObject*
				4642	formatlong(PyObject *val, int flags, int prec, int type)
				4643	{
				4644	char *buf;
				4645	int i, len;
				4646	PyObject str; / temporary string object. */
				4647	PyUnicodeObject *result;
				4648
				4649	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4650	if (!str)
				4651	return NULL;
				4652	result = _PyUnicode_New(len);
				4653	for (i = 0; i < len; i++)
				4654	result->str[i] = buf[i];
				4655	result->str[len] = 0;
				4656	Py_DECREF(str);
				4657	return (PyObject*)result;
				4658	}
				4659
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4660	static int
				4661	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4662	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4663	int flags,
				4664	int prec,
				4665	int type,
				4666	PyObject *v)
				4667	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4668	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4669	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4670	+ 1 + 1 = 24*/
				4671	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4672	long x;
				4673
				4674	x = PyInt_AsLong(v);
				4675	if (x == -1 && PyErr_Occurred())
				4676	return -1;
				4677	if (prec < 0)
				4678	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4679	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4680	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4681	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4682	PyErr_SetString(PyExc_OverflowError,
				4683	"formatted integer is too long (precision too long?)");
				4684	return -1;
				4685	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4686	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4687	return usprintf(buf, fmt, x);
				4688	}
				4689
				4690	static int
				4691	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4692	size_t buflen,
				4693	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4694	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4695	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4696	if (PyUnicode_Check(v)) {
				4697	if (PyUnicode_GET_SIZE(v) != 1)
				4698	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4699	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4700	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4701
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4702	else if (PyString_Check(v)) {
				4703	if (PyString_GET_SIZE(v) != 1)
				4704	goto onError;
				4705	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4706	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4707
				4708	else {
				4709	/* Integer input truncated to a character */
				4710	long x;
				4711	x = PyInt_AsLong(v);
				4712	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4713	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4714	buf[0] = (char) x;
				4715	}
				4716	buf[1] = '\0';
				4717	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4718
				4719	onError:
				4720	PyErr_SetString(PyExc_TypeError,
				4721	"%c requires int or char");
				4722	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4723	}
				4724
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4725	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4726
				4727	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4728	chars are formatted. XXX This is a magic number. Each formatting
				4729	routine does bounds checking to ensure no overflow, but a better
				4730	solution may be to malloc a buffer of appropriate size for each
				4731	format. For now, the current solution is sufficient.
				4732	*/
				4733	#define FORMATBUFLEN (size_t)120
				4734
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4735	PyObject PyUnicode_Format(PyObject format,
				4736	PyObject *args)
				4737	{
				4738	Py_UNICODE fmt, res;
				4739	int fmtcnt, rescnt, reslen, arglen, argidx;
				4740	int args_owned = 0;
				4741	PyUnicodeObject *result = NULL;
				4742	PyObject *dict = NULL;
				4743	PyObject *uformat;
				4744
				4745	if (format == NULL \|\| args == NULL) {
				4746	PyErr_BadInternalCall();
				4747	return NULL;
				4748	}
				4749	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4750	if (uformat == NULL)
				4751	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4752	fmt = PyUnicode_AS_UNICODE(uformat);
				4753	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4754
				4755	reslen = rescnt = fmtcnt + 100;
				4756	result = _PyUnicode_New(reslen);
				4757	if (result == NULL)
				4758	goto onError;
				4759	res = PyUnicode_AS_UNICODE(result);
				4760
				4761	if (PyTuple_Check(args)) {
				4762	arglen = PyTuple_Size(args);
				4763	argidx = 0;
				4764	}
				4765	else {
				4766	arglen = -1;
				4767	argidx = -2;
				4768	}
				4769	if (args->ob_type->tp_as_mapping)
				4770	dict = args;
				4771
				4772	while (--fmtcnt >= 0) {
				4773	if (*fmt != '%') {
				4774	if (--rescnt < 0) {
				4775	rescnt = fmtcnt + 100;
				4776	reslen += rescnt;
				4777	if (_PyUnicode_Resize(result, reslen) < 0)
				4778	return NULL;
				4779	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4780	--rescnt;
				4781	}
				4782	res++ = fmt++;
				4783	}
				4784	else {
				4785	/* Got a format specifier */
				4786	int flags = 0;
				4787	int width = -1;
				4788	int prec = -1;
				4789	int size = 0;
				4790	Py_UNICODE c = '\0';
				4791	Py_UNICODE fill;
				4792	PyObject *v = NULL;
				4793	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4794	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4795	Py_UNICODE sign;
				4796	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4797	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4798
				4799	fmt++;
				4800	if (*fmt == '(') {
				4801	Py_UNICODE *keystart;
				4802	int keylen;
				4803	PyObject *key;
				4804	int pcount = 1;
				4805
				4806	if (dict == NULL) {
				4807	PyErr_SetString(PyExc_TypeError,
				4808	"format requires a mapping");
				4809	goto onError;
				4810	}
				4811	++fmt;
				4812	--fmtcnt;
				4813	keystart = fmt;
				4814	/* Skip over balanced parentheses */
				4815	while (pcount > 0 && --fmtcnt >= 0) {
				4816	if (*fmt == ')')
				4817	--pcount;
				4818	else if (*fmt == '(')
				4819	++pcount;
				4820	fmt++;
				4821	}
				4822	keylen = fmt - keystart - 1;
				4823	if (fmtcnt < 0 \|\| pcount > 0) {
				4824	PyErr_SetString(PyExc_ValueError,
				4825	"incomplete format key");
				4826	goto onError;
				4827	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4828	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4829	then looked up since Python uses strings to hold
				4830	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4831	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4832	key = PyUnicode_EncodeUTF8(keystart,
				4833	keylen,
				4834	NULL);
				4835	if (key == NULL)
				4836	goto onError;
				4837	if (args_owned) {
				4838	Py_DECREF(args);
				4839	args_owned = 0;
				4840	}
				4841	args = PyObject_GetItem(dict, key);
				4842	Py_DECREF(key);
				4843	if (args == NULL) {
				4844	goto onError;
				4845	}
				4846	args_owned = 1;
				4847	arglen = -1;
				4848	argidx = -2;
				4849	}
				4850	while (--fmtcnt >= 0) {
				4851	switch (c = *fmt++) {
				4852	case '-': flags \|= F_LJUST; continue;
				4853	case '+': flags \|= F_SIGN; continue;
				4854	case ' ': flags \|= F_BLANK; continue;
				4855	case '#': flags \|= F_ALT; continue;
				4856	case '0': flags \|= F_ZERO; continue;
				4857	}
				4858	break;
				4859	}
				4860	if (c == '*') {
				4861	v = getnextarg(args, arglen, &argidx);
				4862	if (v == NULL)
				4863	goto onError;
				4864	if (!PyInt_Check(v)) {
				4865	PyErr_SetString(PyExc_TypeError,
				4866	"* wants int");
				4867	goto onError;
				4868	}
				4869	width = PyInt_AsLong(v);
				4870	if (width < 0) {
				4871	flags \|= F_LJUST;
				4872	width = -width;
				4873	}
				4874	if (--fmtcnt >= 0)
				4875	c = *fmt++;
				4876	}
				4877	else if (c >= '0' && c <= '9') {
				4878	width = c - '0';
				4879	while (--fmtcnt >= 0) {
				4880	c = *fmt++;
				4881	if (c < '0' \|\| c > '9')
				4882	break;
				4883	if ((width*10) / 10 != width) {
				4884	PyErr_SetString(PyExc_ValueError,
				4885	"width too big");
				4886	goto onError;
				4887	}
				4888	width = width*10 + (c - '0');
				4889	}
				4890	}
				4891	if (c == '.') {
				4892	prec = 0;
				4893	if (--fmtcnt >= 0)
				4894	c = *fmt++;
				4895	if (c == '*') {
				4896	v = getnextarg(args, arglen, &argidx);
				4897	if (v == NULL)
				4898	goto onError;
				4899	if (!PyInt_Check(v)) {
				4900	PyErr_SetString(PyExc_TypeError,
				4901	"* wants int");
				4902	goto onError;
				4903	}
				4904	prec = PyInt_AsLong(v);
				4905	if (prec < 0)
				4906	prec = 0;
				4907	if (--fmtcnt >= 0)
				4908	c = *fmt++;
				4909	}
				4910	else if (c >= '0' && c <= '9') {
				4911	prec = c - '0';
				4912	while (--fmtcnt >= 0) {
				4913	c = Py_CHARMASK(*fmt++);
				4914	if (c < '0' \|\| c > '9')
				4915	break;
				4916	if ((prec*10) / 10 != prec) {
				4917	PyErr_SetString(PyExc_ValueError,
				4918	"prec too big");
				4919	goto onError;
				4920	}
				4921	prec = prec*10 + (c - '0');
				4922	}
				4923	}
				4924	} /* prec */
				4925	if (fmtcnt >= 0) {
				4926	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4927	size = c;
				4928	if (--fmtcnt >= 0)
				4929	c = *fmt++;
				4930	}
				4931	}
				4932	if (fmtcnt < 0) {
				4933	PyErr_SetString(PyExc_ValueError,
				4934	"incomplete format");
				4935	goto onError;
				4936	}
				4937	if (c != '%') {
				4938	v = getnextarg(args, arglen, &argidx);
				4939	if (v == NULL)
				4940	goto onError;
				4941	}
				4942	sign = 0;
				4943	fill = ' ';
				4944	switch (c) {
				4945
				4946	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4947	pbuf = formatbuf;
				4948	/* presume that buffer length is at least 1 */
				4949	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4950	len = 1;
				4951	break;
				4952
				4953	case 's':
				4954	case 'r':
				4955	if (PyUnicode_Check(v) && c == 's') {
				4956	temp = v;
				4957	Py_INCREF(temp);
				4958	}
				4959	else {
				4960	PyObject *unicode;
				4961	if (c == 's')
				4962	temp = PyObject_Str(v);
				4963	else
				4964	temp = PyObject_Repr(v);
				4965	if (temp == NULL)
				4966	goto onError;
				4967	if (!PyString_Check(temp)) {
				4968	/* XXX Note: this should never happen, since
				4969	PyObject_Repr() and PyObject_Str() assure
				4970	this */
				4971	Py_DECREF(temp);
				4972	PyErr_SetString(PyExc_TypeError,
				4973	"%s argument has non-string str()");
				4974	goto onError;
				4975	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4976	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4977	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4978	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4979	"strict");
				4980	Py_DECREF(temp);
				4981	temp = unicode;
				4982	if (temp == NULL)
				4983	goto onError;
				4984	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4985	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4986	len = PyUnicode_GET_SIZE(temp);
				4987	if (prec >= 0 && len > prec)
				4988	len = prec;
				4989	break;
				4990
				4991	case 'i':
				4992	case 'd':
				4993	case 'u':
				4994	case 'o':
				4995	case 'x':
				4996	case 'X':
				4997	if (c == 'i')
				4998	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	4999	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5000	temp = formatlong(v, flags, prec, c);
				5001	if (!temp)
				5002	goto onError;
				5003	pbuf = PyUnicode_AS_UNICODE(temp);
				5004	len = PyUnicode_GET_SIZE(temp);
				5005	/* unbounded ints can always produce
				5006	a sign character! */
				5007	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5008	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5009	else {
				5010	pbuf = formatbuf;
				5011	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5012	flags, prec, c, v);
				5013	if (len < 0)
				5014	goto onError;
				5015	/* only d conversion is signed */
				5016	sign = c == 'd';
				5017	}
				5018	if (flags & F_ZERO)
				5019	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5020	break;
				5021
				5022	case 'e':
				5023	case 'E':
				5024	case 'f':
				5025	case 'g':
				5026	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5027	pbuf = formatbuf;
				5028	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5029	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5030	if (len < 0)
				5031	goto onError;
				5032	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5033	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5034	fill = '0';
				5035	break;
				5036
				5037	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5038	pbuf = formatbuf;
				5039	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5040	if (len < 0)
				5041	goto onError;
				5042	break;
				5043
				5044	default:
				5045	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5046	"unsupported format character '%c' (0x%x) "
				5047	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5048	(31<=c && c<=126) ? c : '?',
				5049	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5050	goto onError;
				5051	}
				5052	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5053	if (pbuf == '-' \|\| pbuf == '+') {
				5054	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5055	len--;
				5056	}
				5057	else if (flags & F_SIGN)
				5058	sign = '+';
				5059	else if (flags & F_BLANK)
				5060	sign = ' ';
				5061	else
				5062	sign = 0;
				5063	}
				5064	if (width < len)
				5065	width = len;
				5066	if (rescnt < width + (sign != 0)) {
				5067	reslen -= rescnt;
				5068	rescnt = width + fmtcnt + 100;
				5069	reslen += rescnt;
				5070	if (_PyUnicode_Resize(result, reslen) < 0)
				5071	return NULL;
				5072	res = PyUnicode_AS_UNICODE(result)
				5073	+ reslen - rescnt;
				5074	}
				5075	if (sign) {
				5076	if (fill != ' ')
				5077	*res++ = sign;
				5078	rescnt--;
				5079	if (width > len)
				5080	width--;
				5081	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5082	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5083	assert(pbuf[0] == '0');
				5084	assert(pbuf[1] == c);
				5085	if (fill != ' ') {
				5086	res++ = pbuf++;
				5087	res++ = pbuf++;
				5088	}
				5089	rescnt -= 2;
				5090	width -= 2;
				5091	if (width < 0)
				5092	width = 0;
				5093	len -= 2;
				5094	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5095	if (width > len && !(flags & F_LJUST)) {
				5096	do {
				5097	--rescnt;
				5098	*res++ = fill;
				5099	} while (--width > len);
				5100	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5101	if (fill == ' ') {
				5102	if (sign)
				5103	*res++ = sign;
				5104	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5105	assert(pbuf[0] == '0');
				5106	assert(pbuf[1] == c);
				5107	res++ = pbuf++;
				5108	res++ = pbuf++;
				5109	}
				5110	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5111	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5112	res += len;
				5113	rescnt -= len;
				5114	while (--width >= len) {
				5115	--rescnt;
				5116	*res++ = ' ';
				5117	}
				5118	if (dict && (argidx < arglen) && c != '%') {
				5119	PyErr_SetString(PyExc_TypeError,
				5120	"not all arguments converted");
				5121	goto onError;
				5122	}
				5123	Py_XDECREF(temp);
				5124	} /* '%' */
				5125	} /* until end */
				5126	if (argidx < arglen && !dict) {
				5127	PyErr_SetString(PyExc_TypeError,
				5128	"not all arguments converted");
				5129	goto onError;
				5130	}
				5131
				5132	if (args_owned) {
				5133	Py_DECREF(args);
				5134	}
				5135	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5136	if (_PyUnicode_Resize(result, reslen - rescnt))
				5137	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5138	return (PyObject *)result;
				5139
				5140	onError:
				5141	Py_XDECREF(result);
				5142	Py_DECREF(uformat);
				5143	if (args_owned) {
				5144	Py_DECREF(args);
				5145	}
				5146	return NULL;
				5147	}
				5148
				5149	static PyBufferProcs unicode_as_buffer = {
				5150	(getreadbufferproc) unicode_buffer_getreadbuf,
				5151	(getwritebufferproc) unicode_buffer_getwritebuf,
				5152	(getsegcountproc) unicode_buffer_getsegcount,
				5153	(getcharbufferproc) unicode_buffer_getcharbuf,
				5154	};
				5155
				5156	PyTypeObject PyUnicode_Type = {
				5157	PyObject_HEAD_INIT(&PyType_Type)
				5158	0, /* ob_size */
				5159	"unicode", /* tp_name */
				5160	sizeof(PyUnicodeObject), /* tp_size */
				5161	0, /* tp_itemsize */
				5162	/* Slots */
				5163	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5164	0, /* tp_print */
				5165	(getattrfunc)unicode_getattr, /* tp_getattr */
				5166	0, /* tp_setattr */
				5167	(cmpfunc) unicode_compare, /* tp_compare */
				5168	(reprfunc) unicode_repr, /* tp_repr */
				5169	0, /* tp_as_number */
				5170	&unicode_as_sequence, /* tp_as_sequence */
				5171	0, /* tp_as_mapping */
				5172	(hashfunc) unicode_hash, /* tp_hash*/
				5173	0, /* tp_call*/
				5174	(reprfunc) unicode_str, /* tp_str */
				5175	(getattrofunc) NULL, /* tp_getattro */
				5176	(setattrofunc) NULL, /* tp_setattro */
				5177	&unicode_as_buffer, /* tp_as_buffer */
				5178	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5179	};
				5180
				5181	/* Initialize the Unicode implementation */
				5182
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5183	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5184	{
				5185	/* Doublecheck the configuration... */
				5186	if (sizeof(Py_UNICODE) != 2)
				5187	Py_FatalError("Unicode configuration error: "
				5188	"sizeof(Py_UNICODE) != 2 bytes");
				5189
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5190	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5191	unicode_freelist = NULL;
				5192	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5193	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5194	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5195	}
				5196
				5197	/* Finalize the Unicode implementation */
				5198
				5199	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5200	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5201	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5202	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5203
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5204	Py_XDECREF(unicode_empty);
				5205	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5206
				5207	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5208	PyUnicodeObject *v = u;
				5209	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5210	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5211	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5212	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5213	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5214	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5215	unicode_freelist = NULL;
				5216	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5217	}