Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 585afe6364739792c8372c64040d714969646dc8 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
				86	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	88
				89	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	90	static PyUnicodeObject *unicode_freelist;
				91	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	93	/* Default encoding to use and assume when NULL is passed as encoding
				94	parameter; it is initialized by _PyUnicode_Init().
				95
				96	Always use the PyUnicode_SetDefaultEncoding() and
				97	PyUnicode_GetDefaultEncoding() APIs to access this global.
				98
				99	*/
				100
				101	static char unicode_default_encoding[100];
				102
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103	/* --- Unicode Object ----------------------------------------------------- */
				104
				105	static
				106	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				107	int length)
				108	{
				109	void *oldstr;
				110
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	111	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	112	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	113	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Resizing unicode_empty is not allowed. */
				116	if (unicode == unicode_empty) {
				117	PyErr_SetString(PyExc_SystemError,
				118	"can't resize empty unicode object");
				119	return -1;
				120	}
				121
				122	/* We allocate one more byte to make sure the string is
				123	Ux0000 terminated -- XXX is this needed ? */
				124	oldstr = unicode->str;
				125	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				126	if (!unicode->str) {
				127	unicode->str = oldstr;
				128	PyErr_NoMemory();
				129	return -1;
				130	}
				131	unicode->str[length] = 0;
				132	unicode->length = length;
				133
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	134	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	136	if (unicode->defenc) {
				137	Py_DECREF(unicode->defenc);
				138	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	}
				140	unicode->hash = -1;
				141
				142	return 0;
				143	}
				144
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	int PyUnicode_Resize(PyObject **unicode,
				146	int length)
				147	{
				148	PyUnicodeObject *v;
				149
				150	if (unicode == NULL) {
				151	PyErr_BadInternalCall();
				152	return -1;
				153	}
				154	v = (PyUnicodeObject )unicode;
				155	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				156	PyErr_BadInternalCall();
				157	return -1;
				158	}
				159	return _PyUnicode_Resize(v, length);
				160	}
				161
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	162	/* We allocate one more byte to make sure the string is
				163	Ux0000 terminated -- XXX is this needed ?
				164
				165	XXX This allocator could further be enhanced by assuring that the
				166	free list never reduces its size below 1.
				167
				168	*/
				169
				170	static
				171	PyUnicodeObject *_PyUnicode_New(int length)
				172	{
				173	register PyUnicodeObject *unicode;
				174
				175	/* Optimization for empty strings */
				176	if (length == 0 && unicode_empty != NULL) {
				177	Py_INCREF(unicode_empty);
				178	return unicode_empty;
				179	}
				180
				181	/* Unicode freelist & memory allocation */
				182	if (unicode_freelist) {
				183	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	184	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	185	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	186	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	187	/* Keep-Alive optimization: we only upsize the buffer,
				188	never downsize it. */
				189	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	190	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	191	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	192	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	193	}
				194	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	195	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	197	}
				198	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	}
				200	else {
				201	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				202	if (unicode == NULL)
				203	return NULL;
				204	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				205	}
				206
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	207	if (!unicode->str) {
				208	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	209	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode->str[length] = 0;
				212	unicode->length = length;
				213	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	214	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	215	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	216
				217	onError:
				218	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	219	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	220	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	}
				222
				223	static
				224	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				225	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	227	/* Keep-Alive optimization */
				228	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	229	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	230	unicode->str = NULL;
				231	unicode->length = 0;
				232	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	233	if (unicode->defenc) {
				234	Py_DECREF(unicode->defenc);
				235	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	236	}
				237	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	(PyUnicodeObject *)unicode = unicode_freelist;
				239	unicode_freelist = unicode;
				240	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	}
				242	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	243	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	244	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	}
				247	}
				248
				249	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				250	int size)
				251	{
				252	PyUnicodeObject *unicode;
				253
				254	unicode = _PyUnicode_New(size);
				255	if (!unicode)
				256	return NULL;
				257
				258	/* Copy the Unicode data into the new object */
				259	if (u != NULL)
				260	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				261
				262	return (PyObject *)unicode;
				263	}
				264
				265	#ifdef HAVE_WCHAR_H
				266
				267	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				268	int size)
				269	{
				270	PyUnicodeObject *unicode;
				271
				272	if (w == NULL) {
				273	PyErr_BadInternalCall();
				274	return NULL;
				275	}
				276
				277	unicode = _PyUnicode_New(size);
				278	if (!unicode)
				279	return NULL;
				280
				281	/* Copy the wchar_t data into the new object */
				282	#ifdef HAVE_USABLE_WCHAR_T
				283	memcpy(unicode->str, w, size * sizeof(wchar_t));
				284	#else
				285	{
				286	register Py_UNICODE *u;
				287	register int i;
				288	u = PyUnicode_AS_UNICODE(unicode);
				289	for (i = size; i >= 0; i--)
				290	u++ = w++;
				291	}
				292	#endif
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				298	register wchar_t *w,
				299	int size)
				300	{
				301	if (unicode == NULL) {
				302	PyErr_BadInternalCall();
				303	return -1;
				304	}
				305	if (size > PyUnicode_GET_SIZE(unicode))
				306	size = PyUnicode_GET_SIZE(unicode);
				307	#ifdef HAVE_USABLE_WCHAR_T
				308	memcpy(w, unicode->str, size * sizeof(wchar_t));
				309	#else
				310	{
				311	register Py_UNICODE *u;
				312	register int i;
				313	u = PyUnicode_AS_UNICODE(unicode);
				314	for (i = size; i >= 0; i--)
				315	w++ = u++;
				316	}
				317	#endif
				318
				319	return size;
				320	}
				321
				322	#endif
				323
				324	PyObject PyUnicode_FromObject(register PyObject obj)
				325	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	326	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				327	}
				328
				329	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				330	const char *encoding,
				331	const char *errors)
				332	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333	const char *s;
				334	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	335	int owned = 0;
				336	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	337
				338	if (obj == NULL) {
				339	PyErr_BadInternalCall();
				340	return NULL;
				341	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	342
				343	/* Coerce object */
				344	if (PyInstance_Check(obj)) {
				345	PyObject *func;
				346	func = PyObject_GetAttrString(obj, "__str__");
				347	if (func == NULL) {
				348	PyErr_SetString(PyExc_TypeError,
				349	"coercing to Unicode: instance doesn't define __str__");
				350	return NULL;
				351	}
				352	obj = PyEval_CallObject(func, NULL);
				353	Py_DECREF(func);
				354	if (obj == NULL)
				355	return NULL;
				356	owned = 1;
				357	}
				358	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	360	v = obj;
				361	if (encoding) {
				362	PyErr_SetString(PyExc_TypeError,
				363	"decoding Unicode is not supported");
				364	return NULL;
				365	}
				366	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	376	PyErr_Format(PyExc_TypeError,
				377	"coercing to Unicode: need string or buffer, "
				378	"%.80s found",
				379	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	380	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	381	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	382
				383	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	384	if (len == 0) {
				385	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	387	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	388	else
				389	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	390
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	391	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	392	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	394	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	395	return v;
				396
				397	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	398	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	399	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	400	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	401	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	}
				403
				404	PyObject PyUnicode_Decode(const char s,
				405	int size,
				406	const char *encoding,
				407	const char *errors)
				408	{
				409	PyObject buffer = NULL, unicode;
				410
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	411	if (encoding == NULL)
				412	encoding = PyUnicode_GetDefaultEncoding();
				413
				414	/* Shortcuts for common default encodings */
				415	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	417	else if (strcmp(encoding, "latin-1") == 0)
				418	return PyUnicode_DecodeLatin1(s, size, errors);
				419	else if (strcmp(encoding, "ascii") == 0)
				420	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	421
				422	/* Decode via the codec registry */
				423	buffer = PyBuffer_FromMemory((void *)s, size);
				424	if (buffer == NULL)
				425	goto onError;
				426	unicode = PyCodec_Decode(buffer, encoding, errors);
				427	if (unicode == NULL)
				428	goto onError;
				429	if (!PyUnicode_Check(unicode)) {
				430	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	431	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	432	unicode->ob_type->tp_name);
				433	Py_DECREF(unicode);
				434	goto onError;
				435	}
				436	Py_DECREF(buffer);
				437	return unicode;
				438
				439	onError:
				440	Py_XDECREF(buffer);
				441	return NULL;
				442	}
				443
				444	PyObject PyUnicode_Encode(const Py_UNICODE s,
				445	int size,
				446	const char *encoding,
				447	const char *errors)
				448	{
				449	PyObject v, unicode;
				450
				451	unicode = PyUnicode_FromUnicode(s, size);
				452	if (unicode == NULL)
				453	return NULL;
				454	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				455	Py_DECREF(unicode);
				456	return v;
				457	}
				458
				459	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				460	const char *encoding,
				461	const char *errors)
				462	{
				463	PyObject *v;
				464
				465	if (!PyUnicode_Check(unicode)) {
				466	PyErr_BadArgument();
				467	goto onError;
				468	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	469
				470	if (encoding == NULL)
				471	encoding = PyUnicode_GetDefaultEncoding();
				472
				473	/* Shortcuts for common default encodings */
				474	if (errors == NULL) {
				475	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	477	else if (strcmp(encoding, "latin-1") == 0)
				478	return PyUnicode_AsLatin1String(unicode);
				479	else if (strcmp(encoding, "ascii") == 0)
				480	return PyUnicode_AsASCIIString(unicode);
				481	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	482
				483	/* Encode via the codec registry */
				484	v = PyCodec_Encode(unicode, encoding, errors);
				485	if (v == NULL)
				486	goto onError;
				487	/* XXX Should we really enforce this ? */
				488	if (!PyString_Check(v)) {
				489	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	490	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	v->ob_type->tp_name);
				492	Py_DECREF(v);
				493	goto onError;
				494	}
				495	return v;
				496
				497	onError:
				498	return NULL;
				499	}
				500
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	501	/* Return a Python string holding the default encoded value of the
				502	Unicode object.
				503
				504	The resulting string is cached in the Unicode object for subsequent
				505	usage by this function. The cached version is needed to implement
				506	the character buffer interface and will live (at least) as long as
				507	the Unicode object itself.
				508
				509	The refcount of the string is not incremented.
				510
				511	* Exported for internal use by the interpreter only !!! *
				512
				513	*/
				514
				515	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				516	const char *errors)
				517	{
				518	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				519
				520	if (v)
				521	return v;
				522	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				523	if (v && errors == NULL)
				524	((PyUnicodeObject *)unicode)->defenc = v;
				525	return v;
				526	}
				527
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	528	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				529	{
				530	if (!PyUnicode_Check(unicode)) {
				531	PyErr_BadArgument();
				532	goto onError;
				533	}
				534	return PyUnicode_AS_UNICODE(unicode);
				535
				536	onError:
				537	return NULL;
				538	}
				539
				540	int PyUnicode_GetSize(PyObject *unicode)
				541	{
				542	if (!PyUnicode_Check(unicode)) {
				543	PyErr_BadArgument();
				544	goto onError;
				545	}
				546	return PyUnicode_GET_SIZE(unicode);
				547
				548	onError:
				549	return -1;
				550	}
				551
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	552	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	{
				554	return unicode_default_encoding;
				555	}
				556
				557	int PyUnicode_SetDefaultEncoding(const char *encoding)
				558	{
				559	PyObject *v;
				560
				561	/* Make sure the encoding is valid. As side effect, this also
				562	loads the encoding into the codec registry cache. */
				563	v = _PyCodec_Lookup(encoding);
				564	if (v == NULL)
				565	goto onError;
				566	Py_DECREF(v);
				567	strncpy(unicode_default_encoding,
				568	encoding,
				569	sizeof(unicode_default_encoding));
				570	return 0;
				571
				572	onError:
				573	return -1;
				574	}
				575
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	576	/* --- UTF-8 Codec -------------------------------------------------------- */
				577
				578	static
				579	char utf8_code_length[256] = {
				580	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				581	illegal prefix. see RFC 2279 for details */
				582	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				583	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				584	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				585	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				586	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				591	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				592	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				593	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				594	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				595	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				596	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				597	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				598	};
				599
				600	static
				601	int utf8_decoding_error(const char **source,
				602	Py_UNICODE **dest,
				603	const char *errors,
				604	const char *details)
				605	{
				606	if ((errors == NULL) \|\|
				607	(strcmp(errors,"strict") == 0)) {
				608	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	609	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	610	details);
				611	return -1;
				612	}
				613	else if (strcmp(errors,"ignore") == 0) {
				614	(*source)++;
				615	return 0;
				616	}
				617	else if (strcmp(errors,"replace") == 0) {
				618	(*source)++;
				619	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				620	(*dest)++;
				621	return 0;
				622	}
				623	else {
				624	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	625	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	626	errors);
				627	return -1;
				628	}
				629	}
				630
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	631	PyObject PyUnicode_DecodeUTF8(const char s,
				632	int size,
				633	const char *errors)
				634	{
				635	int n;
				636	const char *e;
				637	PyUnicodeObject *unicode;
				638	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	639	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	640
				641	/* Note: size will always be longer than the resulting Unicode
				642	character count */
				643	unicode = _PyUnicode_New(size);
				644	if (!unicode)
				645	return NULL;
				646	if (size == 0)
				647	return (PyObject *)unicode;
				648
				649	/* Unpack UTF-8 encoded data */
				650	p = unicode->str;
				651	e = s + size;
				652
				653	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	654	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	655
				656	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	657	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	658	s++;
				659	continue;
				660	}
				661
				662	n = utf8_code_length[ch];
				663
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	if (s + n > e) {
				665	errmsg = "unexpected end of data";
				666	goto utf8Error;
				667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	668
				669	switch (n) {
				670
				671	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	672	errmsg = "unexpected code byte";
				673	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	674	break;
				675
				676	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	677	errmsg = "internal error";
				678	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	679	break;
				680
				681	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	682	if ((s[1] & 0xc0) != 0x80) {
				683	errmsg = "invalid data";
				684	goto utf8Error;
				685	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	687	if (ch < 0x80) {
				688	errmsg = "illegal encoding";
				689	goto utf8Error;
				690	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	692	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693	break;
				694
				695	case 3:
				696	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	(s[2] & 0xc0) != 0x80) {
				698	errmsg = "invalid data";
				699	goto utf8Error;
				700	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	701	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				703	errmsg = "illegal encoding";
				704	goto utf8Error;
				705	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	706	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	707	*p++ = (Py_UNICODE)ch;
				708	break;
				709
				710	case 4:
				711	if ((s[1] & 0xc0) != 0x80 \|\|
				712	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	(s[3] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	717	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				718	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				719	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	720	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				721	byte encoding */
				722	(ch > 0x10ffff)) { /* maximum value allowed for
				723	UTF-16 */
				724	errmsg = "illegal encoding";
				725	goto utf8Error;
				726	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	727	/* compute and append the two surrogates: */
				728
				729	/* translate from 10000..10FFFF to 0..FFFF */
				730	ch -= 0x10000;
				731
				732	/* high surrogate = top 10 bits added to D800 */
				733	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				734
				735	/* low surrogate = bottom 10 bits added to DC00 */
				736	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	break;
				738
				739	default:
				740	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	741	errmsg = "unsupported Unicode code range";
				742	goto utf8Error;
				743	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	744	}
				745	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	continue;
				747
				748	utf8Error:
				749	if (utf8_decoding_error(&s, &p, errors, errmsg))
				750	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	751	}
				752
				753	/* Adjust length */
				754	if (_PyUnicode_Resize(unicode, p - unicode->str))
				755	goto onError;
				756
				757	return (PyObject *)unicode;
				758
				759	onError:
				760	Py_DECREF(unicode);
				761	return NULL;
				762	}
				763
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error policy named by `errors` to a UTF-8 encoding failure.

   NULL or "strict" raises UnicodeError with `details` in the message,
   "ignore" does nothing, "replace" stores '?' at *dest and advances
   *dest, and any other name raises ValueError.  `source` is not used.

   Returns 0 if encoding may continue, -1 with an exception set
   otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") != 0) {
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
    /* "replace": emit a question mark in place of the bad character */
    *(*dest)++ = '?';
    return 0;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	797
				798	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				799	int size,
				800	const char *errors)
				801	{
				802	PyObject *v;
				803	char *p;
				804	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	805	Py_UCS4 ch2;
				806	unsigned int cbAllocated = 3 * size;
				807	unsigned int cbWritten = 0;
				808	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	809
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	810	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	811	if (v == NULL)
				812	return NULL;
				813	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	814	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	815
				816	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	817	while (i < size) {
				818	Py_UCS4 ch = s[i++];
				819	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	820	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	821	cbWritten++;
				822	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	823	else if (ch < 0x0800) {
				824	*p++ = 0xc0 \| (ch >> 6);
				825	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	826	cbWritten += 2;
				827	}
				828	else {
				829	/* Check for high surrogate */
				830	if (0xD800 <= ch && ch <= 0xDBFF) {
				831	if (i != size) {
				832	ch2 = s[i];
				833	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				834
				835	if (cbWritten >= (cbAllocated - 4)) {
				836	/* Provide enough room for some more
				837	surrogates */
				838	cbAllocated += 4*10;
				839	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	840	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	}
				842
				843	/* combine the two values */
				844	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				845
				846	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	847	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	i++;
				849	cbWritten += 4;
				850	}
				851	}
				852	}
				853	else {
				854	*p++ = (char)(0xe0 \| (ch >> 12));
				855	cbWritten += 3;
				856	}
				857	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				858	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	859	}
				860	}
				861	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	862	if (_PyString_Resize(&v, p - q))
				863	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	864	return v;
				865
				866	onError:
				867	Py_DECREF(v);
				868	return NULL;
				869	}
				870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	871	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				872	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	873	if (!PyUnicode_Check(unicode)) {
				874	PyErr_BadArgument();
				875	return NULL;
				876	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	877	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				878	PyUnicode_GET_SIZE(unicode),
				879	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	880	}
				881
				882	/* --- UTF-16 Codec ------------------------------------------------------- */
				883
				884	static
				885	int utf16_decoding_error(const Py_UNICODE **source,
				886	Py_UNICODE **dest,
				887	const char *errors,
				888	const char *details)
				889	{
				890	if ((errors == NULL) \|\|
				891	(strcmp(errors,"strict") == 0)) {
				892	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	893	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	894	details);
				895	return -1;
				896	}
				897	else if (strcmp(errors,"ignore") == 0) {
				898	return 0;
				899	}
				900	else if (strcmp(errors,"replace") == 0) {
				901	if (dest) {
				902	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				903	(*dest)++;
				904	}
				905	return 0;
				906	}
				907	else {
				908	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	909	"UTF-16 decoding error; "
				910	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	911	errors);
				912	return -1;
				913	}
				914	}
				915
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	916	PyObject PyUnicode_DecodeUTF16(const char s,
				917	int size,
				918	const char *errors,
				919	int *byteorder)
				920	{
				921	PyUnicodeObject *unicode;
				922	Py_UNICODE *p;
				923	const Py_UNICODE q, e;
				924	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	925	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	926
				927	/* size should be an even number */
				928	if (size % sizeof(Py_UNICODE) != 0) {
				929	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				930	return NULL;
				931	/* The remaining input chars are ignored if we fall through
				932	here... */
				933	}
				934
				935	/* Note: size will always be longer than the resulting Unicode
				936	character count */
				937	unicode = _PyUnicode_New(size);
				938	if (!unicode)
				939	return NULL;
				940	if (size == 0)
				941	return (PyObject *)unicode;
				942
				943	/* Unpack UTF-16 encoded data */
				944	p = unicode->str;
				945	q = (Py_UNICODE *)s;
				946	e = q + (size / sizeof(Py_UNICODE));
				947
				948	if (byteorder)
				949	bo = *byteorder;
				950
				951	while (q < e) {
				952	register Py_UNICODE ch = *q++;
				953
				954	/* Check for BOM marks (U+FEFF) in the input and adjust
				955	current byte order setting accordingly. Swap input
				956	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				957	!) */
				958	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				959	if (ch == 0xFEFF) {
				960	bo = -1;
				961	continue;
				962	} else if (ch == 0xFFFE) {
				963	bo = 1;
				964	continue;
				965	}
				966	if (bo == 1)
				967	ch = (ch >> 8) \| (ch << 8);
				968	#else
				969	if (ch == 0xFEFF) {
				970	bo = 1;
				971	continue;
				972	} else if (ch == 0xFFFE) {
				973	bo = -1;
				974	continue;
				975	}
				976	if (bo == -1)
				977	ch = (ch >> 8) \| (ch << 8);
				978	#endif
				979	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				980	*p++ = ch;
				981	continue;
				982	}
				983
				984	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	985	if (q >= e) {
				986	errmsg = "unexpected end of data";
				987	goto utf16Error;
				988	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	989	if (0xDC00 <= q && q <= 0xDFFF) {
				990	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	991	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	/* This is valid data (a UTF-16 surrogate pair), but
				993	we are not able to store this information since our
				994	Py_UNICODE type only has 16 bits... this might
				995	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	996	errmsg = "code pairs are not supported";
				997	goto utf16Error;
				998	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	else
				1000	continue;
				1001	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1002	errmsg = "illegal encoding";
				1003	/* Fall through to report the error */
				1004
				1005	utf16Error:
				1006	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1007	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008	}
				1009
				1010	if (byteorder)
				1011	*byteorder = bo;
				1012
				1013	/* Adjust length */
				1014	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1015	goto onError;
				1016
				1017	return (PyObject *)unicode;
				1018
				1019	onError:
				1020	Py_DECREF(unicode);
				1021	return NULL;
				1022	}
				1023
				1024	#undef UTF16_ERROR
				1025
				1026	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1027	int size,
				1028	const char *errors,
				1029	int byteorder)
				1030	{
				1031	PyObject *v;
				1032	Py_UNICODE *p;
				1033	char *q;
				1034
				1035	/* We don't create UTF-16 pairs... */
				1036	v = PyString_FromStringAndSize(NULL,
				1037	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1038	if (v == NULL)
				1039	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1040
				1041	q = PyString_AS_STRING(v);
				1042	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043	if (byteorder == 0)
				1044	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1045	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1046	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1047	if (byteorder == 0 \|\|
				1048	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1049	byteorder == -1
				1050	#else
				1051	byteorder == 1
				1052	#endif
				1053	)
				1054	memcpy(p, s, size * sizeof(Py_UNICODE));
				1055	else
				1056	while (size-- > 0) {
				1057	Py_UNICODE ch = *s++;
				1058	*p++ = (ch >> 8) \| (ch << 8);
				1059	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	return v;
				1061	}
				1062
				1063	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1064	{
				1065	if (!PyUnicode_Check(unicode)) {
				1066	PyErr_BadArgument();
				1067	return NULL;
				1068	}
				1069	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1070	PyUnicode_GET_SIZE(unicode),
				1071	NULL,
				1072	0);
				1073	}
				1074
				1075	/* --- Unicode Escape Codec ----------------------------------------------- */
				1076
				1077	static
				1078	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1079	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	const char *errors,
				1081	const char *details)
				1082	{
				1083	if ((errors == NULL) \|\|
				1084	(strcmp(errors,"strict") == 0)) {
				1085	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1086	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	details);
				1088	return -1;
				1089	}
				1090	else if (strcmp(errors,"ignore") == 0) {
				1091	return 0;
				1092	}
				1093	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1094	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1095	return 0;
				1096	}
				1097	else {
				1098	PyErr_Format(PyExc_ValueError,
				1099	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1100	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1101	errors);
				1102	return -1;
				1103	}
				1104	}
				1105
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1106	static _PyUnicode_Name_CAPI *unicode_names = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1107
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1109	int size,
				1110	const char *errors)
				1111	{
				1112	PyUnicodeObject *v;
				1113	Py_UNICODE p = NULL, buf = NULL;
				1114	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1115	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1116
				1117	/* Escaped strings will always be longer than the resulting
				1118	Unicode string, so we start with size here and then reduce the
				1119	length after conversion to the true value. */
				1120	v = _PyUnicode_New(size);
				1121	if (v == NULL)
				1122	goto onError;
				1123	if (size == 0)
				1124	return (PyObject *)v;
				1125	p = buf = PyUnicode_AS_UNICODE(v);
				1126	end = s + size;
				1127	while (s < end) {
				1128	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1129	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130	int i;
				1131
				1132	/* Non-escape characters are interpreted as Unicode ordinals */
				1133	if (*s != '\\') {
				1134	p++ = (unsigned char)s++;
				1135	continue;
				1136	}
				1137
				1138	/* \ - Escapes */
				1139	s++;
				1140	switch (*s++) {
				1141
				1142	/* \x escapes */
				1143	case '\n': break;
				1144	case '\\': *p++ = '\\'; break;
				1145	case '\'': *p++ = '\''; break;
				1146	case '\"': *p++ = '\"'; break;
				1147	case 'b': *p++ = '\b'; break;
				1148	case 'f': p++ = '\014'; break; / FF */
				1149	case 't': *p++ = '\t'; break;
				1150	case 'n': *p++ = '\n'; break;
				1151	case 'r': *p++ = '\r'; break;
				1152	case 'v': p++ = '\013'; break; / VT */
				1153	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1154
				1155	/* \OOO (octal) escapes */
				1156	case '0': case '1': case '2': case '3':
				1157	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1158	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1160	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1162	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1163	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1164	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1165	break;
				1166
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1167	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1168	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1169	for (x = 0, i = 0; i < 2; i++) {
				1170	c = (unsigned char)s[i];
				1171	if (!isxdigit(c)) {
				1172	if (unicodeescape_decoding_error(&s, &x, errors,
				1173	"truncated \\xXX"))
				1174	goto onError;
				1175	i++;
				1176	break;
				1177	}
				1178	x = (x<<4) & ~0xF;
				1179	if (c >= '0' && c <= '9')
				1180	x += c - '0';
				1181	else if (c >= 'a' && c <= 'f')
				1182	x += 10 + c - 'a';
				1183	else
				1184	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1185	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1186	s += i;
				1187	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1188	break;
				1189
				1190	/* \uXXXX with 4 hex digits */
				1191	case 'u':
				1192	for (x = 0, i = 0; i < 4; i++) {
				1193	c = (unsigned char)s[i];
				1194	if (!isxdigit(c)) {
				1195	if (unicodeescape_decoding_error(&s, &x, errors,
				1196	"truncated \\uXXXX"))
				1197	goto onError;
				1198	i++;
				1199	break;
				1200	}
				1201	x = (x<<4) & ~0xF;
				1202	if (c >= '0' && c <= '9')
				1203	x += c - '0';
				1204	else if (c >= 'a' && c <= 'f')
				1205	x += 10 + c - 'a';
				1206	else
				1207	x += 10 + c - 'A';
				1208	}
				1209	s += i;
				1210	*p++ = x;
				1211	break;
				1212
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1213	/* \UXXXXXXXX with 8 hex digits */
				1214	case 'U':
				1215	for (chr = 0, i = 0; i < 8; i++) {
				1216	c = (unsigned char)s[i];
				1217	if (!isxdigit(c)) {
				1218	if (unicodeescape_decoding_error(&s, &x, errors,
				1219	"truncated \\uXXXX"))
				1220	goto onError;
				1221	i++;
				1222	break;
				1223	}
				1224	chr = (chr<<4) & ~0xF;
				1225	if (c >= '0' && c <= '9')
				1226	chr += c - '0';
				1227	else if (c >= 'a' && c <= 'f')
				1228	chr += 10 + c - 'a';
				1229	else
				1230	chr += 10 + c - 'A';
				1231	}
				1232	s += i;
				1233	goto store;
				1234
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1235	case 'N':
				1236	/* Ok, we need to deal with Unicode Character Names now,
				1237	* make sure we've imported the hash table data...
				1238	*/
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1239	if (unicode_names == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1240	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1241	mod = PyImport_ImportModule("ucnhash");
				1242	if (mod == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1243	goto ucnhashError;
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1244	v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1245	Py_DECREF(mod);
				1246	if (v == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1247	goto ucnhashError;
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1248	unicode_names = PyCObject_AsVoidPtr(v);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1249	Py_DECREF(v);
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1250	if (unicode_names == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1251	goto ucnhashError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1252	}
				1253
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1254	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1255	const char *start = s + 1;
				1256	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1257
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1258	/* look for the closing brace */
				1259	while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1260	endBrace++;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1261	if (endBrace != end && *endBrace == '}') {
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1262	if (!unicode_names->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1263	if (unicodeescape_decoding_error(
				1264	&s, &x, errors,
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1265	"Invalid Unicode Character Name")
				1266	)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1267	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1268	goto ucnFallthrough;
				1269	}
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1270	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1271	goto store;
				1272	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1273	if (unicodeescape_decoding_error(
				1274	&s, &x, errors,
				1275	"Unicode name missing closing brace"))
				1276	goto onError;
				1277	goto ucnFallthrough;
				1278	}
				1279	break;
				1280	}
				1281	if (unicodeescape_decoding_error(
				1282	&s, &x, errors,
				1283	"Missing opening brace for Unicode Character Name escape"))
				1284	goto onError;
				1285	ucnFallthrough:
				1286	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1287	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	*p++ = '\\';
				1289	*p++ = (unsigned char)s[-1];
				1290	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1291	store:
				1292	/* when we get here, chr is a 32-bit unicode character */
				1293	if (chr <= 0xffff)
				1294	/* UCS-2 character */
				1295	*p++ = (Py_UNICODE) chr;
				1296	else if (chr <= 0x10ffff) {
				1297	/* UCS-4 character. store as two surrogate characters */
				1298	chr -= 0x10000L;
				1299	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1300	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1301	} else {
				1302	if (unicodeescape_decoding_error(
				1303	&s, &x, errors,
				1304	"Illegal Unicode character")
				1305	)
				1306	goto onError;
				1307	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1308	}
				1309	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1310	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1311	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1312	return (PyObject *)v;
				1313
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1314	ucnhashError:
				1315	PyErr_SetString(PyExc_UnicodeError,
				1316	"\\N escapes not supported (can't load ucnhash module)");
				1317	return NULL;
				1318
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1319	onError:
				1320	Py_XDECREF(v);
				1321	return NULL;
				1322	}
				1323
				1324	/* Return a Unicode-Escape string version of the Unicode object.
				1325
				1326	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1327	appropriate.
				1328
				1329	*/
				1330
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1331	static const Py_UNICODE findchar(const Py_UNICODE s,
				1332	int size,
				1333	Py_UNICODE ch);
				1334
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1335	static
				1336	PyObject unicodeescape_string(const Py_UNICODE s,
				1337	int size,
				1338	int quotes)
				1339	{
				1340	PyObject *repr;
				1341	char *p;
				1342	char *q;
				1343
				1344	static const char *hexdigit = "0123456789ABCDEF";
				1345
				1346	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1347	if (repr == NULL)
				1348	return NULL;
				1349
				1350	p = q = PyString_AS_STRING(repr);
				1351
				1352	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1353	*p++ = 'u';
				1354	*p++ = (findchar(s, size, '\'') &&
				1355	!findchar(s, size, '"')) ? '"' : '\'';
				1356	}
				1357	while (size-- > 0) {
				1358	Py_UNICODE ch = *s++;
				1359	/* Escape quotes */
				1360	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1361	*p++ = '\\';
				1362	*p++ = (char) ch;
				1363	}
				1364	/* Map 16-bit characters to '\uxxxx' */
				1365	else if (ch >= 256) {
				1366	*p++ = '\\';
				1367	*p++ = 'u';
				1368	*p++ = hexdigit[(ch >> 12) & 0xf];
				1369	*p++ = hexdigit[(ch >> 8) & 0xf];
				1370	*p++ = hexdigit[(ch >> 4) & 0xf];
				1371	*p++ = hexdigit[ch & 15];
				1372	}
				1373	/* Map non-printable US ASCII to '\ooo' */
				1374	else if (ch < ' ' \|\| ch >= 128) {
				1375	*p++ = '\\';
				1376	*p++ = hexdigit[(ch >> 6) & 7];
				1377	*p++ = hexdigit[(ch >> 3) & 7];
				1378	*p++ = hexdigit[ch & 7];
				1379	}
				1380	/* Copy everything else as-is */
				1381	else
				1382	*p++ = (char) ch;
				1383	}
				1384	if (quotes)
				1385	*p++ = q[1];
				1386
				1387	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1388	if (_PyString_Resize(&repr, p - q))
				1389	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1390
				1391	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1392
				1393	onError:
				1394	Py_DECREF(repr);
				1395	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1396	}
				1397
				1398	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1399	int size)
				1400	{
				1401	return unicodeescape_string(s, size, 0);
				1402	}
				1403
				1404	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1405	{
				1406	if (!PyUnicode_Check(unicode)) {
				1407	PyErr_BadArgument();
				1408	return NULL;
				1409	}
				1410	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1411	PyUnicode_GET_SIZE(unicode));
				1412	}
				1413
				1414	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1415
				1416	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1417	int size,
				1418	const char *errors)
				1419	{
				1420	PyUnicodeObject *v;
				1421	Py_UNICODE p, buf;
				1422	const char *end;
				1423	const char *bs;
				1424
				1425	/* Escaped strings will always be longer than the resulting
				1426	Unicode string, so we start with size here and then reduce the
				1427	length after conversion to the true value. */
				1428	v = _PyUnicode_New(size);
				1429	if (v == NULL)
				1430	goto onError;
				1431	if (size == 0)
				1432	return (PyObject *)v;
				1433	p = buf = PyUnicode_AS_UNICODE(v);
				1434	end = s + size;
				1435	while (s < end) {
				1436	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1437	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1438	int i;
				1439
				1440	/* Non-escape characters are interpreted as Unicode ordinals */
				1441	if (*s != '\\') {
				1442	p++ = (unsigned char)s++;
				1443	continue;
				1444	}
				1445
				1446	/* \u-escapes are only interpreted iff the number of leading
				1447	backslashes if odd */
				1448	bs = s;
				1449	for (;s < end;) {
				1450	if (*s != '\\')
				1451	break;
				1452	p++ = (unsigned char)s++;
				1453	}
				1454	if (((s - bs) & 1) == 0 \|\|
				1455	s >= end \|\|
				1456	*s != 'u') {
				1457	continue;
				1458	}
				1459	p--;
				1460	s++;
				1461
				1462	/* \uXXXX with 4 hex digits */
				1463	for (x = 0, i = 0; i < 4; i++) {
				1464	c = (unsigned char)s[i];
				1465	if (!isxdigit(c)) {
				1466	if (unicodeescape_decoding_error(&s, &x, errors,
				1467	"truncated \\uXXXX"))
				1468	goto onError;
				1469	i++;
				1470	break;
				1471	}
				1472	x = (x<<4) & ~0xF;
				1473	if (c >= '0' && c <= '9')
				1474	x += c - '0';
				1475	else if (c >= 'a' && c <= 'f')
				1476	x += 10 + c - 'a';
				1477	else
				1478	x += 10 + c - 'A';
				1479	}
				1480	s += i;
				1481	*p++ = x;
				1482	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1483	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1484	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	return (PyObject *)v;
				1486
				1487	onError:
				1488	Py_XDECREF(v);
				1489	return NULL;
				1490	}
				1491
				1492	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1493	int size)
				1494	{
				1495	PyObject *repr;
				1496	char *p;
				1497	char *q;
				1498
				1499	static const char *hexdigit = "0123456789ABCDEF";
				1500
				1501	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1502	if (repr == NULL)
				1503	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1504	if (size == 0)
				1505	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1506
				1507	p = q = PyString_AS_STRING(repr);
				1508	while (size-- > 0) {
				1509	Py_UNICODE ch = *s++;
				1510	/* Map 16-bit characters to '\uxxxx' */
				1511	if (ch >= 256) {
				1512	*p++ = '\\';
				1513	*p++ = 'u';
				1514	*p++ = hexdigit[(ch >> 12) & 0xf];
				1515	*p++ = hexdigit[(ch >> 8) & 0xf];
				1516	*p++ = hexdigit[(ch >> 4) & 0xf];
				1517	*p++ = hexdigit[ch & 15];
				1518	}
				1519	/* Copy everything else as-is */
				1520	else
				1521	*p++ = (char) ch;
				1522	}
				1523	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1524	if (_PyString_Resize(&repr, p - q))
				1525	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1526
				1527	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1528
				1529	onError:
				1530	Py_DECREF(repr);
				1531	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1532	}
				1533
				1534	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1535	{
				1536	if (!PyUnicode_Check(unicode)) {
				1537	PyErr_BadArgument();
				1538	return NULL;
				1539	}
				1540	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1541	PyUnicode_GET_SIZE(unicode));
				1542	}
				1543
				1544	/* --- Latin-1 Codec ------------------------------------------------------ */
				1545
				1546	PyObject PyUnicode_DecodeLatin1(const char s,
				1547	int size,
				1548	const char *errors)
				1549	{
				1550	PyUnicodeObject *v;
				1551	Py_UNICODE *p;
				1552
				1553	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1554	v = _PyUnicode_New(size);
				1555	if (v == NULL)
				1556	goto onError;
				1557	if (size == 0)
				1558	return (PyObject *)v;
				1559	p = PyUnicode_AS_UNICODE(v);
				1560	while (size-- > 0)
				1561	p++ = (unsigned char)s++;
				1562	return (PyObject *)v;
				1563
				1564	onError:
				1565	Py_XDECREF(v);
				1566	return NULL;
				1567	}
				1568
				1569	static
				1570	int latin1_encoding_error(const Py_UNICODE **source,
				1571	char **dest,
				1572	const char *errors,
				1573	const char *details)
				1574	{
				1575	if ((errors == NULL) \|\|
				1576	(strcmp(errors,"strict") == 0)) {
				1577	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1578	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1579	details);
				1580	return -1;
				1581	}
				1582	else if (strcmp(errors,"ignore") == 0) {
				1583	return 0;
				1584	}
				1585	else if (strcmp(errors,"replace") == 0) {
				1586	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1587	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1588	return 0;
				1589	}
				1590	else {
				1591	PyErr_Format(PyExc_ValueError,
				1592	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1593	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1594	errors);
				1595	return -1;
				1596	}
				1597	}
				1598
				1599	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1600	int size,
				1601	const char *errors)
				1602	{
				1603	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1604	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1605
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1606	repr = PyString_FromStringAndSize(NULL, size);
				1607	if (repr == NULL)
				1608	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1609	if (size == 0)
				1610	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1611
				1612	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1613	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1614	while (size-- > 0) {
				1615	Py_UNICODE ch = *p++;
				1616	if (ch >= 256) {
				1617	if (latin1_encoding_error(&p, &s, errors,
				1618	"ordinal not in range(256)"))
				1619	goto onError;
				1620	}
				1621	else
				1622	*s++ = (char)ch;
				1623	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1624	/* Resize if error handling skipped some characters */
				1625	if (s - start < PyString_GET_SIZE(repr))
				1626	if (_PyString_Resize(&repr, s - start))
				1627	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1628	return repr;
				1629
				1630	onError:
				1631	Py_DECREF(repr);
				1632	return NULL;
				1633	}
				1634
				1635	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1636	{
				1637	if (!PyUnicode_Check(unicode)) {
				1638	PyErr_BadArgument();
				1639	return NULL;
				1640	}
				1641	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1642	PyUnicode_GET_SIZE(unicode),
				1643	NULL);
				1644	}
				1645
				1646	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1647
				1648	static
				1649	int ascii_decoding_error(const char **source,
				1650	Py_UNICODE **dest,
				1651	const char *errors,
				1652	const char *details)
				1653	{
				1654	if ((errors == NULL) \|\|
				1655	(strcmp(errors,"strict") == 0)) {
				1656	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1657	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1658	details);
				1659	return -1;
				1660	}
				1661	else if (strcmp(errors,"ignore") == 0) {
				1662	return 0;
				1663	}
				1664	else if (strcmp(errors,"replace") == 0) {
				1665	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1666	(*dest)++;
				1667	return 0;
				1668	}
				1669	else {
				1670	PyErr_Format(PyExc_ValueError,
				1671	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1672	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1673	errors);
				1674	return -1;
				1675	}
				1676	}
				1677
				1678	PyObject PyUnicode_DecodeASCII(const char s,
				1679	int size,
				1680	const char *errors)
				1681	{
				1682	PyUnicodeObject *v;
				1683	Py_UNICODE *p;
				1684
				1685	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1686	v = _PyUnicode_New(size);
				1687	if (v == NULL)
				1688	goto onError;
				1689	if (size == 0)
				1690	return (PyObject *)v;
				1691	p = PyUnicode_AS_UNICODE(v);
				1692	while (size-- > 0) {
				1693	register unsigned char c;
				1694
				1695	c = (unsigned char)*s++;
				1696	if (c < 128)
				1697	*p++ = c;
				1698	else if (ascii_decoding_error(&s, &p, errors,
				1699	"ordinal not in range(128)"))
				1700	goto onError;
				1701	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1702	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1703	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1704	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1705	return (PyObject *)v;
				1706
				1707	onError:
				1708	Py_XDECREF(v);
				1709	return NULL;
				1710	}
				1711
				1712	static
				1713	int ascii_encoding_error(const Py_UNICODE **source,
				1714	char **dest,
				1715	const char *errors,
				1716	const char *details)
				1717	{
				1718	if ((errors == NULL) \|\|
				1719	(strcmp(errors,"strict") == 0)) {
				1720	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1721	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1722	details);
				1723	return -1;
				1724	}
				1725	else if (strcmp(errors,"ignore") == 0) {
				1726	return 0;
				1727	}
				1728	else if (strcmp(errors,"replace") == 0) {
				1729	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1730	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	return 0;
				1732	}
				1733	else {
				1734	PyErr_Format(PyExc_ValueError,
				1735	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1736	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1737	errors);
				1738	return -1;
				1739	}
				1740	}
				1741
				1742	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1743	int size,
				1744	const char *errors)
				1745	{
				1746	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1747	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1748
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1749	repr = PyString_FromStringAndSize(NULL, size);
				1750	if (repr == NULL)
				1751	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1752	if (size == 0)
				1753	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1754
				1755	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1756	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1757	while (size-- > 0) {
				1758	Py_UNICODE ch = *p++;
				1759	if (ch >= 128) {
				1760	if (ascii_encoding_error(&p, &s, errors,
				1761	"ordinal not in range(128)"))
				1762	goto onError;
				1763	}
				1764	else
				1765	*s++ = (char)ch;
				1766	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1767	/* Resize if error handling skipped some characters */
				1768	if (s - start < PyString_GET_SIZE(repr))
				1769	if (_PyString_Resize(&repr, s - start))
				1770	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1771	return repr;
				1772
				1773	onError:
				1774	Py_DECREF(repr);
				1775	return NULL;
				1776	}
				1777
				1778	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1779	{
				1780	if (!PyUnicode_Check(unicode)) {
				1781	PyErr_BadArgument();
				1782	return NULL;
				1783	}
				1784	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1785	PyUnicode_GET_SIZE(unicode),
				1786	NULL);
				1787	}
				1788
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1789	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1790
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1791	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1792
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1793	PyObject PyUnicode_DecodeMBCS(const char s,
				1794	int size,
				1795	const char *errors)
				1796	{
				1797	PyUnicodeObject *v;
				1798	Py_UNICODE *p;
				1799
				1800	/* First get the size of the result */
				1801	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1802	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1803	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1804
				1805	v = _PyUnicode_New(usize);
				1806	if (v == NULL)
				1807	return NULL;
				1808	if (usize == 0)
				1809	return (PyObject *)v;
				1810	p = PyUnicode_AS_UNICODE(v);
				1811	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1812	Py_DECREF(v);
				1813	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1814	}
				1815
				1816	return (PyObject *)v;
				1817	}
				1818
				1819	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1820	int size,
				1821	const char *errors)
				1822	{
				1823	PyObject *repr;
				1824	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1825	DWORD mbcssize;
				1826
				1827	/* If there are no characters, bail now! */
				1828	if (size==0)
				1829	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1830
				1831	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1832	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1833	if (mbcssize==0)
				1834	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1835
				1836	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1837	if (repr == NULL)
				1838	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1839	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1840	return repr;
				1841
				1842	/* Do the conversion */
				1843	s = PyString_AS_STRING(repr);
				1844	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1845	Py_DECREF(repr);
				1846	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1847	}
				1848	return repr;
				1849	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1850
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1851	#endif /* MS_WIN32 */
				1852
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1853	/* --- Character Mapping Codec -------------------------------------------- */
				1854
				1855	static
				1856	int charmap_decoding_error(const char **source,
				1857	Py_UNICODE **dest,
				1858	const char *errors,
				1859	const char *details)
				1860	{
				1861	if ((errors == NULL) \|\|
				1862	(strcmp(errors,"strict") == 0)) {
				1863	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1864	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1865	details);
				1866	return -1;
				1867	}
				1868	else if (strcmp(errors,"ignore") == 0) {
				1869	return 0;
				1870	}
				1871	else if (strcmp(errors,"replace") == 0) {
				1872	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1873	(*dest)++;
				1874	return 0;
				1875	}
				1876	else {
				1877	PyErr_Format(PyExc_ValueError,
				1878	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1879	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1880	errors);
				1881	return -1;
				1882	}
				1883	}
				1884
				1885	PyObject PyUnicode_DecodeCharmap(const char s,
				1886	int size,
				1887	PyObject *mapping,
				1888	const char *errors)
				1889	{
				1890	PyUnicodeObject *v;
				1891	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1892	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1893
				1894	/* Default to Latin-1 */
				1895	if (mapping == NULL)
				1896	return PyUnicode_DecodeLatin1(s, size, errors);
				1897
				1898	v = _PyUnicode_New(size);
				1899	if (v == NULL)
				1900	goto onError;
				1901	if (size == 0)
				1902	return (PyObject *)v;
				1903	p = PyUnicode_AS_UNICODE(v);
				1904	while (size-- > 0) {
				1905	unsigned char ch = *s++;
				1906	PyObject w, x;
				1907
				1908	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1909	w = PyInt_FromLong((long)ch);
				1910	if (w == NULL)
				1911	goto onError;
				1912	x = PyObject_GetItem(mapping, w);
				1913	Py_DECREF(w);
				1914	if (x == NULL) {
				1915	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1916	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1917	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1918	x = Py_None;
				1919	Py_INCREF(x);
				1920	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1921	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1922	}
				1923
				1924	/* Apply mapping */
				1925	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1926	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1927	if (value < 0 \|\| value > 65535) {
				1928	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1929	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1930	Py_DECREF(x);
				1931	goto onError;
				1932	}
				1933	*p++ = (Py_UNICODE)value;
				1934	}
				1935	else if (x == Py_None) {
				1936	/* undefined mapping */
				1937	if (charmap_decoding_error(&s, &p, errors,
				1938	"character maps to <undefined>")) {
				1939	Py_DECREF(x);
				1940	goto onError;
				1941	}
				1942	}
				1943	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1944	int targetsize = PyUnicode_GET_SIZE(x);
				1945
				1946	if (targetsize == 1)
				1947	/* 1-1 mapping */
				1948	p++ = PyUnicode_AS_UNICODE(x);
				1949
				1950	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1951	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1952	if (targetsize > extrachars) {
				1953	/* resize first */
				1954	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				1955	int needed = (targetsize - extrachars) + \
				1956	(targetsize << 2);
				1957	extrachars += needed;
				1958	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1959	Py_DECREF(x);
				1960	goto onError;
				1961	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1962	p = PyUnicode_AS_UNICODE(v) + oldpos;
				1963	}
				1964	Py_UNICODE_COPY(p,
				1965	PyUnicode_AS_UNICODE(x),
				1966	targetsize);
				1967	p += targetsize;
				1968	extrachars -= targetsize;
				1969	}
				1970	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1971	}
				1972	else {
				1973	/* wrong return value */
				1974	PyErr_SetString(PyExc_TypeError,
				1975	"character mapping must return integer, None or unicode");
				1976	Py_DECREF(x);
				1977	goto onError;
				1978	}
				1979	Py_DECREF(x);
				1980	}
				1981	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1982	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1983	goto onError;
				1984	return (PyObject *)v;
				1985
				1986	onError:
				1987	Py_XDECREF(v);
				1988	return NULL;
				1989	}
				1990
				1991	static
				1992	int charmap_encoding_error(const Py_UNICODE **source,
				1993	char **dest,
				1994	const char *errors,
				1995	const char *details)
				1996	{
				1997	if ((errors == NULL) \|\|
				1998	(strcmp(errors,"strict") == 0)) {
				1999	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2000	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2001	details);
				2002	return -1;
				2003	}
				2004	else if (strcmp(errors,"ignore") == 0) {
				2005	return 0;
				2006	}
				2007	else if (strcmp(errors,"replace") == 0) {
				2008	**dest = '?';
				2009	(*dest)++;
				2010	return 0;
				2011	}
				2012	else {
				2013	PyErr_Format(PyExc_ValueError,
				2014	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2015	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2016	errors);
				2017	return -1;
				2018	}
				2019	}
				2020
				2021	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2022	int size,
				2023	PyObject *mapping,
				2024	const char *errors)
				2025	{
				2026	PyObject *v;
				2027	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2028	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2029
				2030	/* Default to Latin-1 */
				2031	if (mapping == NULL)
				2032	return PyUnicode_EncodeLatin1(p, size, errors);
				2033
				2034	v = PyString_FromStringAndSize(NULL, size);
				2035	if (v == NULL)
				2036	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2037	if (size == 0)
				2038	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2039	s = PyString_AS_STRING(v);
				2040	while (size-- > 0) {
				2041	Py_UNICODE ch = *p++;
				2042	PyObject w, x;
				2043
				2044	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2045	w = PyInt_FromLong((long)ch);
				2046	if (w == NULL)
				2047	goto onError;
				2048	x = PyObject_GetItem(mapping, w);
				2049	Py_DECREF(w);
				2050	if (x == NULL) {
				2051	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2052	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2053	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2054	x = Py_None;
				2055	Py_INCREF(x);
				2056	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2057	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2058	}
				2059
				2060	/* Apply mapping */
				2061	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2062	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2063	if (value < 0 \|\| value > 255) {
				2064	PyErr_SetString(PyExc_TypeError,
				2065	"character mapping must be in range(256)");
				2066	Py_DECREF(x);
				2067	goto onError;
				2068	}
				2069	*s++ = (char)value;
				2070	}
				2071	else if (x == Py_None) {
				2072	/* undefined mapping */
				2073	if (charmap_encoding_error(&p, &s, errors,
				2074	"character maps to <undefined>")) {
				2075	Py_DECREF(x);
				2076	goto onError;
				2077	}
				2078	}
				2079	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2080	int targetsize = PyString_GET_SIZE(x);
				2081
				2082	if (targetsize == 1)
				2083	/* 1-1 mapping */
				2084	s++ = PyString_AS_STRING(x);
				2085
				2086	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2087	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2088	if (targetsize > extrachars) {
				2089	/* resize first */
				2090	int oldpos = (int)(s - PyString_AS_STRING(v));
				2091	int needed = (targetsize - extrachars) + \
				2092	(targetsize << 2);
				2093	extrachars += needed;
				2094	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2095	Py_DECREF(x);
				2096	goto onError;
				2097	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2098	s = PyString_AS_STRING(v) + oldpos;
				2099	}
				2100	memcpy(s,
				2101	PyString_AS_STRING(x),
				2102	targetsize);
				2103	s += targetsize;
				2104	extrachars -= targetsize;
				2105	}
				2106	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2107	}
				2108	else {
				2109	/* wrong return value */
				2110	PyErr_SetString(PyExc_TypeError,
				2111	"character mapping must return integer, None or unicode");
				2112	Py_DECREF(x);
				2113	goto onError;
				2114	}
				2115	Py_DECREF(x);
				2116	}
				2117	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2118	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2119	goto onError;
				2120	return v;
				2121
				2122	onError:
				2123	Py_DECREF(v);
				2124	return NULL;
				2125	}
				2126
				2127	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2128	PyObject *mapping)
				2129	{
				2130	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2131	PyErr_BadArgument();
				2132	return NULL;
				2133	}
				2134	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2135	PyUnicode_GET_SIZE(unicode),
				2136	mapping,
				2137	NULL);
				2138	}
				2139
				2140	static
				2141	int translate_error(const Py_UNICODE **source,
				2142	Py_UNICODE **dest,
				2143	const char *errors,
				2144	const char *details)
				2145	{
				2146	if ((errors == NULL) \|\|
				2147	(strcmp(errors,"strict") == 0)) {
				2148	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2149	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2150	details);
				2151	return -1;
				2152	}
				2153	else if (strcmp(errors,"ignore") == 0) {
				2154	return 0;
				2155	}
				2156	else if (strcmp(errors,"replace") == 0) {
				2157	**dest = '?';
				2158	(*dest)++;
				2159	return 0;
				2160	}
				2161	else {
				2162	PyErr_Format(PyExc_ValueError,
				2163	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2164	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2165	errors);
				2166	return -1;
				2167	}
				2168	}
				2169
				2170	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2171	int size,
				2172	PyObject *mapping,
				2173	const char *errors)
				2174	{
				2175	PyUnicodeObject *v;
				2176	Py_UNICODE *p;
				2177
				2178	if (mapping == NULL) {
				2179	PyErr_BadArgument();
				2180	return NULL;
				2181	}
				2182
				2183	/* Output will never be longer than input */
				2184	v = _PyUnicode_New(size);
				2185	if (v == NULL)
				2186	goto onError;
				2187	if (size == 0)
				2188	goto done;
				2189	p = PyUnicode_AS_UNICODE(v);
				2190	while (size-- > 0) {
				2191	Py_UNICODE ch = *s++;
				2192	PyObject w, x;
				2193
				2194	/* Get mapping */
				2195	w = PyInt_FromLong(ch);
				2196	if (w == NULL)
				2197	goto onError;
				2198	x = PyObject_GetItem(mapping, w);
				2199	Py_DECREF(w);
				2200	if (x == NULL) {
				2201	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2202	/* No mapping found: default to 1-1 mapping */
				2203	PyErr_Clear();
				2204	*p++ = ch;
				2205	continue;
				2206	}
				2207	goto onError;
				2208	}
				2209
				2210	/* Apply mapping */
				2211	if (PyInt_Check(x))
				2212	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2213	else if (x == Py_None) {
				2214	/* undefined mapping */
				2215	if (translate_error(&s, &p, errors,
				2216	"character maps to <undefined>")) {
				2217	Py_DECREF(x);
				2218	goto onError;
				2219	}
				2220	}
				2221	else if (PyUnicode_Check(x)) {
				2222	if (PyUnicode_GET_SIZE(x) != 1) {
				2223	/* 1-n mapping */
				2224	PyErr_SetString(PyExc_NotImplementedError,
				2225	"1-n mappings are currently not implemented");
				2226	Py_DECREF(x);
				2227	goto onError;
				2228	}
				2229	p++ = PyUnicode_AS_UNICODE(x);
				2230	}
				2231	else {
				2232	/* wrong return value */
				2233	PyErr_SetString(PyExc_TypeError,
				2234	"translate mapping must return integer, None or unicode");
				2235	Py_DECREF(x);
				2236	goto onError;
				2237	}
				2238	Py_DECREF(x);
				2239	}
				2240	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2241	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2242	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2243
				2244	done:
				2245	return (PyObject *)v;
				2246
				2247	onError:
				2248	Py_XDECREF(v);
				2249	return NULL;
				2250	}
				2251
				2252	PyObject PyUnicode_Translate(PyObject str,
				2253	PyObject *mapping,
				2254	const char *errors)
				2255	{
				2256	PyObject *result;
				2257
				2258	str = PyUnicode_FromObject(str);
				2259	if (str == NULL)
				2260	goto onError;
				2261	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2262	PyUnicode_GET_SIZE(str),
				2263	mapping,
				2264	errors);
				2265	Py_DECREF(str);
				2266	return result;
				2267
				2268	onError:
				2269	Py_XDECREF(str);
				2270	return NULL;
				2271	}
				2272
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2273	/* --- Decimal Encoder ---------------------------------------------------- */
				2274
				2275	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2276	int length,
				2277	char *output,
				2278	const char *errors)
				2279	{
				2280	Py_UNICODE p, end;
				2281
				2282	if (output == NULL) {
				2283	PyErr_BadArgument();
				2284	return -1;
				2285	}
				2286
				2287	p = s;
				2288	end = s + length;
				2289	while (p < end) {
				2290	register Py_UNICODE ch = *p++;
				2291	int decimal;
				2292
				2293	if (Py_UNICODE_ISSPACE(ch)) {
				2294	*output++ = ' ';
				2295	continue;
				2296	}
				2297	decimal = Py_UNICODE_TODECIMAL(ch);
				2298	if (decimal >= 0) {
				2299	*output++ = '0' + decimal;
				2300	continue;
				2301	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2302	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2303	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2304	continue;
				2305	}
				2306	/* All other characters are considered invalid */
				2307	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2308	PyErr_SetString(PyExc_ValueError,
				2309	"invalid decimal Unicode string");
				2310	goto onError;
				2311	}
				2312	else if (strcmp(errors, "ignore") == 0)
				2313	continue;
				2314	else if (strcmp(errors, "replace") == 0) {
				2315	*output++ = '?';
				2316	continue;
				2317	}
				2318	}
				2319	/* 0-terminate the output string */
				2320	*output++ = '\0';
				2321	return 0;
				2322
				2323	onError:
				2324	return -1;
				2325	}
				2326
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2327	/* --- Helpers ------------------------------------------------------------ */
				2328
				2329	static
				2330	int count(PyUnicodeObject *self,
				2331	int start,
				2332	int end,
				2333	PyUnicodeObject *substring)
				2334	{
				2335	int count = 0;
				2336
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2337	if (start < 0)
				2338	start += self->length;
				2339	if (start < 0)
				2340	start = 0;
				2341	if (end > self->length)
				2342	end = self->length;
				2343	if (end < 0)
				2344	end += self->length;
				2345	if (end < 0)
				2346	end = 0;
				2347
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2348	if (substring->length == 0)
				2349	return (end - start + 1);
				2350
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2351	end -= substring->length;
				2352
				2353	while (start <= end)
				2354	if (Py_UNICODE_MATCH(self, start, substring)) {
				2355	count++;
				2356	start += substring->length;
				2357	} else
				2358	start++;
				2359
				2360	return count;
				2361	}
				2362
				2363	int PyUnicode_Count(PyObject *str,
				2364	PyObject *substr,
				2365	int start,
				2366	int end)
				2367	{
				2368	int result;
				2369
				2370	str = PyUnicode_FromObject(str);
				2371	if (str == NULL)
				2372	return -1;
				2373	substr = PyUnicode_FromObject(substr);
				2374	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2375	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2376	return -1;
				2377	}
				2378
				2379	result = count((PyUnicodeObject *)str,
				2380	start, end,
				2381	(PyUnicodeObject *)substr);
				2382
				2383	Py_DECREF(str);
				2384	Py_DECREF(substr);
				2385	return result;
				2386	}
				2387
				2388	static
				2389	int findstring(PyUnicodeObject *self,
				2390	PyUnicodeObject *substring,
				2391	int start,
				2392	int end,
				2393	int direction)
				2394	{
				2395	if (start < 0)
				2396	start += self->length;
				2397	if (start < 0)
				2398	start = 0;
				2399
				2400	if (substring->length == 0)
				2401	return start;
				2402
				2403	if (end > self->length)
				2404	end = self->length;
				2405	if (end < 0)
				2406	end += self->length;
				2407	if (end < 0)
				2408	end = 0;
				2409
				2410	end -= substring->length;
				2411
				2412	if (direction < 0) {
				2413	for (; end >= start; end--)
				2414	if (Py_UNICODE_MATCH(self, end, substring))
				2415	return end;
				2416	} else {
				2417	for (; start <= end; start++)
				2418	if (Py_UNICODE_MATCH(self, start, substring))
				2419	return start;
				2420	}
				2421
				2422	return -1;
				2423	}
				2424
				2425	int PyUnicode_Find(PyObject *str,
				2426	PyObject *substr,
				2427	int start,
				2428	int end,
				2429	int direction)
				2430	{
				2431	int result;
				2432
				2433	str = PyUnicode_FromObject(str);
				2434	if (str == NULL)
				2435	return -1;
				2436	substr = PyUnicode_FromObject(substr);
				2437	if (substr == NULL) {
				2438	Py_DECREF(substr);
				2439	return -1;
				2440	}
				2441
				2442	result = findstring((PyUnicodeObject *)str,
				2443	(PyUnicodeObject *)substr,
				2444	start, end, direction);
				2445	Py_DECREF(str);
				2446	Py_DECREF(substr);
				2447	return result;
				2448	}
				2449
				2450	static
				2451	int tailmatch(PyUnicodeObject *self,
				2452	PyUnicodeObject *substring,
				2453	int start,
				2454	int end,
				2455	int direction)
				2456	{
				2457	if (start < 0)
				2458	start += self->length;
				2459	if (start < 0)
				2460	start = 0;
				2461
				2462	if (substring->length == 0)
				2463	return 1;
				2464
				2465	if (end > self->length)
				2466	end = self->length;
				2467	if (end < 0)
				2468	end += self->length;
				2469	if (end < 0)
				2470	end = 0;
				2471
				2472	end -= substring->length;
				2473	if (end < start)
				2474	return 0;
				2475
				2476	if (direction > 0) {
				2477	if (Py_UNICODE_MATCH(self, end, substring))
				2478	return 1;
				2479	} else {
				2480	if (Py_UNICODE_MATCH(self, start, substring))
				2481	return 1;
				2482	}
				2483
				2484	return 0;
				2485	}
				2486
				2487	int PyUnicode_Tailmatch(PyObject *str,
				2488	PyObject *substr,
				2489	int start,
				2490	int end,
				2491	int direction)
				2492	{
				2493	int result;
				2494
				2495	str = PyUnicode_FromObject(str);
				2496	if (str == NULL)
				2497	return -1;
				2498	substr = PyUnicode_FromObject(substr);
				2499	if (substr == NULL) {
				2500	Py_DECREF(substr);
				2501	return -1;
				2502	}
				2503
				2504	result = tailmatch((PyUnicodeObject *)str,
				2505	(PyUnicodeObject *)substr,
				2506	start, end, direction);
				2507	Py_DECREF(str);
				2508	Py_DECREF(substr);
				2509	return result;
				2510	}
				2511
				2512	static
				2513	const Py_UNICODE findchar(const Py_UNICODE s,
				2514	int size,
				2515	Py_UNICODE ch)
				2516	{
				2517	/* like wcschr, but doesn't stop at NULL characters */
				2518
				2519	while (size-- > 0) {
				2520	if (*s == ch)
				2521	return s;
				2522	s++;
				2523	}
				2524
				2525	return NULL;
				2526	}
				2527
				2528	/* Apply fixfct filter to the Unicode object self and return a
				2529	reference to the modified object */
				2530
				2531	static
				2532	PyObject fixup(PyUnicodeObject self,
				2533	int (fixfct)(PyUnicodeObject s))
				2534	{
				2535
				2536	PyUnicodeObject *u;
				2537
				2538	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2539	self->length);
				2540	if (u == NULL)
				2541	return NULL;
				2542	if (!fixfct(u)) {
				2543	/* fixfct should return TRUE if it modified the buffer. If
				2544	FALSE, return a reference to the original buffer instead
				2545	(to save space, not time) */
				2546	Py_INCREF(self);
				2547	Py_DECREF(u);
				2548	return (PyObject*) self;
				2549	}
				2550	return (PyObject*) u;
				2551	}
				2552
				2553	static
				2554	int fixupper(PyUnicodeObject *self)
				2555	{
				2556	int len = self->length;
				2557	Py_UNICODE *s = self->str;
				2558	int status = 0;
				2559
				2560	while (len-- > 0) {
				2561	register Py_UNICODE ch;
				2562
				2563	ch = Py_UNICODE_TOUPPER(*s);
				2564	if (ch != *s) {
				2565	status = 1;
				2566	*s = ch;
				2567	}
				2568	s++;
				2569	}
				2570
				2571	return status;
				2572	}
				2573
				2574	static
				2575	int fixlower(PyUnicodeObject *self)
				2576	{
				2577	int len = self->length;
				2578	Py_UNICODE *s = self->str;
				2579	int status = 0;
				2580
				2581	while (len-- > 0) {
				2582	register Py_UNICODE ch;
				2583
				2584	ch = Py_UNICODE_TOLOWER(*s);
				2585	if (ch != *s) {
				2586	status = 1;
				2587	*s = ch;
				2588	}
				2589	s++;
				2590	}
				2591
				2592	return status;
				2593	}
				2594
				2595	static
				2596	int fixswapcase(PyUnicodeObject *self)
				2597	{
				2598	int len = self->length;
				2599	Py_UNICODE *s = self->str;
				2600	int status = 0;
				2601
				2602	while (len-- > 0) {
				2603	if (Py_UNICODE_ISUPPER(*s)) {
				2604	s = Py_UNICODE_TOLOWER(s);
				2605	status = 1;
				2606	} else if (Py_UNICODE_ISLOWER(*s)) {
				2607	s = Py_UNICODE_TOUPPER(s);
				2608	status = 1;
				2609	}
				2610	s++;
				2611	}
				2612
				2613	return status;
				2614	}
				2615
				2616	static
				2617	int fixcapitalize(PyUnicodeObject *self)
				2618	{
				2619	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2620	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2621	return 1;
				2622	}
				2623	return 0;
				2624	}
				2625
				2626	static
				2627	int fixtitle(PyUnicodeObject *self)
				2628	{
				2629	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2630	register Py_UNICODE *e;
				2631	int previous_is_cased;
				2632
				2633	/* Shortcut for single character strings */
				2634	if (PyUnicode_GET_SIZE(self) == 1) {
				2635	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2636	if (*p != ch) {
				2637	*p = ch;
				2638	return 1;
				2639	}
				2640	else
				2641	return 0;
				2642	}
				2643
				2644	e = p + PyUnicode_GET_SIZE(self);
				2645	previous_is_cased = 0;
				2646	for (; p < e; p++) {
				2647	register const Py_UNICODE ch = *p;
				2648
				2649	if (previous_is_cased)
				2650	*p = Py_UNICODE_TOLOWER(ch);
				2651	else
				2652	*p = Py_UNICODE_TOTITLE(ch);
				2653
				2654	if (Py_UNICODE_ISLOWER(ch) \|\|
				2655	Py_UNICODE_ISUPPER(ch) \|\|
				2656	Py_UNICODE_ISTITLE(ch))
				2657	previous_is_cased = 1;
				2658	else
				2659	previous_is_cased = 0;
				2660	}
				2661	return 1;
				2662	}
				2663
				2664	PyObject PyUnicode_Join(PyObject separator,
				2665	PyObject *seq)
				2666	{
				2667	Py_UNICODE *sep;
				2668	int seplen;
				2669	PyUnicodeObject *res = NULL;
				2670	int reslen = 0;
				2671	Py_UNICODE *p;
				2672	int seqlen = 0;
				2673	int sz = 100;
				2674	int i;
				2675
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2676	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2677	if (seqlen < 0 && PyErr_Occurred())
				2678	return NULL;
				2679
				2680	if (separator == NULL) {
				2681	Py_UNICODE blank = ' ';
				2682	sep = &blank;
				2683	seplen = 1;
				2684	}
				2685	else {
				2686	separator = PyUnicode_FromObject(separator);
				2687	if (separator == NULL)
				2688	return NULL;
				2689	sep = PyUnicode_AS_UNICODE(separator);
				2690	seplen = PyUnicode_GET_SIZE(separator);
				2691	}
				2692
				2693	res = _PyUnicode_New(sz);
				2694	if (res == NULL)
				2695	goto onError;
				2696	p = PyUnicode_AS_UNICODE(res);
				2697	reslen = 0;
				2698
				2699	for (i = 0; i < seqlen; i++) {
				2700	int itemlen;
				2701	PyObject *item;
				2702
				2703	item = PySequence_GetItem(seq, i);
				2704	if (item == NULL)
				2705	goto onError;
				2706	if (!PyUnicode_Check(item)) {
				2707	PyObject *v;
				2708	v = PyUnicode_FromObject(item);
				2709	Py_DECREF(item);
				2710	item = v;
				2711	if (item == NULL)
				2712	goto onError;
				2713	}
				2714	itemlen = PyUnicode_GET_SIZE(item);
				2715	while (reslen + itemlen + seplen >= sz) {
				2716	if (_PyUnicode_Resize(res, sz*2))
				2717	goto onError;
				2718	sz *= 2;
				2719	p = PyUnicode_AS_UNICODE(res) + reslen;
				2720	}
				2721	if (i > 0) {
				2722	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2723	p += seplen;
				2724	reslen += seplen;
				2725	}
				2726	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2727	p += itemlen;
				2728	reslen += itemlen;
				2729	Py_DECREF(item);
				2730	}
				2731	if (_PyUnicode_Resize(res, reslen))
				2732	goto onError;
				2733
				2734	Py_XDECREF(separator);
				2735	return (PyObject *)res;
				2736
				2737	onError:
				2738	Py_XDECREF(separator);
				2739	Py_DECREF(res);
				2740	return NULL;
				2741	}
				2742
				2743	static
				2744	PyUnicodeObject pad(PyUnicodeObject self,
				2745	int left,
				2746	int right,
				2747	Py_UNICODE fill)
				2748	{
				2749	PyUnicodeObject *u;
				2750
				2751	if (left < 0)
				2752	left = 0;
				2753	if (right < 0)
				2754	right = 0;
				2755
				2756	if (left == 0 && right == 0) {
				2757	Py_INCREF(self);
				2758	return self;
				2759	}
				2760
				2761	u = _PyUnicode_New(left + self->length + right);
				2762	if (u) {
				2763	if (left)
				2764	Py_UNICODE_FILL(u->str, fill, left);
				2765	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2766	if (right)
				2767	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2768	}
				2769
				2770	return u;
				2771	}
				2772
				2773	#define SPLIT_APPEND(data, left, right) \
				2774	str = PyUnicode_FromUnicode(data + left, right - left); \
				2775	if (!str) \
				2776	goto onError; \
				2777	if (PyList_Append(list, str)) { \
				2778	Py_DECREF(str); \
				2779	goto onError; \
				2780	} \
				2781	else \
				2782	Py_DECREF(str);
				2783
				2784	static
				2785	PyObject split_whitespace(PyUnicodeObject self,
				2786	PyObject *list,
				2787	int maxcount)
				2788	{
				2789	register int i;
				2790	register int j;
				2791	int len = self->length;
				2792	PyObject *str;
				2793
				2794	for (i = j = 0; i < len; ) {
				2795	/* find a token */
				2796	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2797	i++;
				2798	j = i;
				2799	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2800	i++;
				2801	if (j < i) {
				2802	if (maxcount-- <= 0)
				2803	break;
				2804	SPLIT_APPEND(self->str, j, i);
				2805	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2806	i++;
				2807	j = i;
				2808	}
				2809	}
				2810	if (j < len) {
				2811	SPLIT_APPEND(self->str, j, len);
				2812	}
				2813	return list;
				2814
				2815	onError:
				2816	Py_DECREF(list);
				2817	return NULL;
				2818	}
				2819
				2820	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2821	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2822	{
				2823	register int i;
				2824	register int j;
				2825	int len;
				2826	PyObject *list;
				2827	PyObject *str;
				2828	Py_UNICODE *data;
				2829
				2830	string = PyUnicode_FromObject(string);
				2831	if (string == NULL)
				2832	return NULL;
				2833	data = PyUnicode_AS_UNICODE(string);
				2834	len = PyUnicode_GET_SIZE(string);
				2835
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2836	list = PyList_New(0);
				2837	if (!list)
				2838	goto onError;
				2839
				2840	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2841	int eol;
				2842
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2843	/* Find a line and append it */
				2844	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2845	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2846
				2847	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2848	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2849	if (i < len) {
				2850	if (data[i] == '\r' && i + 1 < len &&
				2851	data[i+1] == '\n')
				2852	i += 2;
				2853	else
				2854	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2855	if (keepends)
				2856	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2857	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2858	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2859	j = i;
				2860	}
				2861	if (j < len) {
				2862	SPLIT_APPEND(data, j, len);
				2863	}
				2864
				2865	Py_DECREF(string);
				2866	return list;
				2867
				2868	onError:
				2869	Py_DECREF(list);
				2870	Py_DECREF(string);
				2871	return NULL;
				2872	}
				2873
				2874	static
				2875	PyObject split_char(PyUnicodeObject self,
				2876	PyObject *list,
				2877	Py_UNICODE ch,
				2878	int maxcount)
				2879	{
				2880	register int i;
				2881	register int j;
				2882	int len = self->length;
				2883	PyObject *str;
				2884
				2885	for (i = j = 0; i < len; ) {
				2886	if (self->str[i] == ch) {
				2887	if (maxcount-- <= 0)
				2888	break;
				2889	SPLIT_APPEND(self->str, j, i);
				2890	i = j = i + 1;
				2891	} else
				2892	i++;
				2893	}
				2894	if (j <= len) {
				2895	SPLIT_APPEND(self->str, j, len);
				2896	}
				2897	return list;
				2898
				2899	onError:
				2900	Py_DECREF(list);
				2901	return NULL;
				2902	}
				2903
				2904	static
				2905	PyObject split_substring(PyUnicodeObject self,
				2906	PyObject *list,
				2907	PyUnicodeObject *substring,
				2908	int maxcount)
				2909	{
				2910	register int i;
				2911	register int j;
				2912	int len = self->length;
				2913	int sublen = substring->length;
				2914	PyObject *str;
				2915
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2916	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2917	if (Py_UNICODE_MATCH(self, i, substring)) {
				2918	if (maxcount-- <= 0)
				2919	break;
				2920	SPLIT_APPEND(self->str, j, i);
				2921	i = j = i + sublen;
				2922	} else
				2923	i++;
				2924	}
				2925	if (j <= len) {
				2926	SPLIT_APPEND(self->str, j, len);
				2927	}
				2928	return list;
				2929
				2930	onError:
				2931	Py_DECREF(list);
				2932	return NULL;
				2933	}
				2934
				2935	#undef SPLIT_APPEND
				2936
				2937	static
				2938	PyObject split(PyUnicodeObject self,
				2939	PyUnicodeObject *substring,
				2940	int maxcount)
				2941	{
				2942	PyObject *list;
				2943
				2944	if (maxcount < 0)
				2945	maxcount = INT_MAX;
				2946
				2947	list = PyList_New(0);
				2948	if (!list)
				2949	return NULL;
				2950
				2951	if (substring == NULL)
				2952	return split_whitespace(self,list,maxcount);
				2953
				2954	else if (substring->length == 1)
				2955	return split_char(self,list,substring->str[0],maxcount);
				2956
				2957	else if (substring->length == 0) {
				2958	Py_DECREF(list);
				2959	PyErr_SetString(PyExc_ValueError, "empty separator");
				2960	return NULL;
				2961	}
				2962	else
				2963	return split_substring(self,list,substring,maxcount);
				2964	}
				2965
				2966	static
				2967	PyObject strip(PyUnicodeObject self,
				2968	int left,
				2969	int right)
				2970	{
				2971	Py_UNICODE *p = self->str;
				2972	int start = 0;
				2973	int end = self->length;
				2974
				2975	if (left)
				2976	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2977	start++;
				2978
				2979	if (right)
				2980	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2981	end--;
				2982
				2983	if (start == 0 && end == self->length) {
				2984	/* couldn't strip anything off, return original string */
				2985	Py_INCREF(self);
				2986	return (PyObject*) self;
				2987	}
				2988
				2989	return (PyObject*) PyUnicode_FromUnicode(
				2990	self->str + start,
				2991	end - start
				2992	);
				2993	}
				2994
				2995	static
				2996	PyObject replace(PyUnicodeObject self,
				2997	PyUnicodeObject *str1,
				2998	PyUnicodeObject *str2,
				2999	int maxcount)
				3000	{
				3001	PyUnicodeObject *u;
				3002
				3003	if (maxcount < 0)
				3004	maxcount = INT_MAX;
				3005
				3006	if (str1->length == 1 && str2->length == 1) {
				3007	int i;
				3008
				3009	/* replace characters */
				3010	if (!findchar(self->str, self->length, str1->str[0])) {
				3011	/* nothing to replace, return original string */
				3012	Py_INCREF(self);
				3013	u = self;
				3014	} else {
				3015	Py_UNICODE u1 = str1->str[0];
				3016	Py_UNICODE u2 = str2->str[0];
				3017
				3018	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3019	self->str,
				3020	self->length
				3021	);
				3022	if (u)
				3023	for (i = 0; i < u->length; i++)
				3024	if (u->str[i] == u1) {
				3025	if (--maxcount < 0)
				3026	break;
				3027	u->str[i] = u2;
				3028	}
				3029	}
				3030
				3031	} else {
				3032	int n, i;
				3033	Py_UNICODE *p;
				3034
				3035	/* replace strings */
				3036	n = count(self, 0, self->length, str1);
				3037	if (n > maxcount)
				3038	n = maxcount;
				3039	if (n == 0) {
				3040	/* nothing to replace, return original string */
				3041	Py_INCREF(self);
				3042	u = self;
				3043	} else {
				3044	u = _PyUnicode_New(
				3045	self->length + n * (str2->length - str1->length));
				3046	if (u) {
				3047	i = 0;
				3048	p = u->str;
				3049	while (i <= self->length - str1->length)
				3050	if (Py_UNICODE_MATCH(self, i, str1)) {
				3051	/* replace string segment */
				3052	Py_UNICODE_COPY(p, str2->str, str2->length);
				3053	p += str2->length;
				3054	i += str1->length;
				3055	if (--n <= 0) {
				3056	/* copy remaining part */
				3057	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3058	break;
				3059	}
				3060	} else
				3061	*p++ = self->str[i++];
				3062	}
				3063	}
				3064	}
				3065
				3066	return (PyObject *) u;
				3067	}
				3068
				3069	/* --- Unicode Object Methods --------------------------------------------- */
				3070
				3071	static char title__doc__[] =
				3072	"S.title() -> unicode\n\
				3073	\n\
				3074	Return a titlecased version of S, i.e. words start with title case\n\
				3075	characters, all remaining cased characters have lower case.";
				3076
				3077	static PyObject*
				3078	unicode_title(PyUnicodeObject self, PyObject args)
				3079	{
				3080	if (!PyArg_NoArgs(args))
				3081	return NULL;
				3082	return fixup(self, fixtitle);
				3083	}
				3084
				3085	static char capitalize__doc__[] =
				3086	"S.capitalize() -> unicode\n\
				3087	\n\
				3088	Return a capitalized version of S, i.e. make the first character\n\
				3089	have upper case.";
				3090
				3091	static PyObject*
				3092	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3093	{
				3094	if (!PyArg_NoArgs(args))
				3095	return NULL;
				3096	return fixup(self, fixcapitalize);
				3097	}
				3098
				3099	#if 0
				3100	static char capwords__doc__[] =
				3101	"S.capwords() -> unicode\n\
				3102	\n\
				3103	Apply .capitalize() to all words in S and return the result with\n\
				3104	normalized whitespace (all whitespace strings are replaced by ' ').";
				3105
				3106	static PyObject*
				3107	unicode_capwords(PyUnicodeObject self, PyObject args)
				3108	{
				3109	PyObject *list;
				3110	PyObject *item;
				3111	int i;
				3112
				3113	if (!PyArg_NoArgs(args))
				3114	return NULL;
				3115
				3116	/* Split into words */
				3117	list = split(self, NULL, -1);
				3118	if (!list)
				3119	return NULL;
				3120
				3121	/* Capitalize each word */
				3122	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3123	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3124	fixcapitalize);
				3125	if (item == NULL)
				3126	goto onError;
				3127	Py_DECREF(PyList_GET_ITEM(list, i));
				3128	PyList_SET_ITEM(list, i, item);
				3129	}
				3130
				3131	/* Join the words to form a new string */
				3132	item = PyUnicode_Join(NULL, list);
				3133
				3134	onError:
				3135	Py_DECREF(list);
				3136	return (PyObject *)item;
				3137	}
				3138	#endif
				3139
				3140	static char center__doc__[] =
				3141	"S.center(width) -> unicode\n\
				3142	\n\
				3143	Return S centered in a Unicode string of length width. Padding is done\n\
				3144	using spaces.";
				3145
				3146	static PyObject *
				3147	unicode_center(PyUnicodeObject self, PyObject args)
				3148	{
				3149	int marg, left;
				3150	int width;
				3151
				3152	if (!PyArg_ParseTuple(args, "i:center", &width))
				3153	return NULL;
				3154
				3155	if (self->length >= width) {
				3156	Py_INCREF(self);
				3157	return (PyObject*) self;
				3158	}
				3159
				3160	marg = width - self->length;
				3161	left = marg / 2 + (marg & width & 1);
				3162
				3163	return (PyObject*) pad(self, left, marg - left, ' ');
				3164	}
				3165
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3166	#if 0
				3167
				3168	/* This code should go into some future Unicode collation support
				3169	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3170	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3171
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3172	/* speedy UTF-16 code point order comparison */
				3173	/* gleaned from: */
				3174	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3175
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3176	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3177	{
				3178	0, 0, 0, 0, 0, 0, 0, 0,
				3179	0, 0, 0, 0, 0, 0, 0, 0,
				3180	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3181	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3182	};
				3183
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3184	static int
				3185	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3186	{
				3187	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3188
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3189	Py_UNICODE *s1 = str1->str;
				3190	Py_UNICODE *s2 = str2->str;
				3191
				3192	len1 = str1->length;
				3193	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3194
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3195	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3196	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3197	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3198
				3199	c1 = *s1++;
				3200	c2 = *s2++;
				3201	if (c1 > (1<<11) * 26)
				3202	c1 += utf16Fixup[c1>>11];
				3203	if (c2 > (1<<11) * 26)
				3204	c2 += utf16Fixup[c2>>11];
				3205
				3206	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3207	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3208	if (diff)
				3209	return (diff < 0) ? -1 : (diff != 0);
				3210	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3211	}
				3212
				3213	return (len1 < len2) ? -1 : (len1 != len2);
				3214	}
				3215
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3216	#else
				3217
				3218	static int
				3219	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3220	{
				3221	register int len1, len2;
				3222
				3223	Py_UNICODE *s1 = str1->str;
				3224	Py_UNICODE *s2 = str2->str;
				3225
				3226	len1 = str1->length;
				3227	len2 = str2->length;
				3228
				3229	while (len1 > 0 && len2 > 0) {
				3230	register long diff;
				3231
				3232	diff = (long)s1++ - (long)s2++;
				3233	if (diff)
				3234	return (diff < 0) ? -1 : (diff != 0);
				3235	len1--; len2--;
				3236	}
				3237
				3238	return (len1 < len2) ? -1 : (len1 != len2);
				3239	}
				3240
				3241	#endif
				3242
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3243	int PyUnicode_Compare(PyObject *left,
				3244	PyObject *right)
				3245	{
				3246	PyUnicodeObject u = NULL, v = NULL;
				3247	int result;
				3248
				3249	/* Coerce the two arguments */
				3250	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3251	if (u == NULL)
				3252	goto onError;
				3253	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3254	if (v == NULL)
				3255	goto onError;
				3256
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3257	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3258	if (v == u) {
				3259	Py_DECREF(u);
				3260	Py_DECREF(v);
				3261	return 0;
				3262	}
				3263
				3264	result = unicode_compare(u, v);
				3265
				3266	Py_DECREF(u);
				3267	Py_DECREF(v);
				3268	return result;
				3269
				3270	onError:
				3271	Py_XDECREF(u);
				3272	Py_XDECREF(v);
				3273	return -1;
				3274	}
				3275
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3276	int PyUnicode_Contains(PyObject *container,
				3277	PyObject *element)
				3278	{
				3279	PyUnicodeObject u = NULL, v = NULL;
				3280	int result;
				3281	register const Py_UNICODE p, e;
				3282	register Py_UNICODE ch;
				3283
				3284	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3285	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3286	if (v == NULL) {
				3287	PyErr_SetString(PyExc_TypeError,
				3288	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3289	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3290	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3291	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3292	if (u == NULL) {
				3293	Py_DECREF(v);
				3294	goto onError;
				3295	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3296
				3297	/* Check v in u */
				3298	if (PyUnicode_GET_SIZE(v) != 1) {
				3299	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3300	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3301	goto onError;
				3302	}
				3303	ch = *PyUnicode_AS_UNICODE(v);
				3304	p = PyUnicode_AS_UNICODE(u);
				3305	e = p + PyUnicode_GET_SIZE(u);
				3306	result = 0;
				3307	while (p < e) {
				3308	if (*p++ == ch) {
				3309	result = 1;
				3310	break;
				3311	}
				3312	}
				3313
				3314	Py_DECREF(u);
				3315	Py_DECREF(v);
				3316	return result;
				3317
				3318	onError:
				3319	Py_XDECREF(u);
				3320	Py_XDECREF(v);
				3321	return -1;
				3322	}
				3323
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3324	/* Concat to string or Unicode object giving a new Unicode object. */
				3325
				3326	PyObject PyUnicode_Concat(PyObject left,
				3327	PyObject *right)
				3328	{
				3329	PyUnicodeObject u = NULL, v = NULL, *w;
				3330
				3331	/* Coerce the two arguments */
				3332	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3333	if (u == NULL)
				3334	goto onError;
				3335	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3336	if (v == NULL)
				3337	goto onError;
				3338
				3339	/* Shortcuts */
				3340	if (v == unicode_empty) {
				3341	Py_DECREF(v);
				3342	return (PyObject *)u;
				3343	}
				3344	if (u == unicode_empty) {
				3345	Py_DECREF(u);
				3346	return (PyObject *)v;
				3347	}
				3348
				3349	/* Concat the two Unicode strings */
				3350	w = _PyUnicode_New(u->length + v->length);
				3351	if (w == NULL)
				3352	goto onError;
				3353	Py_UNICODE_COPY(w->str, u->str, u->length);
				3354	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3355
				3356	Py_DECREF(u);
				3357	Py_DECREF(v);
				3358	return (PyObject *)w;
				3359
				3360	onError:
				3361	Py_XDECREF(u);
				3362	Py_XDECREF(v);
				3363	return NULL;
				3364	}
				3365
				3366	static char count__doc__[] =
				3367	"S.count(sub[, start[, end]]) -> int\n\
				3368	\n\
				3369	Return the number of occurrences of substring sub in Unicode string\n\
				3370	S[start:end]. Optional arguments start and end are\n\
				3371	interpreted as in slice notation.";
				3372
				3373	static PyObject *
				3374	unicode_count(PyUnicodeObject self, PyObject args)
				3375	{
				3376	PyUnicodeObject *substring;
				3377	int start = 0;
				3378	int end = INT_MAX;
				3379	PyObject *result;
				3380
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3381	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3382	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3383	return NULL;
				3384
				3385	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3386	(PyObject *)substring);
				3387	if (substring == NULL)
				3388	return NULL;
				3389
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3390	if (start < 0)
				3391	start += self->length;
				3392	if (start < 0)
				3393	start = 0;
				3394	if (end > self->length)
				3395	end = self->length;
				3396	if (end < 0)
				3397	end += self->length;
				3398	if (end < 0)
				3399	end = 0;
				3400
				3401	result = PyInt_FromLong((long) count(self, start, end, substring));
				3402
				3403	Py_DECREF(substring);
				3404	return result;
				3405	}
				3406
				3407	static char encode__doc__[] =
				3408	"S.encode([encoding[,errors]]) -> string\n\
				3409	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3410	Return an encoded string version of S. Default encoding is the current\n\
				3411	default string encoding. errors may be given to set a different error\n\
				3412	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3413	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3414
				3415	static PyObject *
				3416	unicode_encode(PyUnicodeObject self, PyObject args)
				3417	{
				3418	char *encoding = NULL;
				3419	char *errors = NULL;
				3420	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3421	return NULL;
				3422	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3423	}
				3424
				3425	static char expandtabs__doc__[] =
				3426	"S.expandtabs([tabsize]) -> unicode\n\
				3427	\n\
				3428	Return a copy of S where all tab characters are expanded using spaces.\n\
				3429	If tabsize is not given, a tab size of 8 characters is assumed.";
				3430
				3431	static PyObject*
				3432	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3433	{
				3434	Py_UNICODE *e;
				3435	Py_UNICODE *p;
				3436	Py_UNICODE *q;
				3437	int i, j;
				3438	PyUnicodeObject *u;
				3439	int tabsize = 8;
				3440
				3441	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3442	return NULL;
				3443
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3444	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3445	i = j = 0;
				3446	e = self->str + self->length;
				3447	for (p = self->str; p < e; p++)
				3448	if (*p == '\t') {
				3449	if (tabsize > 0)
				3450	j += tabsize - (j % tabsize);
				3451	}
				3452	else {
				3453	j++;
				3454	if (p == '\n' \|\| p == '\r') {
				3455	i += j;
				3456	j = 0;
				3457	}
				3458	}
				3459
				3460	/* Second pass: create output string and fill it */
				3461	u = _PyUnicode_New(i + j);
				3462	if (!u)
				3463	return NULL;
				3464
				3465	j = 0;
				3466	q = u->str;
				3467
				3468	for (p = self->str; p < e; p++)
				3469	if (*p == '\t') {
				3470	if (tabsize > 0) {
				3471	i = tabsize - (j % tabsize);
				3472	j += i;
				3473	while (i--)
				3474	*q++ = ' ';
				3475	}
				3476	}
				3477	else {
				3478	j++;
				3479	q++ = p;
				3480	if (p == '\n' \|\| p == '\r')
				3481	j = 0;
				3482	}
				3483
				3484	return (PyObject*) u;
				3485	}
				3486
				3487	static char find__doc__[] =
				3488	"S.find(sub [,start [,end]]) -> int\n\
				3489	\n\
				3490	Return the lowest index in S where substring sub is found,\n\
				3491	such that sub is contained within s[start,end]. Optional\n\
				3492	arguments start and end are interpreted as in slice notation.\n\
				3493	\n\
				3494	Return -1 on failure.";
				3495
				3496	static PyObject *
				3497	unicode_find(PyUnicodeObject self, PyObject args)
				3498	{
				3499	PyUnicodeObject *substring;
				3500	int start = 0;
				3501	int end = INT_MAX;
				3502	PyObject *result;
				3503
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3504	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3505	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3506	return NULL;
				3507	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3508	(PyObject *)substring);
				3509	if (substring == NULL)
				3510	return NULL;
				3511
				3512	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3513
				3514	Py_DECREF(substring);
				3515	return result;
				3516	}
				3517
				3518	static PyObject *
				3519	unicode_getitem(PyUnicodeObject *self, int index)
				3520	{
				3521	if (index < 0 \|\| index >= self->length) {
				3522	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3523	return NULL;
				3524	}
				3525
				3526	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3527	}
				3528
				3529	static long
				3530	unicode_hash(PyUnicodeObject *self)
				3531	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3532	/* Since Unicode objects compare equal to their ASCII string
				3533	counterparts, they should use the individual character values
				3534	as basis for their hash value. This is needed to assure that
				3535	strings and Unicode objects behave in the same way as
				3536	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3537
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3538	register int len;
				3539	register Py_UNICODE *p;
				3540	register long x;
				3541
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3542	if (self->hash != -1)
				3543	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3544	len = PyUnicode_GET_SIZE(self);
				3545	p = PyUnicode_AS_UNICODE(self);
				3546	x = *p << 7;
				3547	while (--len >= 0)
				3548	x = (1000003x) ^ p++;
				3549	x ^= PyUnicode_GET_SIZE(self);
				3550	if (x == -1)
				3551	x = -2;
				3552	self->hash = x;
				3553	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3554	}
				3555
				3556	static char index__doc__[] =
				3557	"S.index(sub [,start [,end]]) -> int\n\
				3558	\n\
				3559	Like S.find() but raise ValueError when the substring is not found.";
				3560
				3561	static PyObject *
				3562	unicode_index(PyUnicodeObject self, PyObject args)
				3563	{
				3564	int result;
				3565	PyUnicodeObject *substring;
				3566	int start = 0;
				3567	int end = INT_MAX;
				3568
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3569	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3570	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3571	return NULL;
				3572
				3573	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3574	(PyObject *)substring);
				3575	if (substring == NULL)
				3576	return NULL;
				3577
				3578	result = findstring(self, substring, start, end, 1);
				3579
				3580	Py_DECREF(substring);
				3581	if (result < 0) {
				3582	PyErr_SetString(PyExc_ValueError, "substring not found");
				3583	return NULL;
				3584	}
				3585	return PyInt_FromLong(result);
				3586	}
				3587
				3588	static char islower__doc__[] =
				3589	"S.islower() -> int\n\
				3590	\n\
				3591	Return 1 if all cased characters in S are lowercase and there is\n\
				3592	at least one cased character in S, 0 otherwise.";
				3593
				3594	static PyObject*
				3595	unicode_islower(PyUnicodeObject self, PyObject args)
				3596	{
				3597	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3598	register const Py_UNICODE *e;
				3599	int cased;
				3600
				3601	if (!PyArg_NoArgs(args))
				3602	return NULL;
				3603
				3604	/* Shortcut for single character strings */
				3605	if (PyUnicode_GET_SIZE(self) == 1)
				3606	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3607
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3608	/* Special case for empty strings */
				3609	if (PyString_GET_SIZE(self) == 0)
				3610	return PyInt_FromLong(0);
				3611
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3612	e = p + PyUnicode_GET_SIZE(self);
				3613	cased = 0;
				3614	for (; p < e; p++) {
				3615	register const Py_UNICODE ch = *p;
				3616
				3617	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3618	return PyInt_FromLong(0);
				3619	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3620	cased = 1;
				3621	}
				3622	return PyInt_FromLong(cased);
				3623	}
				3624
				3625	static char isupper__doc__[] =
				3626	"S.isupper() -> int\n\
				3627	\n\
				3628	Return 1 if all cased characters in S are uppercase and there is\n\
				3629	at least one cased character in S, 0 otherwise.";
				3630
				3631	static PyObject*
				3632	unicode_isupper(PyUnicodeObject self, PyObject args)
				3633	{
				3634	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3635	register const Py_UNICODE *e;
				3636	int cased;
				3637
				3638	if (!PyArg_NoArgs(args))
				3639	return NULL;
				3640
				3641	/* Shortcut for single character strings */
				3642	if (PyUnicode_GET_SIZE(self) == 1)
				3643	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3644
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3645	/* Special case for empty strings */
				3646	if (PyString_GET_SIZE(self) == 0)
				3647	return PyInt_FromLong(0);
				3648
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3649	e = p + PyUnicode_GET_SIZE(self);
				3650	cased = 0;
				3651	for (; p < e; p++) {
				3652	register const Py_UNICODE ch = *p;
				3653
				3654	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3655	return PyInt_FromLong(0);
				3656	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3657	cased = 1;
				3658	}
				3659	return PyInt_FromLong(cased);
				3660	}
				3661
				3662	static char istitle__doc__[] =
				3663	"S.istitle() -> int\n\
				3664	\n\
				3665	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3666	may only follow uncased characters and lowercase characters only cased\n\
				3667	ones. Return 0 otherwise.";
				3668
				3669	static PyObject*
				3670	unicode_istitle(PyUnicodeObject self, PyObject args)
				3671	{
				3672	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3673	register const Py_UNICODE *e;
				3674	int cased, previous_is_cased;
				3675
				3676	if (!PyArg_NoArgs(args))
				3677	return NULL;
				3678
				3679	/* Shortcut for single character strings */
				3680	if (PyUnicode_GET_SIZE(self) == 1)
				3681	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3682	(Py_UNICODE_ISUPPER(*p) != 0));
				3683
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3684	/* Special case for empty strings */
				3685	if (PyString_GET_SIZE(self) == 0)
				3686	return PyInt_FromLong(0);
				3687
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3688	e = p + PyUnicode_GET_SIZE(self);
				3689	cased = 0;
				3690	previous_is_cased = 0;
				3691	for (; p < e; p++) {
				3692	register const Py_UNICODE ch = *p;
				3693
				3694	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3695	if (previous_is_cased)
				3696	return PyInt_FromLong(0);
				3697	previous_is_cased = 1;
				3698	cased = 1;
				3699	}
				3700	else if (Py_UNICODE_ISLOWER(ch)) {
				3701	if (!previous_is_cased)
				3702	return PyInt_FromLong(0);
				3703	previous_is_cased = 1;
				3704	cased = 1;
				3705	}
				3706	else
				3707	previous_is_cased = 0;
				3708	}
				3709	return PyInt_FromLong(cased);
				3710	}
				3711
				3712	static char isspace__doc__[] =
				3713	"S.isspace() -> int\n\
				3714	\n\
				3715	Return 1 if there are only whitespace characters in S,\n\
				3716	0 otherwise.";
				3717
				3718	static PyObject*
				3719	unicode_isspace(PyUnicodeObject self, PyObject args)
				3720	{
				3721	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3722	register const Py_UNICODE *e;
				3723
				3724	if (!PyArg_NoArgs(args))
				3725	return NULL;
				3726
				3727	/* Shortcut for single character strings */
				3728	if (PyUnicode_GET_SIZE(self) == 1 &&
				3729	Py_UNICODE_ISSPACE(*p))
				3730	return PyInt_FromLong(1);
				3731
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3732	/* Special case for empty strings */
				3733	if (PyString_GET_SIZE(self) == 0)
				3734	return PyInt_FromLong(0);
				3735
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3736	e = p + PyUnicode_GET_SIZE(self);
				3737	for (; p < e; p++) {
				3738	if (!Py_UNICODE_ISSPACE(*p))
				3739	return PyInt_FromLong(0);
				3740	}
				3741	return PyInt_FromLong(1);
				3742	}
				3743
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3744	static char isalpha__doc__[] =
				3745	"S.isalpha() -> int\n\
				3746	\n\
				3747	Return 1 if all characters in S are alphabetic\n\
				3748	and there is at least one character in S, 0 otherwise.";
				3749
				3750	static PyObject*
				3751	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3752	{
				3753	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3754	register const Py_UNICODE *e;
				3755
				3756	if (!PyArg_NoArgs(args))
				3757	return NULL;
				3758
				3759	/* Shortcut for single character strings */
				3760	if (PyUnicode_GET_SIZE(self) == 1 &&
				3761	Py_UNICODE_ISALPHA(*p))
				3762	return PyInt_FromLong(1);
				3763
				3764	/* Special case for empty strings */
				3765	if (PyString_GET_SIZE(self) == 0)
				3766	return PyInt_FromLong(0);
				3767
				3768	e = p + PyUnicode_GET_SIZE(self);
				3769	for (; p < e; p++) {
				3770	if (!Py_UNICODE_ISALPHA(*p))
				3771	return PyInt_FromLong(0);
				3772	}
				3773	return PyInt_FromLong(1);
				3774	}
				3775
				3776	static char isalnum__doc__[] =
				3777	"S.isalnum() -> int\n\
				3778	\n\
				3779	Return 1 if all characters in S are alphanumeric\n\
				3780	and there is at least one character in S, 0 otherwise.";
				3781
				3782	static PyObject*
				3783	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3784	{
				3785	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3786	register const Py_UNICODE *e;
				3787
				3788	if (!PyArg_NoArgs(args))
				3789	return NULL;
				3790
				3791	/* Shortcut for single character strings */
				3792	if (PyUnicode_GET_SIZE(self) == 1 &&
				3793	Py_UNICODE_ISALNUM(*p))
				3794	return PyInt_FromLong(1);
				3795
				3796	/* Special case for empty strings */
				3797	if (PyString_GET_SIZE(self) == 0)
				3798	return PyInt_FromLong(0);
				3799
				3800	e = p + PyUnicode_GET_SIZE(self);
				3801	for (; p < e; p++) {
				3802	if (!Py_UNICODE_ISALNUM(*p))
				3803	return PyInt_FromLong(0);
				3804	}
				3805	return PyInt_FromLong(1);
				3806	}
				3807
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3808	static char isdecimal__doc__[] =
				3809	"S.isdecimal() -> int\n\
				3810	\n\
				3811	Return 1 if there are only decimal characters in S,\n\
				3812	0 otherwise.";
				3813
				3814	static PyObject*
				3815	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3816	{
				3817	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3818	register const Py_UNICODE *e;
				3819
				3820	if (!PyArg_NoArgs(args))
				3821	return NULL;
				3822
				3823	/* Shortcut for single character strings */
				3824	if (PyUnicode_GET_SIZE(self) == 1 &&
				3825	Py_UNICODE_ISDECIMAL(*p))
				3826	return PyInt_FromLong(1);
				3827
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3828	/* Special case for empty strings */
				3829	if (PyString_GET_SIZE(self) == 0)
				3830	return PyInt_FromLong(0);
				3831
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3832	e = p + PyUnicode_GET_SIZE(self);
				3833	for (; p < e; p++) {
				3834	if (!Py_UNICODE_ISDECIMAL(*p))
				3835	return PyInt_FromLong(0);
				3836	}
				3837	return PyInt_FromLong(1);
				3838	}
				3839
				3840	static char isdigit__doc__[] =
				3841	"S.isdigit() -> int\n\
				3842	\n\
				3843	Return 1 if there are only digit characters in S,\n\
				3844	0 otherwise.";
				3845
				3846	static PyObject*
				3847	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3848	{
				3849	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3850	register const Py_UNICODE *e;
				3851
				3852	if (!PyArg_NoArgs(args))
				3853	return NULL;
				3854
				3855	/* Shortcut for single character strings */
				3856	if (PyUnicode_GET_SIZE(self) == 1 &&
				3857	Py_UNICODE_ISDIGIT(*p))
				3858	return PyInt_FromLong(1);
				3859
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3860	/* Special case for empty strings */
				3861	if (PyString_GET_SIZE(self) == 0)
				3862	return PyInt_FromLong(0);
				3863
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3864	e = p + PyUnicode_GET_SIZE(self);
				3865	for (; p < e; p++) {
				3866	if (!Py_UNICODE_ISDIGIT(*p))
				3867	return PyInt_FromLong(0);
				3868	}
				3869	return PyInt_FromLong(1);
				3870	}
				3871
				3872	static char isnumeric__doc__[] =
				3873	"S.isnumeric() -> int\n\
				3874	\n\
				3875	Return 1 if there are only numeric characters in S,\n\
				3876	0 otherwise.";
				3877
				3878	static PyObject*
				3879	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3880	{
				3881	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3882	register const Py_UNICODE *e;
				3883
				3884	if (!PyArg_NoArgs(args))
				3885	return NULL;
				3886
				3887	/* Shortcut for single character strings */
				3888	if (PyUnicode_GET_SIZE(self) == 1 &&
				3889	Py_UNICODE_ISNUMERIC(*p))
				3890	return PyInt_FromLong(1);
				3891
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3892	/* Special case for empty strings */
				3893	if (PyString_GET_SIZE(self) == 0)
				3894	return PyInt_FromLong(0);
				3895
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3896	e = p + PyUnicode_GET_SIZE(self);
				3897	for (; p < e; p++) {
				3898	if (!Py_UNICODE_ISNUMERIC(*p))
				3899	return PyInt_FromLong(0);
				3900	}
				3901	return PyInt_FromLong(1);
				3902	}
				3903
				3904	static char join__doc__[] =
				3905	"S.join(sequence) -> unicode\n\
				3906	\n\
				3907	Return a string which is the concatenation of the strings in the\n\
				3908	sequence. The separator between elements is S.";
				3909
				3910	static PyObject*
				3911	unicode_join(PyUnicodeObject self, PyObject args)
				3912	{
				3913	PyObject *data;
				3914	if (!PyArg_ParseTuple(args, "O:join", &data))
				3915	return NULL;
				3916
				3917	return PyUnicode_Join((PyObject *)self, data);
				3918	}
				3919
				3920	static int
				3921	unicode_length(PyUnicodeObject *self)
				3922	{
				3923	return self->length;
				3924	}
				3925
				3926	static char ljust__doc__[] =
				3927	"S.ljust(width) -> unicode\n\
				3928	\n\
				3929	Return S left justified in a Unicode string of length width. Padding is\n\
				3930	done using spaces.";
				3931
				3932	static PyObject *
				3933	unicode_ljust(PyUnicodeObject self, PyObject args)
				3934	{
				3935	int width;
				3936	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3937	return NULL;
				3938
				3939	if (self->length >= width) {
				3940	Py_INCREF(self);
				3941	return (PyObject*) self;
				3942	}
				3943
				3944	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3945	}
				3946
				3947	static char lower__doc__[] =
				3948	"S.lower() -> unicode\n\
				3949	\n\
				3950	Return a copy of the string S converted to lowercase.";
				3951
				3952	static PyObject*
				3953	unicode_lower(PyUnicodeObject self, PyObject args)
				3954	{
				3955	if (!PyArg_NoArgs(args))
				3956	return NULL;
				3957	return fixup(self, fixlower);
				3958	}
				3959
				3960	static char lstrip__doc__[] =
				3961	"S.lstrip() -> unicode\n\
				3962	\n\
				3963	Return a copy of the string S with leading whitespace removed.";
				3964
				3965	static PyObject *
				3966	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3967	{
				3968	if (!PyArg_NoArgs(args))
				3969	return NULL;
				3970	return strip(self, 1, 0);
				3971	}
				3972
				3973	static PyObject*
				3974	unicode_repeat(PyUnicodeObject *str, int len)
				3975	{
				3976	PyUnicodeObject *u;
				3977	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3978	int nchars;
				3979	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3980
				3981	if (len < 0)
				3982	len = 0;
				3983
				3984	if (len == 1) {
				3985	/* no repeat, return original string */
				3986	Py_INCREF(str);
				3987	return (PyObject*) str;
				3988	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3989
				3990	/* ensure # of chars needed doesn't overflow int and # of bytes
				3991	* needed doesn't overflow size_t
				3992	*/
				3993	nchars = len * str->length;
				3994	if (len && nchars / len != str->length) {
				3995	PyErr_SetString(PyExc_OverflowError,
				3996	"repeated string is too long");
				3997	return NULL;
				3998	}
				3999	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4000	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4001	PyErr_SetString(PyExc_OverflowError,
				4002	"repeated string is too long");
				4003	return NULL;
				4004	}
				4005	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4006	if (!u)
				4007	return NULL;
				4008
				4009	p = u->str;
				4010
				4011	while (len-- > 0) {
				4012	Py_UNICODE_COPY(p, str->str, str->length);
				4013	p += str->length;
				4014	}
				4015
				4016	return (PyObject*) u;
				4017	}
				4018
				4019	PyObject PyUnicode_Replace(PyObject obj,
				4020	PyObject *subobj,
				4021	PyObject *replobj,
				4022	int maxcount)
				4023	{
				4024	PyObject *self;
				4025	PyObject *str1;
				4026	PyObject *str2;
				4027	PyObject *result;
				4028
				4029	self = PyUnicode_FromObject(obj);
				4030	if (self == NULL)
				4031	return NULL;
				4032	str1 = PyUnicode_FromObject(subobj);
				4033	if (str1 == NULL) {
				4034	Py_DECREF(self);
				4035	return NULL;
				4036	}
				4037	str2 = PyUnicode_FromObject(replobj);
				4038	if (str2 == NULL) {
				4039	Py_DECREF(self);
				4040	Py_DECREF(str1);
				4041	return NULL;
				4042	}
				4043	result = replace((PyUnicodeObject *)self,
				4044	(PyUnicodeObject *)str1,
				4045	(PyUnicodeObject *)str2,
				4046	maxcount);
				4047	Py_DECREF(self);
				4048	Py_DECREF(str1);
				4049	Py_DECREF(str2);
				4050	return result;
				4051	}
				4052
				4053	static char replace__doc__[] =
				4054	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4055	\n\
				4056	Return a copy of S with all occurrences of substring\n\
				4057	old replaced by new. If the optional argument maxsplit is\n\
				4058	given, only the first maxsplit occurrences are replaced.";
				4059
				4060	static PyObject*
				4061	unicode_replace(PyUnicodeObject self, PyObject args)
				4062	{
				4063	PyUnicodeObject *str1;
				4064	PyUnicodeObject *str2;
				4065	int maxcount = -1;
				4066	PyObject *result;
				4067
				4068	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4069	return NULL;
				4070	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4071	if (str1 == NULL)
				4072	return NULL;
				4073	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4074	if (str2 == NULL)
				4075	return NULL;
				4076
				4077	result = replace(self, str1, str2, maxcount);
				4078
				4079	Py_DECREF(str1);
				4080	Py_DECREF(str2);
				4081	return result;
				4082	}
				4083
				4084	static
				4085	PyObject unicode_repr(PyObject unicode)
				4086	{
				4087	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4088	PyUnicode_GET_SIZE(unicode),
				4089	1);
				4090	}
				4091
				4092	static char rfind__doc__[] =
				4093	"S.rfind(sub [,start [,end]]) -> int\n\
				4094	\n\
				4095	Return the highest index in S where substring sub is found,\n\
				4096	such that sub is contained within s[start,end]. Optional\n\
				4097	arguments start and end are interpreted as in slice notation.\n\
				4098	\n\
				4099	Return -1 on failure.";
				4100
				4101	static PyObject *
				4102	unicode_rfind(PyUnicodeObject self, PyObject args)
				4103	{
				4104	PyUnicodeObject *substring;
				4105	int start = 0;
				4106	int end = INT_MAX;
				4107	PyObject *result;
				4108
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4109	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4110	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4111	return NULL;
				4112	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4113	(PyObject *)substring);
				4114	if (substring == NULL)
				4115	return NULL;
				4116
				4117	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4118
				4119	Py_DECREF(substring);
				4120	return result;
				4121	}
				4122
				4123	static char rindex__doc__[] =
				4124	"S.rindex(sub [,start [,end]]) -> int\n\
				4125	\n\
				4126	Like S.rfind() but raise ValueError when the substring is not found.";
				4127
				4128	static PyObject *
				4129	unicode_rindex(PyUnicodeObject self, PyObject args)
				4130	{
				4131	int result;
				4132	PyUnicodeObject *substring;
				4133	int start = 0;
				4134	int end = INT_MAX;
				4135
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4136	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4137	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4138	return NULL;
				4139	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4140	(PyObject *)substring);
				4141	if (substring == NULL)
				4142	return NULL;
				4143
				4144	result = findstring(self, substring, start, end, -1);
				4145
				4146	Py_DECREF(substring);
				4147	if (result < 0) {
				4148	PyErr_SetString(PyExc_ValueError, "substring not found");
				4149	return NULL;
				4150	}
				4151	return PyInt_FromLong(result);
				4152	}
				4153
				4154	static char rjust__doc__[] =
				4155	"S.rjust(width) -> unicode\n\
				4156	\n\
				4157	Return S right justified in a Unicode string of length width. Padding is\n\
				4158	done using spaces.";
				4159
				4160	static PyObject *
				4161	unicode_rjust(PyUnicodeObject self, PyObject args)
				4162	{
				4163	int width;
				4164	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4165	return NULL;
				4166
				4167	if (self->length >= width) {
				4168	Py_INCREF(self);
				4169	return (PyObject*) self;
				4170	}
				4171
				4172	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4173	}
				4174
				4175	static char rstrip__doc__[] =
				4176	"S.rstrip() -> unicode\n\
				4177	\n\
				4178	Return a copy of the string S with trailing whitespace removed.";
				4179
				4180	static PyObject *
				4181	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4182	{
				4183	if (!PyArg_NoArgs(args))
				4184	return NULL;
				4185	return strip(self, 0, 1);
				4186	}
				4187
				4188	static PyObject*
				4189	unicode_slice(PyUnicodeObject *self, int start, int end)
				4190	{
				4191	/* standard clamping */
				4192	if (start < 0)
				4193	start = 0;
				4194	if (end < 0)
				4195	end = 0;
				4196	if (end > self->length)
				4197	end = self->length;
				4198	if (start == 0 && end == self->length) {
				4199	/* full slice, return original string */
				4200	Py_INCREF(self);
				4201	return (PyObject*) self;
				4202	}
				4203	if (start > end)
				4204	start = end;
				4205	/* copy slice */
				4206	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4207	end - start);
				4208	}
				4209
				4210	PyObject PyUnicode_Split(PyObject s,
				4211	PyObject *sep,
				4212	int maxsplit)
				4213	{
				4214	PyObject *result;
				4215
				4216	s = PyUnicode_FromObject(s);
				4217	if (s == NULL)
				4218	return NULL;
				4219	if (sep != NULL) {
				4220	sep = PyUnicode_FromObject(sep);
				4221	if (sep == NULL) {
				4222	Py_DECREF(s);
				4223	return NULL;
				4224	}
				4225	}
				4226
				4227	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4228
				4229	Py_DECREF(s);
				4230	Py_XDECREF(sep);
				4231	return result;
				4232	}
				4233
				4234	static char split__doc__[] =
				4235	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4236	\n\
				4237	Return a list of the words in S, using sep as the\n\
				4238	delimiter string. If maxsplit is given, at most maxsplit\n\
				4239	splits are done. If sep is not specified, any whitespace string\n\
				4240	is a separator.";
				4241
				4242	static PyObject*
				4243	unicode_split(PyUnicodeObject self, PyObject args)
				4244	{
				4245	PyObject *substring = Py_None;
				4246	int maxcount = -1;
				4247
				4248	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4249	return NULL;
				4250
				4251	if (substring == Py_None)
				4252	return split(self, NULL, maxcount);
				4253	else if (PyUnicode_Check(substring))
				4254	return split(self, (PyUnicodeObject *)substring, maxcount);
				4255	else
				4256	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4257	}
				4258
				4259	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4260	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4261	\n\
				4262	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4263	Line breaks are not included in the resulting list unless keepends\n\
				4264	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4265
				4266	static PyObject*
				4267	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4268	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4269	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4270
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4271	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4272	return NULL;
				4273
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4274	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4275	}
				4276
				4277	static
				4278	PyObject unicode_str(PyUnicodeObject self)
				4279	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4280	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4281	}
				4282
				4283	static char strip__doc__[] =
				4284	"S.strip() -> unicode\n\
				4285	\n\
				4286	Return a copy of S with leading and trailing whitespace removed.";
				4287
				4288	static PyObject *
				4289	unicode_strip(PyUnicodeObject self, PyObject args)
				4290	{
				4291	if (!PyArg_NoArgs(args))
				4292	return NULL;
				4293	return strip(self, 1, 1);
				4294	}
				4295
				4296	static char swapcase__doc__[] =
				4297	"S.swapcase() -> unicode\n\
				4298	\n\
				4299	Return a copy of S with uppercase characters converted to lowercase\n\
				4300	and vice versa.";
				4301
				4302	static PyObject*
				4303	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4304	{
				4305	if (!PyArg_NoArgs(args))
				4306	return NULL;
				4307	return fixup(self, fixswapcase);
				4308	}
				4309
				4310	static char translate__doc__[] =
				4311	"S.translate(table) -> unicode\n\
				4312	\n\
				4313	Return a copy of the string S, where all characters have been mapped\n\
				4314	through the given translation table, which must be a mapping of\n\
				4315	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4316	are left untouched. Characters mapped to None are deleted.";
				4317
				4318	static PyObject*
				4319	unicode_translate(PyUnicodeObject self, PyObject args)
				4320	{
				4321	PyObject *table;
				4322
				4323	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4324	return NULL;
				4325	return PyUnicode_TranslateCharmap(self->str,
				4326	self->length,
				4327	table,
				4328	"ignore");
				4329	}
				4330
				4331	static char upper__doc__[] =
				4332	"S.upper() -> unicode\n\
				4333	\n\
				4334	Return a copy of S converted to uppercase.";
				4335
				4336	static PyObject*
				4337	unicode_upper(PyUnicodeObject self, PyObject args)
				4338	{
				4339	if (!PyArg_NoArgs(args))
				4340	return NULL;
				4341	return fixup(self, fixupper);
				4342	}
				4343
				4344	#if 0
				4345	static char zfill__doc__[] =
				4346	"S.zfill(width) -> unicode\n\
				4347	\n\
				4348	Pad a numeric string x with zeros on the left, to fill a field\n\
				4349	of the specified width. The string x is never truncated.";
				4350
				4351	static PyObject *
				4352	unicode_zfill(PyUnicodeObject self, PyObject args)
				4353	{
				4354	int fill;
				4355	PyUnicodeObject *u;
				4356
				4357	int width;
				4358	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4359	return NULL;
				4360
				4361	if (self->length >= width) {
				4362	Py_INCREF(self);
				4363	return (PyObject*) self;
				4364	}
				4365
				4366	fill = width - self->length;
				4367
				4368	u = pad(self, fill, 0, '0');
				4369
				4370	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4371	/* move sign to beginning of string */
				4372	u->str[0] = u->str[fill];
				4373	u->str[fill] = '0';
				4374	}
				4375
				4376	return (PyObject*) u;
				4377	}
				4378	#endif
				4379
				4380	#if 0
				4381	static PyObject*
				4382	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4383	{
				4384	if (!PyArg_NoArgs(args))
				4385	return NULL;
				4386	return PyInt_FromLong(unicode_freelist_size);
				4387	}
				4388	#endif
				4389
				4390	static char startswith__doc__[] =
				4391	"S.startswith(prefix[, start[, end]]) -> int\n\
				4392	\n\
				4393	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4394	optional start, test S beginning at that position. With optional end, stop\n\
				4395	comparing S at that position.";
				4396
				4397	static PyObject *
				4398	unicode_startswith(PyUnicodeObject *self,
				4399	PyObject *args)
				4400	{
				4401	PyUnicodeObject *substring;
				4402	int start = 0;
				4403	int end = INT_MAX;
				4404	PyObject *result;
				4405
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4406	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4407	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4408	return NULL;
				4409	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4410	(PyObject *)substring);
				4411	if (substring == NULL)
				4412	return NULL;
				4413
				4414	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4415
				4416	Py_DECREF(substring);
				4417	return result;
				4418	}
				4419
				4420
				4421	static char endswith__doc__[] =
				4422	"S.endswith(suffix[, start[, end]]) -> int\n\
				4423	\n\
				4424	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4425	optional start, test S beginning at that position. With optional end, stop\n\
				4426	comparing S at that position.";
				4427
				4428	static PyObject *
				4429	unicode_endswith(PyUnicodeObject *self,
				4430	PyObject *args)
				4431	{
				4432	PyUnicodeObject *substring;
				4433	int start = 0;
				4434	int end = INT_MAX;
				4435	PyObject *result;
				4436
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4437	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4438	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4439	return NULL;
				4440	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4441	(PyObject *)substring);
				4442	if (substring == NULL)
				4443	return NULL;
				4444
				4445	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4446
				4447	Py_DECREF(substring);
				4448	return result;
				4449	}
				4450
				4451
				4452	static PyMethodDef unicode_methods[] = {
				4453
				4454	/* Order is according to common usage: often used methods should
				4455	appear first, since lookup is done sequentially. */
				4456
				4457	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4458	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4459	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4460	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4461	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4462	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4463	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4464	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4465	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4466	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4467	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4468	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4469	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4470	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4471	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4472	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4473	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4474	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4475	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4476	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4477	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4478	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4479	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4480	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4481	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4482	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4483	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4484	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4485	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4486	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4487	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4488	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4489	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4490	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4491	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4492	#if 0
				4493	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4494	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4495	#endif
				4496
				4497	#if 0
				4498	/* This one is just used for debugging the implementation. */
				4499	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4500	#endif
				4501
				4502	{NULL, NULL}
				4503	};
				4504
				4505	static PyObject *
				4506	unicode_getattr(PyUnicodeObject self, char name)
				4507	{
				4508	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4509	}
				4510
				4511	static PySequenceMethods unicode_as_sequence = {
				4512	(inquiry) unicode_length, /* sq_length */
				4513	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4514	(intargfunc) unicode_repeat, /* sq_repeat */
				4515	(intargfunc) unicode_getitem, /* sq_item */
				4516	(intintargfunc) unicode_slice, /* sq_slice */
				4517	0, /* sq_ass_item */
				4518	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4519	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4520	};
				4521
				4522	static int
				4523	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4524	int index,
				4525	const void **ptr)
				4526	{
				4527	if (index != 0) {
				4528	PyErr_SetString(PyExc_SystemError,
				4529	"accessing non-existent unicode segment");
				4530	return -1;
				4531	}
				4532	ptr = (void ) self->str;
				4533	return PyUnicode_GET_DATA_SIZE(self);
				4534	}
				4535
				4536	static int
				4537	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4538	const void **ptr)
				4539	{
				4540	PyErr_SetString(PyExc_TypeError,
				4541	"cannot use unicode as modifyable buffer");
				4542	return -1;
				4543	}
				4544
				4545	static int
				4546	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4547	int *lenp)
				4548	{
				4549	if (lenp)
				4550	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4551	return 1;
				4552	}
				4553
				4554	static int
				4555	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4556	int index,
				4557	const void **ptr)
				4558	{
				4559	PyObject *str;
				4560
				4561	if (index != 0) {
				4562	PyErr_SetString(PyExc_SystemError,
				4563	"accessing non-existent unicode segment");
				4564	return -1;
				4565	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4566	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4567	if (str == NULL)
				4568	return -1;
				4569	ptr = (void ) PyString_AS_STRING(str);
				4570	return PyString_GET_SIZE(str);
				4571	}
				4572
				4573	/* Helpers for PyUnicode_Format() */
				4574
				4575	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4576	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4577	{
				4578	int argidx = *p_argidx;
				4579	if (argidx < arglen) {
				4580	(*p_argidx)++;
				4581	if (arglen < 0)
				4582	return args;
				4583	else
				4584	return PyTuple_GetItem(args, argidx);
				4585	}
				4586	PyErr_SetString(PyExc_TypeError,
				4587	"not enough arguments for format string");
				4588	return NULL;
				4589	}
				4590
				4591	#define F_LJUST (1<<0)
				4592	#define F_SIGN (1<<1)
				4593	#define F_BLANK (1<<2)
				4594	#define F_ALT (1<<3)
				4595	#define F_ZERO (1<<4)
				4596
				4597	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4598	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4599	{
				4600	register int i;
				4601	int len;
				4602	va_list va;
				4603	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4604	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4605
				4606	/* First, format the string as char array, then expand to Py_UNICODE
				4607	array. */
				4608	charbuffer = (char *)buffer;
				4609	len = vsprintf(charbuffer, format, va);
				4610	for (i = len - 1; i >= 0; i--)
				4611	buffer[i] = (Py_UNICODE) charbuffer[i];
				4612
				4613	va_end(va);
				4614	return len;
				4615	}
				4616
				4617	static int
				4618	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4619	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4620	int flags,
				4621	int prec,
				4622	int type,
				4623	PyObject *v)
				4624	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4625	/* fmt = '%#.' + `prec` + `type`
				4626	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4627	char fmt[20];
				4628	double x;
				4629
				4630	x = PyFloat_AsDouble(v);
				4631	if (x == -1.0 && PyErr_Occurred())
				4632	return -1;
				4633	if (prec < 0)
				4634	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4635	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4636	type = 'g';
				4637	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4638	/* worst case length calc to ensure no buffer overrun:
				4639	fmt = %#.<prec>g
				4640	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4641	for any double rep.)
				4642	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4643	If prec=0 the effective precision is 1 (the leading digit is
				4644	always given), therefore increase by one to 10+prec. */
				4645	if (buflen <= (size_t)10 + (size_t)prec) {
				4646	PyErr_SetString(PyExc_OverflowError,
				4647	"formatted float is too long (precision too long?)");
				4648	return -1;
				4649	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4650	return usprintf(buf, fmt, x);
				4651	}
				4652
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4653	static PyObject*
				4654	formatlong(PyObject *val, int flags, int prec, int type)
				4655	{
				4656	char *buf;
				4657	int i, len;
				4658	PyObject str; / temporary string object. */
				4659	PyUnicodeObject *result;
				4660
				4661	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4662	if (!str)
				4663	return NULL;
				4664	result = _PyUnicode_New(len);
				4665	for (i = 0; i < len; i++)
				4666	result->str[i] = buf[i];
				4667	result->str[len] = 0;
				4668	Py_DECREF(str);
				4669	return (PyObject*)result;
				4670	}
				4671
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4672	static int
				4673	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4674	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4675	int flags,
				4676	int prec,
				4677	int type,
				4678	PyObject *v)
				4679	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4680	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4681	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4682	+ 1 + 1 = 24*/
				4683	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4684	long x;
				4685
				4686	x = PyInt_AsLong(v);
				4687	if (x == -1 && PyErr_Occurred())
				4688	return -1;
				4689	if (prec < 0)
				4690	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4691	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4692	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4693	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4694	PyErr_SetString(PyExc_OverflowError,
				4695	"formatted integer is too long (precision too long?)");
				4696	return -1;
				4697	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4698	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4699	return usprintf(buf, fmt, x);
				4700	}
				4701
				4702	static int
				4703	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4704	size_t buflen,
				4705	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4706	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4707	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4708	if (PyUnicode_Check(v)) {
				4709	if (PyUnicode_GET_SIZE(v) != 1)
				4710	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4711	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4712	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4713
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4714	else if (PyString_Check(v)) {
				4715	if (PyString_GET_SIZE(v) != 1)
				4716	goto onError;
				4717	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4718	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4719
				4720	else {
				4721	/* Integer input truncated to a character */
				4722	long x;
				4723	x = PyInt_AsLong(v);
				4724	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4725	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4726	buf[0] = (char) x;
				4727	}
				4728	buf[1] = '\0';
				4729	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4730
				4731	onError:
				4732	PyErr_SetString(PyExc_TypeError,
				4733	"%c requires int or char");
				4734	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4735	}
				4736
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
				4746
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4747	PyObject PyUnicode_Format(PyObject format,
				4748	PyObject *args)
				4749	{
				4750	Py_UNICODE fmt, res;
				4751	int fmtcnt, rescnt, reslen, arglen, argidx;
				4752	int args_owned = 0;
				4753	PyUnicodeObject *result = NULL;
				4754	PyObject *dict = NULL;
				4755	PyObject *uformat;
				4756
				4757	if (format == NULL \|\| args == NULL) {
				4758	PyErr_BadInternalCall();
				4759	return NULL;
				4760	}
				4761	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4762	if (uformat == NULL)
				4763	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4764	fmt = PyUnicode_AS_UNICODE(uformat);
				4765	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4766
				4767	reslen = rescnt = fmtcnt + 100;
				4768	result = _PyUnicode_New(reslen);
				4769	if (result == NULL)
				4770	goto onError;
				4771	res = PyUnicode_AS_UNICODE(result);
				4772
				4773	if (PyTuple_Check(args)) {
				4774	arglen = PyTuple_Size(args);
				4775	argidx = 0;
				4776	}
				4777	else {
				4778	arglen = -1;
				4779	argidx = -2;
				4780	}
				4781	if (args->ob_type->tp_as_mapping)
				4782	dict = args;
				4783
				4784	while (--fmtcnt >= 0) {
				4785	if (*fmt != '%') {
				4786	if (--rescnt < 0) {
				4787	rescnt = fmtcnt + 100;
				4788	reslen += rescnt;
				4789	if (_PyUnicode_Resize(result, reslen) < 0)
				4790	return NULL;
				4791	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4792	--rescnt;
				4793	}
				4794	res++ = fmt++;
				4795	}
				4796	else {
				4797	/* Got a format specifier */
				4798	int flags = 0;
				4799	int width = -1;
				4800	int prec = -1;
				4801	int size = 0;
				4802	Py_UNICODE c = '\0';
				4803	Py_UNICODE fill;
				4804	PyObject *v = NULL;
				4805	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4806	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4807	Py_UNICODE sign;
				4808	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4809	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4810
				4811	fmt++;
				4812	if (*fmt == '(') {
				4813	Py_UNICODE *keystart;
				4814	int keylen;
				4815	PyObject *key;
				4816	int pcount = 1;
				4817
				4818	if (dict == NULL) {
				4819	PyErr_SetString(PyExc_TypeError,
				4820	"format requires a mapping");
				4821	goto onError;
				4822	}
				4823	++fmt;
				4824	--fmtcnt;
				4825	keystart = fmt;
				4826	/* Skip over balanced parentheses */
				4827	while (pcount > 0 && --fmtcnt >= 0) {
				4828	if (*fmt == ')')
				4829	--pcount;
				4830	else if (*fmt == '(')
				4831	++pcount;
				4832	fmt++;
				4833	}
				4834	keylen = fmt - keystart - 1;
				4835	if (fmtcnt < 0 \|\| pcount > 0) {
				4836	PyErr_SetString(PyExc_ValueError,
				4837	"incomplete format key");
				4838	goto onError;
				4839	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4840	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4841	then looked up since Python uses strings to hold
				4842	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4843	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4844	key = PyUnicode_EncodeUTF8(keystart,
				4845	keylen,
				4846	NULL);
				4847	if (key == NULL)
				4848	goto onError;
				4849	if (args_owned) {
				4850	Py_DECREF(args);
				4851	args_owned = 0;
				4852	}
				4853	args = PyObject_GetItem(dict, key);
				4854	Py_DECREF(key);
				4855	if (args == NULL) {
				4856	goto onError;
				4857	}
				4858	args_owned = 1;
				4859	arglen = -1;
				4860	argidx = -2;
				4861	}
				4862	while (--fmtcnt >= 0) {
				4863	switch (c = *fmt++) {
				4864	case '-': flags \|= F_LJUST; continue;
				4865	case '+': flags \|= F_SIGN; continue;
				4866	case ' ': flags \|= F_BLANK; continue;
				4867	case '#': flags \|= F_ALT; continue;
				4868	case '0': flags \|= F_ZERO; continue;
				4869	}
				4870	break;
				4871	}
				4872	if (c == '*') {
				4873	v = getnextarg(args, arglen, &argidx);
				4874	if (v == NULL)
				4875	goto onError;
				4876	if (!PyInt_Check(v)) {
				4877	PyErr_SetString(PyExc_TypeError,
				4878	"* wants int");
				4879	goto onError;
				4880	}
				4881	width = PyInt_AsLong(v);
				4882	if (width < 0) {
				4883	flags \|= F_LJUST;
				4884	width = -width;
				4885	}
				4886	if (--fmtcnt >= 0)
				4887	c = *fmt++;
				4888	}
				4889	else if (c >= '0' && c <= '9') {
				4890	width = c - '0';
				4891	while (--fmtcnt >= 0) {
				4892	c = *fmt++;
				4893	if (c < '0' \|\| c > '9')
				4894	break;
				4895	if ((width*10) / 10 != width) {
				4896	PyErr_SetString(PyExc_ValueError,
				4897	"width too big");
				4898	goto onError;
				4899	}
				4900	width = width*10 + (c - '0');
				4901	}
				4902	}
				4903	if (c == '.') {
				4904	prec = 0;
				4905	if (--fmtcnt >= 0)
				4906	c = *fmt++;
				4907	if (c == '*') {
				4908	v = getnextarg(args, arglen, &argidx);
				4909	if (v == NULL)
				4910	goto onError;
				4911	if (!PyInt_Check(v)) {
				4912	PyErr_SetString(PyExc_TypeError,
				4913	"* wants int");
				4914	goto onError;
				4915	}
				4916	prec = PyInt_AsLong(v);
				4917	if (prec < 0)
				4918	prec = 0;
				4919	if (--fmtcnt >= 0)
				4920	c = *fmt++;
				4921	}
				4922	else if (c >= '0' && c <= '9') {
				4923	prec = c - '0';
				4924	while (--fmtcnt >= 0) {
				4925	c = Py_CHARMASK(*fmt++);
				4926	if (c < '0' \|\| c > '9')
				4927	break;
				4928	if ((prec*10) / 10 != prec) {
				4929	PyErr_SetString(PyExc_ValueError,
				4930	"prec too big");
				4931	goto onError;
				4932	}
				4933	prec = prec*10 + (c - '0');
				4934	}
				4935	}
				4936	} /* prec */
				4937	if (fmtcnt >= 0) {
				4938	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4939	size = c;
				4940	if (--fmtcnt >= 0)
				4941	c = *fmt++;
				4942	}
				4943	}
				4944	if (fmtcnt < 0) {
				4945	PyErr_SetString(PyExc_ValueError,
				4946	"incomplete format");
				4947	goto onError;
				4948	}
				4949	if (c != '%') {
				4950	v = getnextarg(args, arglen, &argidx);
				4951	if (v == NULL)
				4952	goto onError;
				4953	}
				4954	sign = 0;
				4955	fill = ' ';
				4956	switch (c) {
				4957
				4958	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4959	pbuf = formatbuf;
				4960	/* presume that buffer length is at least 1 */
				4961	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4962	len = 1;
				4963	break;
				4964
				4965	case 's':
				4966	case 'r':
				4967	if (PyUnicode_Check(v) && c == 's') {
				4968	temp = v;
				4969	Py_INCREF(temp);
				4970	}
				4971	else {
				4972	PyObject *unicode;
				4973	if (c == 's')
				4974	temp = PyObject_Str(v);
				4975	else
				4976	temp = PyObject_Repr(v);
				4977	if (temp == NULL)
				4978	goto onError;
				4979	if (!PyString_Check(temp)) {
				4980	/* XXX Note: this should never happen, since
				4981	PyObject_Repr() and PyObject_Str() assure
				4982	this */
				4983	Py_DECREF(temp);
				4984	PyErr_SetString(PyExc_TypeError,
				4985	"%s argument has non-string str()");
				4986	goto onError;
				4987	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4988	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4989	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4990	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4991	"strict");
				4992	Py_DECREF(temp);
				4993	temp = unicode;
				4994	if (temp == NULL)
				4995	goto onError;
				4996	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4997	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4998	len = PyUnicode_GET_SIZE(temp);
				4999	if (prec >= 0 && len > prec)
				5000	len = prec;
				5001	break;
				5002
				5003	case 'i':
				5004	case 'd':
				5005	case 'u':
				5006	case 'o':
				5007	case 'x':
				5008	case 'X':
				5009	if (c == 'i')
				5010	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5011	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5012	temp = formatlong(v, flags, prec, c);
				5013	if (!temp)
				5014	goto onError;
				5015	pbuf = PyUnicode_AS_UNICODE(temp);
				5016	len = PyUnicode_GET_SIZE(temp);
				5017	/* unbounded ints can always produce
				5018	a sign character! */
				5019	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5020	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5021	else {
				5022	pbuf = formatbuf;
				5023	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5024	flags, prec, c, v);
				5025	if (len < 0)
				5026	goto onError;
				5027	/* only d conversion is signed */
				5028	sign = c == 'd';
				5029	}
				5030	if (flags & F_ZERO)
				5031	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5032	break;
				5033
				5034	case 'e':
				5035	case 'E':
				5036	case 'f':
				5037	case 'g':
				5038	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5039	pbuf = formatbuf;
				5040	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5041	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5042	if (len < 0)
				5043	goto onError;
				5044	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5045	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5046	fill = '0';
				5047	break;
				5048
				5049	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5050	pbuf = formatbuf;
				5051	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5052	if (len < 0)
				5053	goto onError;
				5054	break;
				5055
				5056	default:
				5057	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5058	"unsupported format character '%c' (0x%x) "
				5059	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5060	(31<=c && c<=126) ? c : '?',
				5061	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5062	goto onError;
				5063	}
				5064	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5065	if (pbuf == '-' \|\| pbuf == '+') {
				5066	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5067	len--;
				5068	}
				5069	else if (flags & F_SIGN)
				5070	sign = '+';
				5071	else if (flags & F_BLANK)
				5072	sign = ' ';
				5073	else
				5074	sign = 0;
				5075	}
				5076	if (width < len)
				5077	width = len;
				5078	if (rescnt < width + (sign != 0)) {
				5079	reslen -= rescnt;
				5080	rescnt = width + fmtcnt + 100;
				5081	reslen += rescnt;
				5082	if (_PyUnicode_Resize(result, reslen) < 0)
				5083	return NULL;
				5084	res = PyUnicode_AS_UNICODE(result)
				5085	+ reslen - rescnt;
				5086	}
				5087	if (sign) {
				5088	if (fill != ' ')
				5089	*res++ = sign;
				5090	rescnt--;
				5091	if (width > len)
				5092	width--;
				5093	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5094	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5095	assert(pbuf[0] == '0');
				5096	assert(pbuf[1] == c);
				5097	if (fill != ' ') {
				5098	res++ = pbuf++;
				5099	res++ = pbuf++;
				5100	}
				5101	rescnt -= 2;
				5102	width -= 2;
				5103	if (width < 0)
				5104	width = 0;
				5105	len -= 2;
				5106	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5107	if (width > len && !(flags & F_LJUST)) {
				5108	do {
				5109	--rescnt;
				5110	*res++ = fill;
				5111	} while (--width > len);
				5112	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5113	if (fill == ' ') {
				5114	if (sign)
				5115	*res++ = sign;
				5116	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5117	assert(pbuf[0] == '0');
				5118	assert(pbuf[1] == c);
				5119	res++ = pbuf++;
				5120	res++ = pbuf++;
				5121	}
				5122	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5123	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5124	res += len;
				5125	rescnt -= len;
				5126	while (--width >= len) {
				5127	--rescnt;
				5128	*res++ = ' ';
				5129	}
				5130	if (dict && (argidx < arglen) && c != '%') {
				5131	PyErr_SetString(PyExc_TypeError,
				5132	"not all arguments converted");
				5133	goto onError;
				5134	}
				5135	Py_XDECREF(temp);
				5136	} /* '%' */
				5137	} /* until end */
				5138	if (argidx < arglen && !dict) {
				5139	PyErr_SetString(PyExc_TypeError,
				5140	"not all arguments converted");
				5141	goto onError;
				5142	}
				5143
				5144	if (args_owned) {
				5145	Py_DECREF(args);
				5146	}
				5147	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5148	if (_PyUnicode_Resize(result, reslen - rescnt))
				5149	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5150	return (PyObject *)result;
				5151
				5152	onError:
				5153	Py_XDECREF(result);
				5154	Py_DECREF(uformat);
				5155	if (args_owned) {
				5156	Py_DECREF(args);
				5157	}
				5158	return NULL;
				5159	}
				5160
				5161	static PyBufferProcs unicode_as_buffer = {
				5162	(getreadbufferproc) unicode_buffer_getreadbuf,
				5163	(getwritebufferproc) unicode_buffer_getwritebuf,
				5164	(getsegcountproc) unicode_buffer_getsegcount,
				5165	(getcharbufferproc) unicode_buffer_getcharbuf,
				5166	};
				5167
				5168	PyTypeObject PyUnicode_Type = {
				5169	PyObject_HEAD_INIT(&PyType_Type)
				5170	0, /* ob_size */
				5171	"unicode", /* tp_name */
				5172	sizeof(PyUnicodeObject), /* tp_size */
				5173	0, /* tp_itemsize */
				5174	/* Slots */
				5175	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5176	0, /* tp_print */
				5177	(getattrfunc)unicode_getattr, /* tp_getattr */
				5178	0, /* tp_setattr */
				5179	(cmpfunc) unicode_compare, /* tp_compare */
				5180	(reprfunc) unicode_repr, /* tp_repr */
				5181	0, /* tp_as_number */
				5182	&unicode_as_sequence, /* tp_as_sequence */
				5183	0, /* tp_as_mapping */
				5184	(hashfunc) unicode_hash, /* tp_hash*/
				5185	0, /* tp_call*/
				5186	(reprfunc) unicode_str, /* tp_str */
				5187	(getattrofunc) NULL, /* tp_getattro */
				5188	(setattrofunc) NULL, /* tp_setattro */
				5189	&unicode_as_buffer, /* tp_as_buffer */
				5190	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5191	};
				5192
				5193	/* Initialize the Unicode implementation */
				5194
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5195	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5196	{
				5197	/* Doublecheck the configuration... */
				5198	if (sizeof(Py_UNICODE) != 2)
				5199	Py_FatalError("Unicode configuration error: "
				5200	"sizeof(Py_UNICODE) != 2 bytes");
				5201
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5202	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5203	unicode_freelist = NULL;
				5204	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5205	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5206	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5207	}
				5208
				5209	/* Finalize the Unicode implementation */
				5210
				5211	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5212	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5213	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5214	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5215
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5216	Py_XDECREF(unicode_empty);
				5217	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5218
				5219	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5220	PyUnicodeObject *v = u;
				5221	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5222	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5223	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5224	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5225	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5226	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5227	unicode_freelist = NULL;
				5228	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5229	}