Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 5c193dda4ce837e7fc2cc25eebb9cb726ba80a0d [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Upper bound on the number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "stay alive" optimization.

   Objects that are put on the free list keep their allocated
   character buffer as long as their length is below this limit; this
   saves malloc() round trips for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage lying around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless
   WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
				86	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	88
				89	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	90	static PyUnicodeObject *unicode_freelist;
				91	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	93	/* Default encoding to use and assume when NULL is passed as encoding
				94	parameter; it is initialized by _PyUnicode_Init().
				95
				96	Always use the PyUnicode_SetDefaultEncoding() and
				97	PyUnicode_GetDefaultEncoding() APIs to access this global.
				98
				99	*/
				100
				101	static char unicode_default_encoding[100];
				102
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103	/* --- Unicode Object ----------------------------------------------------- */
				104
				105	static
				106	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				107	int length)
				108	{
				109	void *oldstr;
				110
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	111	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	112	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	113	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Resizing unicode_empty is not allowed. */
				116	if (unicode == unicode_empty) {
				117	PyErr_SetString(PyExc_SystemError,
				118	"can't resize empty unicode object");
				119	return -1;
				120	}
				121
				122	/* We allocate one more byte to make sure the string is
				123	Ux0000 terminated -- XXX is this needed ? */
				124	oldstr = unicode->str;
				125	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				126	if (!unicode->str) {
				127	unicode->str = oldstr;
				128	PyErr_NoMemory();
				129	return -1;
				130	}
				131	unicode->str[length] = 0;
				132	unicode->length = length;
				133
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	134	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	136	if (unicode->defenc) {
				137	Py_DECREF(unicode->defenc);
				138	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	}
				140	unicode->hash = -1;
				141
				142	return 0;
				143	}
				144
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	int PyUnicode_Resize(PyObject **unicode,
				146	int length)
				147	{
				148	PyUnicodeObject *v;
				149
				150	if (unicode == NULL) {
				151	PyErr_BadInternalCall();
				152	return -1;
				153	}
				154	v = (PyUnicodeObject )unicode;
				155	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				156	PyErr_BadInternalCall();
				157	return -1;
				158	}
				159	return _PyUnicode_Resize(v, length);
				160	}
				161
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	162	/* We allocate one more byte to make sure the string is
				163	Ux0000 terminated -- XXX is this needed ?
				164
				165	XXX This allocator could further be enhanced by assuring that the
				166	free list never reduces its size below 1.
				167
				168	*/
				169
				170	static
				171	PyUnicodeObject *_PyUnicode_New(int length)
				172	{
				173	register PyUnicodeObject *unicode;
				174
				175	/* Optimization for empty strings */
				176	if (length == 0 && unicode_empty != NULL) {
				177	Py_INCREF(unicode_empty);
				178	return unicode_empty;
				179	}
				180
				181	/* Unicode freelist & memory allocation */
				182	if (unicode_freelist) {
				183	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	184	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	185	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	186	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	187	/* Keep-Alive optimization: we only upsize the buffer,
				188	never downsize it. */
				189	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	190	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	191	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	192	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	193	}
				194	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	195	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	197	}
				198	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	}
				200	else {
				201	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				202	if (unicode == NULL)
				203	return NULL;
				204	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				205	}
				206
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	207	if (!unicode->str) {
				208	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	209	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode->str[length] = 0;
				212	unicode->length = length;
				213	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	214	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	215	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	216
				217	onError:
				218	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	219	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	220	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	}
				222
				223	static
				224	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				225	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	227	/* Keep-Alive optimization */
				228	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	229	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	230	unicode->str = NULL;
				231	unicode->length = 0;
				232	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	233	if (unicode->defenc) {
				234	Py_DECREF(unicode->defenc);
				235	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	236	}
				237	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	(PyUnicodeObject *)unicode = unicode_freelist;
				239	unicode_freelist = unicode;
				240	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	}
				242	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	243	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	244	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	}
				247	}
				248
				249	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				250	int size)
				251	{
				252	PyUnicodeObject *unicode;
				253
				254	unicode = _PyUnicode_New(size);
				255	if (!unicode)
				256	return NULL;
				257
				258	/* Copy the Unicode data into the new object */
				259	if (u != NULL)
				260	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				261
				262	return (PyObject *)unicode;
				263	}
				264
				265	#ifdef HAVE_WCHAR_H
				266
				267	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				268	int size)
				269	{
				270	PyUnicodeObject *unicode;
				271
				272	if (w == NULL) {
				273	PyErr_BadInternalCall();
				274	return NULL;
				275	}
				276
				277	unicode = _PyUnicode_New(size);
				278	if (!unicode)
				279	return NULL;
				280
				281	/* Copy the wchar_t data into the new object */
				282	#ifdef HAVE_USABLE_WCHAR_T
				283	memcpy(unicode->str, w, size * sizeof(wchar_t));
				284	#else
				285	{
				286	register Py_UNICODE *u;
				287	register int i;
				288	u = PyUnicode_AS_UNICODE(unicode);
				289	for (i = size; i >= 0; i--)
				290	u++ = w++;
				291	}
				292	#endif
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				298	register wchar_t *w,
				299	int size)
				300	{
				301	if (unicode == NULL) {
				302	PyErr_BadInternalCall();
				303	return -1;
				304	}
				305	if (size > PyUnicode_GET_SIZE(unicode))
				306	size = PyUnicode_GET_SIZE(unicode);
				307	#ifdef HAVE_USABLE_WCHAR_T
				308	memcpy(w, unicode->str, size * sizeof(wchar_t));
				309	#else
				310	{
				311	register Py_UNICODE *u;
				312	register int i;
				313	u = PyUnicode_AS_UNICODE(unicode);
				314	for (i = size; i >= 0; i--)
				315	w++ = u++;
				316	}
				317	#endif
				318
				319	return size;
				320	}
				321
				322	#endif
				323
				324	PyObject PyUnicode_FromObject(register PyObject obj)
				325	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	326	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				327	}
				328
				329	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				330	const char *encoding,
				331	const char *errors)
				332	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333	const char *s;
				334	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	335	int owned = 0;
				336	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	337
				338	if (obj == NULL) {
				339	PyErr_BadInternalCall();
				340	return NULL;
				341	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	342
				343	/* Coerce object */
				344	if (PyInstance_Check(obj)) {
				345	PyObject *func;
				346	func = PyObject_GetAttrString(obj, "__str__");
				347	if (func == NULL) {
				348	PyErr_SetString(PyExc_TypeError,
				349	"coercing to Unicode: instance doesn't define __str__");
				350	return NULL;
				351	}
				352	obj = PyEval_CallObject(func, NULL);
				353	Py_DECREF(func);
				354	if (obj == NULL)
				355	return NULL;
				356	owned = 1;
				357	}
				358	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	360	v = obj;
				361	if (encoding) {
				362	PyErr_SetString(PyExc_TypeError,
				363	"decoding Unicode is not supported");
				364	return NULL;
				365	}
				366	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	376	PyErr_Format(PyExc_TypeError,
				377	"coercing to Unicode: need string or buffer, "
				378	"%.80s found",
				379	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	380	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	381	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	382
				383	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	384	if (len == 0) {
				385	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	387	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	388	else
				389	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	390
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	391	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	392	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	394	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	395	return v;
				396
				397	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	398	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	399	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	400	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	401	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	}
				403
				404	PyObject PyUnicode_Decode(const char s,
				405	int size,
				406	const char *encoding,
				407	const char *errors)
				408	{
				409	PyObject buffer = NULL, unicode;
				410
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	411	if (encoding == NULL)
				412	encoding = PyUnicode_GetDefaultEncoding();
				413
				414	/* Shortcuts for common default encodings */
				415	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	417	else if (strcmp(encoding, "latin-1") == 0)
				418	return PyUnicode_DecodeLatin1(s, size, errors);
				419	else if (strcmp(encoding, "ascii") == 0)
				420	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	421
				422	/* Decode via the codec registry */
				423	buffer = PyBuffer_FromMemory((void *)s, size);
				424	if (buffer == NULL)
				425	goto onError;
				426	unicode = PyCodec_Decode(buffer, encoding, errors);
				427	if (unicode == NULL)
				428	goto onError;
				429	if (!PyUnicode_Check(unicode)) {
				430	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	431	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	432	unicode->ob_type->tp_name);
				433	Py_DECREF(unicode);
				434	goto onError;
				435	}
				436	Py_DECREF(buffer);
				437	return unicode;
				438
				439	onError:
				440	Py_XDECREF(buffer);
				441	return NULL;
				442	}
				443
				444	PyObject PyUnicode_Encode(const Py_UNICODE s,
				445	int size,
				446	const char *encoding,
				447	const char *errors)
				448	{
				449	PyObject v, unicode;
				450
				451	unicode = PyUnicode_FromUnicode(s, size);
				452	if (unicode == NULL)
				453	return NULL;
				454	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				455	Py_DECREF(unicode);
				456	return v;
				457	}
				458
				459	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				460	const char *encoding,
				461	const char *errors)
				462	{
				463	PyObject *v;
				464
				465	if (!PyUnicode_Check(unicode)) {
				466	PyErr_BadArgument();
				467	goto onError;
				468	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	469
				470	if (encoding == NULL)
				471	encoding = PyUnicode_GetDefaultEncoding();
				472
				473	/* Shortcuts for common default encodings */
				474	if (errors == NULL) {
				475	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	477	else if (strcmp(encoding, "latin-1") == 0)
				478	return PyUnicode_AsLatin1String(unicode);
				479	else if (strcmp(encoding, "ascii") == 0)
				480	return PyUnicode_AsASCIIString(unicode);
				481	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	482
				483	/* Encode via the codec registry */
				484	v = PyCodec_Encode(unicode, encoding, errors);
				485	if (v == NULL)
				486	goto onError;
				487	/* XXX Should we really enforce this ? */
				488	if (!PyString_Check(v)) {
				489	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	490	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	v->ob_type->tp_name);
				492	Py_DECREF(v);
				493	goto onError;
				494	}
				495	return v;
				496
				497	onError:
				498	return NULL;
				499	}
				500
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	501	/* Return a Python string holding the default encoded value of the
				502	Unicode object.
				503
				504	The resulting string is cached in the Unicode object for subsequent
				505	usage by this function. The cached version is needed to implement
				506	the character buffer interface and will live (at least) as long as
				507	the Unicode object itself.
				508
				509	The refcount of the string is not incremented.
				510
				511	* Exported for internal use by the interpreter only !!! *
				512
				513	*/
				514
				515	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				516	const char *errors)
				517	{
				518	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				519
				520	if (v)
				521	return v;
				522	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				523	if (v && errors == NULL)
				524	((PyUnicodeObject *)unicode)->defenc = v;
				525	return v;
				526	}
				527
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	528	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				529	{
				530	if (!PyUnicode_Check(unicode)) {
				531	PyErr_BadArgument();
				532	goto onError;
				533	}
				534	return PyUnicode_AS_UNICODE(unicode);
				535
				536	onError:
				537	return NULL;
				538	}
				539
				540	int PyUnicode_GetSize(PyObject *unicode)
				541	{
				542	if (!PyUnicode_Check(unicode)) {
				543	PyErr_BadArgument();
				544	goto onError;
				545	}
				546	return PyUnicode_GET_SIZE(unicode);
				547
				548	onError:
				549	return -1;
				550	}
				551
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	552	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	{
				554	return unicode_default_encoding;
				555	}
				556
				557	int PyUnicode_SetDefaultEncoding(const char *encoding)
				558	{
				559	PyObject *v;
				560
				561	/* Make sure the encoding is valid. As side effect, this also
				562	loads the encoding into the codec registry cache. */
				563	v = _PyCodec_Lookup(encoding);
				564	if (v == NULL)
				565	goto onError;
				566	Py_DECREF(v);
				567	strncpy(unicode_default_encoding,
				568	encoding,
				569	sizeof(unicode_default_encoding));
				570	return 0;
				571
				572	onError:
				573	return -1;
				574	}
				575
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	576	/* --- UTF-8 Codec -------------------------------------------------------- */
				577
				578	static
				579	char utf8_code_length[256] = {
				580	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				581	illegal prefix. see RFC 2279 for details */
				582	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				583	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				584	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				585	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				586	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				591	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				592	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				593	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				594	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				595	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				596	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				597	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				598	};
				599
				600	static
				601	int utf8_decoding_error(const char **source,
				602	Py_UNICODE **dest,
				603	const char *errors,
				604	const char *details)
				605	{
				606	if ((errors == NULL) \|\|
				607	(strcmp(errors,"strict") == 0)) {
				608	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	609	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	610	details);
				611	return -1;
				612	}
				613	else if (strcmp(errors,"ignore") == 0) {
				614	(*source)++;
				615	return 0;
				616	}
				617	else if (strcmp(errors,"replace") == 0) {
				618	(*source)++;
				619	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				620	(*dest)++;
				621	return 0;
				622	}
				623	else {
				624	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	625	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	626	errors);
				627	return -1;
				628	}
				629	}
				630
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	631	PyObject PyUnicode_DecodeUTF8(const char s,
				632	int size,
				633	const char *errors)
				634	{
				635	int n;
				636	const char *e;
				637	PyUnicodeObject *unicode;
				638	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	639	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	640
				641	/* Note: size will always be longer than the resulting Unicode
				642	character count */
				643	unicode = _PyUnicode_New(size);
				644	if (!unicode)
				645	return NULL;
				646	if (size == 0)
				647	return (PyObject *)unicode;
				648
				649	/* Unpack UTF-8 encoded data */
				650	p = unicode->str;
				651	e = s + size;
				652
				653	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	654	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	655
				656	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	657	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	658	s++;
				659	continue;
				660	}
				661
				662	n = utf8_code_length[ch];
				663
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	if (s + n > e) {
				665	errmsg = "unexpected end of data";
				666	goto utf8Error;
				667	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	668
				669	switch (n) {
				670
				671	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	672	errmsg = "unexpected code byte";
				673	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	674	break;
				675
				676	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	677	errmsg = "internal error";
				678	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	679	break;
				680
				681	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	682	if ((s[1] & 0xc0) != 0x80) {
				683	errmsg = "invalid data";
				684	goto utf8Error;
				685	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	686	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	687	if (ch < 0x80) {
				688	errmsg = "illegal encoding";
				689	goto utf8Error;
				690	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	692	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693	break;
				694
				695	case 3:
				696	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	(s[2] & 0xc0) != 0x80) {
				698	errmsg = "invalid data";
				699	goto utf8Error;
				700	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	701	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				703	errmsg = "illegal encoding";
				704	goto utf8Error;
				705	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	706	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	707	*p++ = (Py_UNICODE)ch;
				708	break;
				709
				710	case 4:
				711	if ((s[1] & 0xc0) != 0x80 \|\|
				712	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	(s[3] & 0xc0) != 0x80) {
				714	errmsg = "invalid data";
				715	goto utf8Error;
				716	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	717	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				718	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				719	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	720	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				721	byte encoding */
				722	(ch > 0x10ffff)) { /* maximum value allowed for
				723	UTF-16 */
				724	errmsg = "illegal encoding";
				725	goto utf8Error;
				726	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	727	/* compute and append the two surrogates: */
				728
				729	/* translate from 10000..10FFFF to 0..FFFF */
				730	ch -= 0x10000;
				731
				732	/* high surrogate = top 10 bits added to D800 */
				733	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				734
				735	/* low surrogate = bottom 10 bits added to DC00 */
				736	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	break;
				738
				739	default:
				740	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	741	errmsg = "unsupported Unicode code range";
				742	goto utf8Error;
				743	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	744	}
				745	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	continue;
				747
				748	utf8Error:
				749	if (utf8_decoding_error(&s, &p, errors, errmsg))
				750	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	751	}
				752
				753	/* Adjust length */
				754	if (_PyUnicode_Resize(unicode, p - unicode->str))
				755	goto onError;
				756
				757	return (PyObject *)unicode;
				758
				759	onError:
				760	Py_DECREF(unicode);
				761	return NULL;
				762	}
				763
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding problem.

   errors == NULL or "strict": raise UnicodeError mentioning `details`
                               and return -1.
   "ignore":                   return 0 without writing anything.
   "replace":                  write '?' at *dest, advance *dest, return 0.
   anything else:              raise ValueError and return -1.

   `source` is accepted for symmetry with the other error helpers but is
   not read here. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    /* Unknown policy name */
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	797
				798	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				799	int size,
				800	const char *errors)
				801	{
				802	PyObject *v;
				803	char *p;
				804	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	805	Py_UCS4 ch2;
				806	unsigned int cbAllocated = 3 * size;
				807	unsigned int cbWritten = 0;
				808	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	809
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	810	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	811	if (v == NULL)
				812	return NULL;
				813	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	814	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	815
				816	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	817	while (i < size) {
				818	Py_UCS4 ch = s[i++];
				819	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	820	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	821	cbWritten++;
				822	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	823	else if (ch < 0x0800) {
				824	*p++ = 0xc0 \| (ch >> 6);
				825	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	826	cbWritten += 2;
				827	}
				828	else {
				829	/* Check for high surrogate */
				830	if (0xD800 <= ch && ch <= 0xDBFF) {
				831	if (i != size) {
				832	ch2 = s[i];
				833	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				834
				835	if (cbWritten >= (cbAllocated - 4)) {
				836	/* Provide enough room for some more
				837	surrogates */
				838	cbAllocated += 4*10;
				839	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	840	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	841	}
				842
				843	/* combine the two values */
				844	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				845
				846	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	847	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	848	i++;
				849	cbWritten += 4;
				850	}
				851	}
				852	}
				853	else {
				854	*p++ = (char)(0xe0 \| (ch >> 12));
				855	cbWritten += 3;
				856	}
				857	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				858	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	859	}
				860	}
				861	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	862	if (_PyString_Resize(&v, p - q))
				863	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	864	return v;
				865
				866	onError:
				867	Py_DECREF(v);
				868	return NULL;
				869	}
				870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	871	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				872	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	873	if (!PyUnicode_Check(unicode)) {
				874	PyErr_BadArgument();
				875	return NULL;
				876	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	877	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				878	PyUnicode_GET_SIZE(unicode),
				879	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	880	}
				881
				882	/* --- UTF-16 Codec ------------------------------------------------------- */
				883
				884	static
				885	int utf16_decoding_error(const Py_UNICODE **source,
				886	Py_UNICODE **dest,
				887	const char *errors,
				888	const char *details)
				889	{
				890	if ((errors == NULL) \|\|
				891	(strcmp(errors,"strict") == 0)) {
				892	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	893	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	894	details);
				895	return -1;
				896	}
				897	else if (strcmp(errors,"ignore") == 0) {
				898	return 0;
				899	}
				900	else if (strcmp(errors,"replace") == 0) {
				901	if (dest) {
				902	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				903	(*dest)++;
				904	}
				905	return 0;
				906	}
				907	else {
				908	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	909	"UTF-16 decoding error; "
				910	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	911	errors);
				912	return -1;
				913	}
				914	}
				915
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	916	PyObject PyUnicode_DecodeUTF16(const char s,
				917	int size,
				918	const char *errors,
				919	int *byteorder)
				920	{
				921	PyUnicodeObject *unicode;
				922	Py_UNICODE *p;
				923	const Py_UNICODE q, e;
				924	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	925	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	926
				927	/* size should be an even number */
				928	if (size % sizeof(Py_UNICODE) != 0) {
				929	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				930	return NULL;
				931	/* The remaining input chars are ignored if we fall through
				932	here... */
				933	}
				934
				935	/* Note: size will always be longer than the resulting Unicode
				936	character count */
				937	unicode = _PyUnicode_New(size);
				938	if (!unicode)
				939	return NULL;
				940	if (size == 0)
				941	return (PyObject *)unicode;
				942
				943	/* Unpack UTF-16 encoded data */
				944	p = unicode->str;
				945	q = (Py_UNICODE *)s;
				946	e = q + (size / sizeof(Py_UNICODE));
				947
				948	if (byteorder)
				949	bo = *byteorder;
				950
				951	while (q < e) {
				952	register Py_UNICODE ch = *q++;
				953
				954	/* Check for BOM marks (U+FEFF) in the input and adjust
				955	current byte order setting accordingly. Swap input
				956	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				957	!) */
				958	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				959	if (ch == 0xFEFF) {
				960	bo = -1;
				961	continue;
				962	} else if (ch == 0xFFFE) {
				963	bo = 1;
				964	continue;
				965	}
				966	if (bo == 1)
				967	ch = (ch >> 8) \| (ch << 8);
				968	#else
				969	if (ch == 0xFEFF) {
				970	bo = 1;
				971	continue;
				972	} else if (ch == 0xFFFE) {
				973	bo = -1;
				974	continue;
				975	}
				976	if (bo == -1)
				977	ch = (ch >> 8) \| (ch << 8);
				978	#endif
				979	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				980	*p++ = ch;
				981	continue;
				982	}
				983
				984	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	985	if (q >= e) {
				986	errmsg = "unexpected end of data";
				987	goto utf16Error;
				988	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	989	if (0xDC00 <= q && q <= 0xDFFF) {
				990	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	991	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	/* This is valid data (a UTF-16 surrogate pair), but
				993	we are not able to store this information since our
				994	Py_UNICODE type only has 16 bits... this might
				995	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	996	errmsg = "code pairs are not supported";
				997	goto utf16Error;
				998	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	else
				1000	continue;
				1001	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1002	errmsg = "illegal encoding";
				1003	/* Fall through to report the error */
				1004
				1005	utf16Error:
				1006	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1007	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008	}
				1009
				1010	if (byteorder)
				1011	*byteorder = bo;
				1012
				1013	/* Adjust length */
				1014	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1015	goto onError;
				1016
				1017	return (PyObject *)unicode;
				1018
				1019	onError:
				1020	Py_DECREF(unicode);
				1021	return NULL;
				1022	}
				1023
				1024	#undef UTF16_ERROR
				1025
				1026	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1027	int size,
				1028	const char *errors,
				1029	int byteorder)
				1030	{
				1031	PyObject *v;
				1032	Py_UNICODE *p;
				1033	char *q;
				1034
				1035	/* We don't create UTF-16 pairs... */
				1036	v = PyString_FromStringAndSize(NULL,
				1037	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1038	if (v == NULL)
				1039	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1040
				1041	q = PyString_AS_STRING(v);
				1042	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043	if (byteorder == 0)
				1044	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1045	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1046	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1047	if (byteorder == 0 \|\|
				1048	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1049	byteorder == -1
				1050	#else
				1051	byteorder == 1
				1052	#endif
				1053	)
				1054	memcpy(p, s, size * sizeof(Py_UNICODE));
				1055	else
				1056	while (size-- > 0) {
				1057	Py_UNICODE ch = *s++;
				1058	*p++ = (ch >> 8) \| (ch << 8);
				1059	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	return v;
				1061	}
				1062
				1063	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1064	{
				1065	if (!PyUnicode_Check(unicode)) {
				1066	PyErr_BadArgument();
				1067	return NULL;
				1068	}
				1069	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1070	PyUnicode_GET_SIZE(unicode),
				1071	NULL,
				1072	0);
				1073	}
				1074
				1075	/* --- Unicode Escape Codec ----------------------------------------------- */
				1076
				1077	static
				1078	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1079	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	const char *errors,
				1081	const char *details)
				1082	{
				1083	if ((errors == NULL) \|\|
				1084	(strcmp(errors,"strict") == 0)) {
				1085	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1086	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	details);
				1088	return -1;
				1089	}
				1090	else if (strcmp(errors,"ignore") == 0) {
				1091	return 0;
				1092	}
				1093	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1094	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1095	return 0;
				1096	}
				1097	else {
				1098	PyErr_Format(PyExc_ValueError,
				1099	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1100	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1101	errors);
				1102	return -1;
				1103	}
				1104	}
				1105
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1106	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1107
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1109	int size,
				1110	const char *errors)
				1111	{
				1112	PyUnicodeObject *v;
				1113	Py_UNICODE p = NULL, buf = NULL;
				1114	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1115	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1116
				1117	/* Escaped strings will always be longer than the resulting
				1118	Unicode string, so we start with size here and then reduce the
				1119	length after conversion to the true value. */
				1120	v = _PyUnicode_New(size);
				1121	if (v == NULL)
				1122	goto onError;
				1123	if (size == 0)
				1124	return (PyObject *)v;
				1125	p = buf = PyUnicode_AS_UNICODE(v);
				1126	end = s + size;
				1127	while (s < end) {
				1128	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1129	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130	int i;
				1131
				1132	/* Non-escape characters are interpreted as Unicode ordinals */
				1133	if (*s != '\\') {
				1134	p++ = (unsigned char)s++;
				1135	continue;
				1136	}
				1137
				1138	/* \ - Escapes */
				1139	s++;
				1140	switch (*s++) {
				1141
				1142	/* \x escapes */
				1143	case '\n': break;
				1144	case '\\': *p++ = '\\'; break;
				1145	case '\'': *p++ = '\''; break;
				1146	case '\"': *p++ = '\"'; break;
				1147	case 'b': *p++ = '\b'; break;
				1148	case 'f': p++ = '\014'; break; / FF */
				1149	case 't': *p++ = '\t'; break;
				1150	case 'n': *p++ = '\n'; break;
				1151	case 'r': *p++ = '\r'; break;
				1152	case 'v': p++ = '\013'; break; / VT */
				1153	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1154
				1155	/* \OOO (octal) escapes */
				1156	case '0': case '1': case '2': case '3':
				1157	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1158	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1160	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1162	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1163	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1164	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1165	break;
				1166
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1167	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1168	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1169	for (x = 0, i = 0; i < 2; i++) {
				1170	c = (unsigned char)s[i];
				1171	if (!isxdigit(c)) {
				1172	if (unicodeescape_decoding_error(&s, &x, errors,
				1173	"truncated \\xXX"))
				1174	goto onError;
				1175	i++;
				1176	break;
				1177	}
				1178	x = (x<<4) & ~0xF;
				1179	if (c >= '0' && c <= '9')
				1180	x += c - '0';
				1181	else if (c >= 'a' && c <= 'f')
				1182	x += 10 + c - 'a';
				1183	else
				1184	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1185	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1186	s += i;
				1187	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1188	break;
				1189
				1190	/* \uXXXX with 4 hex digits */
				1191	case 'u':
				1192	for (x = 0, i = 0; i < 4; i++) {
				1193	c = (unsigned char)s[i];
				1194	if (!isxdigit(c)) {
				1195	if (unicodeescape_decoding_error(&s, &x, errors,
				1196	"truncated \\uXXXX"))
				1197	goto onError;
				1198	i++;
				1199	break;
				1200	}
				1201	x = (x<<4) & ~0xF;
				1202	if (c >= '0' && c <= '9')
				1203	x += c - '0';
				1204	else if (c >= 'a' && c <= 'f')
				1205	x += 10 + c - 'a';
				1206	else
				1207	x += 10 + c - 'A';
				1208	}
				1209	s += i;
				1210	*p++ = x;
				1211	break;
				1212
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1213	/* \UXXXXXXXX with 8 hex digits */
				1214	case 'U':
				1215	for (chr = 0, i = 0; i < 8; i++) {
				1216	c = (unsigned char)s[i];
				1217	if (!isxdigit(c)) {
				1218	if (unicodeescape_decoding_error(&s, &x, errors,
				1219	"truncated \\uXXXX"))
				1220	goto onError;
				1221	i++;
				1222	break;
				1223	}
				1224	chr = (chr<<4) & ~0xF;
				1225	if (c >= '0' && c <= '9')
				1226	chr += c - '0';
				1227	else if (c >= 'a' && c <= 'f')
				1228	chr += 10 + c - 'a';
				1229	else
				1230	chr += 10 + c - 'A';
				1231	}
				1232	s += i;
				1233	goto store;
				1234
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1235	case 'N':
				1236	/* Ok, we need to deal with Unicode Character Names now,
				1237	* make sure we've imported the hash table data...
				1238	*/
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1239	if (ucnhash_CAPI == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1240	PyObject mod = 0, v = 0;
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1241	mod = PyImport_ImportModule("unicodedata");
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1242	if (mod == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1243	goto ucnhashError;
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1244	v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1245	Py_DECREF(mod);
				1246	if (v == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1247	goto ucnhashError;
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1248	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1249	Py_DECREF(v);
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1250	if (ucnhash_CAPI == NULL)
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1251	goto ucnhashError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1252	}
				1253
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1254	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1255	const char *start = s + 1;
				1256	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1257
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1258	/* look for the closing brace */
				1259	while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1260	endBrace++;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1261	if (endBrace != end && *endBrace == '}') {
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1262	if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1263	if (unicodeescape_decoding_error(
				1264	&s, &x, errors,
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	1265	"Invalid Unicode Character Name")
				1266	)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1267	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1268	goto ucnFallthrough;
				1269	}
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1270	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1271	goto store;
				1272	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1273	if (unicodeescape_decoding_error(
				1274	&s, &x, errors,
				1275	"Unicode name missing closing brace"))
				1276	goto onError;
				1277	goto ucnFallthrough;
				1278	}
				1279	break;
				1280	}
				1281	if (unicodeescape_decoding_error(
				1282	&s, &x, errors,
				1283	"Missing opening brace for Unicode Character Name escape"))
				1284	goto onError;
				1285	ucnFallthrough:
				1286	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1287	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	*p++ = '\\';
				1289	*p++ = (unsigned char)s[-1];
				1290	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1291	store:
				1292	/* when we get here, chr is a 32-bit unicode character */
				1293	if (chr <= 0xffff)
				1294	/* UCS-2 character */
				1295	*p++ = (Py_UNICODE) chr;
				1296	else if (chr <= 0x10ffff) {
				1297	/* UCS-4 character. store as two surrogate characters */
				1298	chr -= 0x10000L;
				1299	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1300	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1301	} else {
				1302	if (unicodeescape_decoding_error(
				1303	&s, &x, errors,
				1304	"Illegal Unicode character")
				1305	)
				1306	goto onError;
				1307	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1308	}
				1309	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1310	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1311	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1312	return (PyObject *)v;
				1313
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1314	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1315	PyErr_SetString(
				1316	PyExc_UnicodeError,
				1317	"\\N escapes not supported (can't load unicodedata module)"
				1318	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1319	return NULL;
				1320
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1321	onError:
				1322	Py_XDECREF(v);
				1323	return NULL;
				1324	}
				1325
				1326	/* Return a Unicode-Escape string version of the Unicode object.
				1327
				1328	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1329	appropriate.
				1330
				1331	*/
				1332
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1333	static const Py_UNICODE findchar(const Py_UNICODE s,
				1334	int size,
				1335	Py_UNICODE ch);
				1336
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1337	static
				1338	PyObject unicodeescape_string(const Py_UNICODE s,
				1339	int size,
				1340	int quotes)
				1341	{
				1342	PyObject *repr;
				1343	char *p;
				1344	char *q;
				1345
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame^]	1346	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1347
				1348	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1349	if (repr == NULL)
				1350	return NULL;
				1351
				1352	p = q = PyString_AS_STRING(repr);
				1353
				1354	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1355	*p++ = 'u';
				1356	*p++ = (findchar(s, size, '\'') &&
				1357	!findchar(s, size, '"')) ? '"' : '\'';
				1358	}
				1359	while (size-- > 0) {
				1360	Py_UNICODE ch = *s++;
				1361	/* Escape quotes */
				1362	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1363	*p++ = '\\';
				1364	*p++ = (char) ch;
				1365	}
				1366	/* Map 16-bit characters to '\uxxxx' */
				1367	else if (ch >= 256) {
				1368	*p++ = '\\';
				1369	*p++ = 'u';
				1370	*p++ = hexdigit[(ch >> 12) & 0xf];
				1371	*p++ = hexdigit[(ch >> 8) & 0xf];
				1372	*p++ = hexdigit[(ch >> 4) & 0xf];
				1373	*p++ = hexdigit[ch & 15];
				1374	}
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame^]	1375	/* Map special whitespace to '\t', \n', '\r' */
				1376	else if (ch == '\t') {
				1377	*p++ = '\\';
				1378	*p++ = 't';
				1379	}
				1380	else if (ch == '\n') {
				1381	*p++ = '\\';
				1382	*p++ = 'n';
				1383	}
				1384	else if (ch == '\r') {
				1385	*p++ = '\\';
				1386	*p++ = 'r';
				1387	}
				1388	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1389	else if (ch < ' ' \|\| ch >= 128) {
				1390	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame^]	1391	*p++ = 'x';
				1392	*p++ = hexdigit[(ch >> 4) & 0xf];
				1393	*p++ = hexdigit[ch & 15];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1394	}
				1395	/* Copy everything else as-is */
				1396	else
				1397	*p++ = (char) ch;
				1398	}
				1399	if (quotes)
				1400	*p++ = q[1];
				1401
				1402	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1403	if (_PyString_Resize(&repr, p - q))
				1404	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1405
				1406	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1407
				1408	onError:
				1409	Py_DECREF(repr);
				1410	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1411	}
				1412
				1413	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1414	int size)
				1415	{
				1416	return unicodeescape_string(s, size, 0);
				1417	}
				1418
				1419	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1420	{
				1421	if (!PyUnicode_Check(unicode)) {
				1422	PyErr_BadArgument();
				1423	return NULL;
				1424	}
				1425	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1426	PyUnicode_GET_SIZE(unicode));
				1427	}
				1428
				1429	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1430
				1431	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1432	int size,
				1433	const char *errors)
				1434	{
				1435	PyUnicodeObject *v;
				1436	Py_UNICODE p, buf;
				1437	const char *end;
				1438	const char *bs;
				1439
				1440	/* Escaped strings will always be longer than the resulting
				1441	Unicode string, so we start with size here and then reduce the
				1442	length after conversion to the true value. */
				1443	v = _PyUnicode_New(size);
				1444	if (v == NULL)
				1445	goto onError;
				1446	if (size == 0)
				1447	return (PyObject *)v;
				1448	p = buf = PyUnicode_AS_UNICODE(v);
				1449	end = s + size;
				1450	while (s < end) {
				1451	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1452	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1453	int i;
				1454
				1455	/* Non-escape characters are interpreted as Unicode ordinals */
				1456	if (*s != '\\') {
				1457	p++ = (unsigned char)s++;
				1458	continue;
				1459	}
				1460
				1461	/* \u-escapes are only interpreted iff the number of leading
				1462	backslashes if odd */
				1463	bs = s;
				1464	for (;s < end;) {
				1465	if (*s != '\\')
				1466	break;
				1467	p++ = (unsigned char)s++;
				1468	}
				1469	if (((s - bs) & 1) == 0 \|\|
				1470	s >= end \|\|
				1471	*s != 'u') {
				1472	continue;
				1473	}
				1474	p--;
				1475	s++;
				1476
				1477	/* \uXXXX with 4 hex digits */
				1478	for (x = 0, i = 0; i < 4; i++) {
				1479	c = (unsigned char)s[i];
				1480	if (!isxdigit(c)) {
				1481	if (unicodeescape_decoding_error(&s, &x, errors,
				1482	"truncated \\uXXXX"))
				1483	goto onError;
				1484	i++;
				1485	break;
				1486	}
				1487	x = (x<<4) & ~0xF;
				1488	if (c >= '0' && c <= '9')
				1489	x += c - '0';
				1490	else if (c >= 'a' && c <= 'f')
				1491	x += 10 + c - 'a';
				1492	else
				1493	x += 10 + c - 'A';
				1494	}
				1495	s += i;
				1496	*p++ = x;
				1497	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1498	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1499	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1500	return (PyObject *)v;
				1501
				1502	onError:
				1503	Py_XDECREF(v);
				1504	return NULL;
				1505	}
				1506
				1507	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1508	int size)
				1509	{
				1510	PyObject *repr;
				1511	char *p;
				1512	char *q;
				1513
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame^]	1514	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1515
				1516	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1517	if (repr == NULL)
				1518	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1519	if (size == 0)
				1520	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1521
				1522	p = q = PyString_AS_STRING(repr);
				1523	while (size-- > 0) {
				1524	Py_UNICODE ch = *s++;
				1525	/* Map 16-bit characters to '\uxxxx' */
				1526	if (ch >= 256) {
				1527	*p++ = '\\';
				1528	*p++ = 'u';
				1529	*p++ = hexdigit[(ch >> 12) & 0xf];
				1530	*p++ = hexdigit[(ch >> 8) & 0xf];
				1531	*p++ = hexdigit[(ch >> 4) & 0xf];
				1532	*p++ = hexdigit[ch & 15];
				1533	}
				1534	/* Copy everything else as-is */
				1535	else
				1536	*p++ = (char) ch;
				1537	}
				1538	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1539	if (_PyString_Resize(&repr, p - q))
				1540	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1541
				1542	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1543
				1544	onError:
				1545	Py_DECREF(repr);
				1546	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1547	}
				1548
				1549	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1550	{
				1551	if (!PyUnicode_Check(unicode)) {
				1552	PyErr_BadArgument();
				1553	return NULL;
				1554	}
				1555	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1556	PyUnicode_GET_SIZE(unicode));
				1557	}
				1558
				1559	/* --- Latin-1 Codec ------------------------------------------------------ */
				1560
				1561	PyObject PyUnicode_DecodeLatin1(const char s,
				1562	int size,
				1563	const char *errors)
				1564	{
				1565	PyUnicodeObject *v;
				1566	Py_UNICODE *p;
				1567
				1568	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1569	v = _PyUnicode_New(size);
				1570	if (v == NULL)
				1571	goto onError;
				1572	if (size == 0)
				1573	return (PyObject *)v;
				1574	p = PyUnicode_AS_UNICODE(v);
				1575	while (size-- > 0)
				1576	p++ = (unsigned char)s++;
				1577	return (PyObject *)v;
				1578
				1579	onError:
				1580	Py_XDECREF(v);
				1581	return NULL;
				1582	}
				1583
				1584	static
				1585	int latin1_encoding_error(const Py_UNICODE **source,
				1586	char **dest,
				1587	const char *errors,
				1588	const char *details)
				1589	{
				1590	if ((errors == NULL) \|\|
				1591	(strcmp(errors,"strict") == 0)) {
				1592	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1593	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1594	details);
				1595	return -1;
				1596	}
				1597	else if (strcmp(errors,"ignore") == 0) {
				1598	return 0;
				1599	}
				1600	else if (strcmp(errors,"replace") == 0) {
				1601	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1602	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1603	return 0;
				1604	}
				1605	else {
				1606	PyErr_Format(PyExc_ValueError,
				1607	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1608	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1609	errors);
				1610	return -1;
				1611	}
				1612	}
				1613
				1614	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1615	int size,
				1616	const char *errors)
				1617	{
				1618	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1619	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1620
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1621	repr = PyString_FromStringAndSize(NULL, size);
				1622	if (repr == NULL)
				1623	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1624	if (size == 0)
				1625	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1626
				1627	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1628	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1629	while (size-- > 0) {
				1630	Py_UNICODE ch = *p++;
				1631	if (ch >= 256) {
				1632	if (latin1_encoding_error(&p, &s, errors,
				1633	"ordinal not in range(256)"))
				1634	goto onError;
				1635	}
				1636	else
				1637	*s++ = (char)ch;
				1638	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1639	/* Resize if error handling skipped some characters */
				1640	if (s - start < PyString_GET_SIZE(repr))
				1641	if (_PyString_Resize(&repr, s - start))
				1642	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1643	return repr;
				1644
				1645	onError:
				1646	Py_DECREF(repr);
				1647	return NULL;
				1648	}
				1649
				1650	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1651	{
				1652	if (!PyUnicode_Check(unicode)) {
				1653	PyErr_BadArgument();
				1654	return NULL;
				1655	}
				1656	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1657	PyUnicode_GET_SIZE(unicode),
				1658	NULL);
				1659	}
				1660
				1661	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1662
				1663	static
				1664	int ascii_decoding_error(const char **source,
				1665	Py_UNICODE **dest,
				1666	const char *errors,
				1667	const char *details)
				1668	{
				1669	if ((errors == NULL) \|\|
				1670	(strcmp(errors,"strict") == 0)) {
				1671	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1672	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1673	details);
				1674	return -1;
				1675	}
				1676	else if (strcmp(errors,"ignore") == 0) {
				1677	return 0;
				1678	}
				1679	else if (strcmp(errors,"replace") == 0) {
				1680	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1681	(*dest)++;
				1682	return 0;
				1683	}
				1684	else {
				1685	PyErr_Format(PyExc_ValueError,
				1686	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1687	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1688	errors);
				1689	return -1;
				1690	}
				1691	}
				1692
				1693	PyObject PyUnicode_DecodeASCII(const char s,
				1694	int size,
				1695	const char *errors)
				1696	{
				1697	PyUnicodeObject *v;
				1698	Py_UNICODE *p;
				1699
				1700	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1701	v = _PyUnicode_New(size);
				1702	if (v == NULL)
				1703	goto onError;
				1704	if (size == 0)
				1705	return (PyObject *)v;
				1706	p = PyUnicode_AS_UNICODE(v);
				1707	while (size-- > 0) {
				1708	register unsigned char c;
				1709
				1710	c = (unsigned char)*s++;
				1711	if (c < 128)
				1712	*p++ = c;
				1713	else if (ascii_decoding_error(&s, &p, errors,
				1714	"ordinal not in range(128)"))
				1715	goto onError;
				1716	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1717	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1718	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1719	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1720	return (PyObject *)v;
				1721
				1722	onError:
				1723	Py_XDECREF(v);
				1724	return NULL;
				1725	}
				1726
				1727	static
				1728	int ascii_encoding_error(const Py_UNICODE **source,
				1729	char **dest,
				1730	const char *errors,
				1731	const char *details)
				1732	{
				1733	if ((errors == NULL) \|\|
				1734	(strcmp(errors,"strict") == 0)) {
				1735	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1736	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1737	details);
				1738	return -1;
				1739	}
				1740	else if (strcmp(errors,"ignore") == 0) {
				1741	return 0;
				1742	}
				1743	else if (strcmp(errors,"replace") == 0) {
				1744	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1745	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1746	return 0;
				1747	}
				1748	else {
				1749	PyErr_Format(PyExc_ValueError,
				1750	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1751	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1752	errors);
				1753	return -1;
				1754	}
				1755	}
				1756
				1757	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1758	int size,
				1759	const char *errors)
				1760	{
				1761	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1762	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1763
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1764	repr = PyString_FromStringAndSize(NULL, size);
				1765	if (repr == NULL)
				1766	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1767	if (size == 0)
				1768	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1769
				1770	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1771	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1772	while (size-- > 0) {
				1773	Py_UNICODE ch = *p++;
				1774	if (ch >= 128) {
				1775	if (ascii_encoding_error(&p, &s, errors,
				1776	"ordinal not in range(128)"))
				1777	goto onError;
				1778	}
				1779	else
				1780	*s++ = (char)ch;
				1781	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1782	/* Resize if error handling skipped some characters */
				1783	if (s - start < PyString_GET_SIZE(repr))
				1784	if (_PyString_Resize(&repr, s - start))
				1785	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1786	return repr;
				1787
				1788	onError:
				1789	Py_DECREF(repr);
				1790	return NULL;
				1791	}
				1792
				1793	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1794	{
				1795	if (!PyUnicode_Check(unicode)) {
				1796	PyErr_BadArgument();
				1797	return NULL;
				1798	}
				1799	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1800	PyUnicode_GET_SIZE(unicode),
				1801	NULL);
				1802	}
				1803
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1804	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1805
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1806	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1807
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1808	PyObject PyUnicode_DecodeMBCS(const char s,
				1809	int size,
				1810	const char *errors)
				1811	{
				1812	PyUnicodeObject *v;
				1813	Py_UNICODE *p;
				1814
				1815	/* First get the size of the result */
				1816	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1817	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1818	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1819
				1820	v = _PyUnicode_New(usize);
				1821	if (v == NULL)
				1822	return NULL;
				1823	if (usize == 0)
				1824	return (PyObject *)v;
				1825	p = PyUnicode_AS_UNICODE(v);
				1826	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1827	Py_DECREF(v);
				1828	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1829	}
				1830
				1831	return (PyObject *)v;
				1832	}
				1833
				1834	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1835	int size,
				1836	const char *errors)
				1837	{
				1838	PyObject *repr;
				1839	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1840	DWORD mbcssize;
				1841
				1842	/* If there are no characters, bail now! */
				1843	if (size==0)
				1844	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1845
				1846	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1847	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1848	if (mbcssize==0)
				1849	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1850
				1851	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1852	if (repr == NULL)
				1853	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1854	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1855	return repr;
				1856
				1857	/* Do the conversion */
				1858	s = PyString_AS_STRING(repr);
				1859	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1860	Py_DECREF(repr);
				1861	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1862	}
				1863	return repr;
				1864	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1865
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1866	#endif /* MS_WIN32 */
				1867
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1868	/* --- Character Mapping Codec -------------------------------------------- */
				1869
				1870	static
				1871	int charmap_decoding_error(const char **source,
				1872	Py_UNICODE **dest,
				1873	const char *errors,
				1874	const char *details)
				1875	{
				1876	if ((errors == NULL) \|\|
				1877	(strcmp(errors,"strict") == 0)) {
				1878	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1879	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1880	details);
				1881	return -1;
				1882	}
				1883	else if (strcmp(errors,"ignore") == 0) {
				1884	return 0;
				1885	}
				1886	else if (strcmp(errors,"replace") == 0) {
				1887	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1888	(*dest)++;
				1889	return 0;
				1890	}
				1891	else {
				1892	PyErr_Format(PyExc_ValueError,
				1893	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1894	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1895	errors);
				1896	return -1;
				1897	}
				1898	}
				1899
				1900	PyObject PyUnicode_DecodeCharmap(const char s,
				1901	int size,
				1902	PyObject *mapping,
				1903	const char *errors)
				1904	{
				1905	PyUnicodeObject *v;
				1906	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1907	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1908
				1909	/* Default to Latin-1 */
				1910	if (mapping == NULL)
				1911	return PyUnicode_DecodeLatin1(s, size, errors);
				1912
				1913	v = _PyUnicode_New(size);
				1914	if (v == NULL)
				1915	goto onError;
				1916	if (size == 0)
				1917	return (PyObject *)v;
				1918	p = PyUnicode_AS_UNICODE(v);
				1919	while (size-- > 0) {
				1920	unsigned char ch = *s++;
				1921	PyObject w, x;
				1922
				1923	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1924	w = PyInt_FromLong((long)ch);
				1925	if (w == NULL)
				1926	goto onError;
				1927	x = PyObject_GetItem(mapping, w);
				1928	Py_DECREF(w);
				1929	if (x == NULL) {
				1930	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1931	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1932	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1933	x = Py_None;
				1934	Py_INCREF(x);
				1935	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1936	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1937	}
				1938
				1939	/* Apply mapping */
				1940	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1941	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1942	if (value < 0 \|\| value > 65535) {
				1943	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1944	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1945	Py_DECREF(x);
				1946	goto onError;
				1947	}
				1948	*p++ = (Py_UNICODE)value;
				1949	}
				1950	else if (x == Py_None) {
				1951	/* undefined mapping */
				1952	if (charmap_decoding_error(&s, &p, errors,
				1953	"character maps to <undefined>")) {
				1954	Py_DECREF(x);
				1955	goto onError;
				1956	}
				1957	}
				1958	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1959	int targetsize = PyUnicode_GET_SIZE(x);
				1960
				1961	if (targetsize == 1)
				1962	/* 1-1 mapping */
				1963	p++ = PyUnicode_AS_UNICODE(x);
				1964
				1965	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1966	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1967	if (targetsize > extrachars) {
				1968	/* resize first */
				1969	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				1970	int needed = (targetsize - extrachars) + \
				1971	(targetsize << 2);
				1972	extrachars += needed;
				1973	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1974	Py_DECREF(x);
				1975	goto onError;
				1976	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1977	p = PyUnicode_AS_UNICODE(v) + oldpos;
				1978	}
				1979	Py_UNICODE_COPY(p,
				1980	PyUnicode_AS_UNICODE(x),
				1981	targetsize);
				1982	p += targetsize;
				1983	extrachars -= targetsize;
				1984	}
				1985	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1986	}
				1987	else {
				1988	/* wrong return value */
				1989	PyErr_SetString(PyExc_TypeError,
				1990	"character mapping must return integer, None or unicode");
				1991	Py_DECREF(x);
				1992	goto onError;
				1993	}
				1994	Py_DECREF(x);
				1995	}
				1996	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1997	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1998	goto onError;
				1999	return (PyObject *)v;
				2000
				2001	onError:
				2002	Py_XDECREF(v);
				2003	return NULL;
				2004	}
				2005
				2006	static
				2007	int charmap_encoding_error(const Py_UNICODE **source,
				2008	char **dest,
				2009	const char *errors,
				2010	const char *details)
				2011	{
				2012	if ((errors == NULL) \|\|
				2013	(strcmp(errors,"strict") == 0)) {
				2014	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2015	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2016	details);
				2017	return -1;
				2018	}
				2019	else if (strcmp(errors,"ignore") == 0) {
				2020	return 0;
				2021	}
				2022	else if (strcmp(errors,"replace") == 0) {
				2023	**dest = '?';
				2024	(*dest)++;
				2025	return 0;
				2026	}
				2027	else {
				2028	PyErr_Format(PyExc_ValueError,
				2029	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2030	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2031	errors);
				2032	return -1;
				2033	}
				2034	}
				2035
				2036	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2037	int size,
				2038	PyObject *mapping,
				2039	const char *errors)
				2040	{
				2041	PyObject *v;
				2042	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2043	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2044
				2045	/* Default to Latin-1 */
				2046	if (mapping == NULL)
				2047	return PyUnicode_EncodeLatin1(p, size, errors);
				2048
				2049	v = PyString_FromStringAndSize(NULL, size);
				2050	if (v == NULL)
				2051	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2052	if (size == 0)
				2053	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2054	s = PyString_AS_STRING(v);
				2055	while (size-- > 0) {
				2056	Py_UNICODE ch = *p++;
				2057	PyObject w, x;
				2058
				2059	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2060	w = PyInt_FromLong((long)ch);
				2061	if (w == NULL)
				2062	goto onError;
				2063	x = PyObject_GetItem(mapping, w);
				2064	Py_DECREF(w);
				2065	if (x == NULL) {
				2066	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2067	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2068	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2069	x = Py_None;
				2070	Py_INCREF(x);
				2071	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2072	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2073	}
				2074
				2075	/* Apply mapping */
				2076	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2077	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2078	if (value < 0 \|\| value > 255) {
				2079	PyErr_SetString(PyExc_TypeError,
				2080	"character mapping must be in range(256)");
				2081	Py_DECREF(x);
				2082	goto onError;
				2083	}
				2084	*s++ = (char)value;
				2085	}
				2086	else if (x == Py_None) {
				2087	/* undefined mapping */
				2088	if (charmap_encoding_error(&p, &s, errors,
				2089	"character maps to <undefined>")) {
				2090	Py_DECREF(x);
				2091	goto onError;
				2092	}
				2093	}
				2094	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2095	int targetsize = PyString_GET_SIZE(x);
				2096
				2097	if (targetsize == 1)
				2098	/* 1-1 mapping */
				2099	s++ = PyString_AS_STRING(x);
				2100
				2101	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2102	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2103	if (targetsize > extrachars) {
				2104	/* resize first */
				2105	int oldpos = (int)(s - PyString_AS_STRING(v));
				2106	int needed = (targetsize - extrachars) + \
				2107	(targetsize << 2);
				2108	extrachars += needed;
				2109	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2110	Py_DECREF(x);
				2111	goto onError;
				2112	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2113	s = PyString_AS_STRING(v) + oldpos;
				2114	}
				2115	memcpy(s,
				2116	PyString_AS_STRING(x),
				2117	targetsize);
				2118	s += targetsize;
				2119	extrachars -= targetsize;
				2120	}
				2121	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2122	}
				2123	else {
				2124	/* wrong return value */
				2125	PyErr_SetString(PyExc_TypeError,
				2126	"character mapping must return integer, None or unicode");
				2127	Py_DECREF(x);
				2128	goto onError;
				2129	}
				2130	Py_DECREF(x);
				2131	}
				2132	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2133	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2134	goto onError;
				2135	return v;
				2136
				2137	onError:
				2138	Py_DECREF(v);
				2139	return NULL;
				2140	}
				2141
				2142	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2143	PyObject *mapping)
				2144	{
				2145	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2146	PyErr_BadArgument();
				2147	return NULL;
				2148	}
				2149	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2150	PyUnicode_GET_SIZE(unicode),
				2151	mapping,
				2152	NULL);
				2153	}
				2154
				2155	static
				2156	int translate_error(const Py_UNICODE **source,
				2157	Py_UNICODE **dest,
				2158	const char *errors,
				2159	const char *details)
				2160	{
				2161	if ((errors == NULL) \|\|
				2162	(strcmp(errors,"strict") == 0)) {
				2163	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2164	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2165	details);
				2166	return -1;
				2167	}
				2168	else if (strcmp(errors,"ignore") == 0) {
				2169	return 0;
				2170	}
				2171	else if (strcmp(errors,"replace") == 0) {
				2172	**dest = '?';
				2173	(*dest)++;
				2174	return 0;
				2175	}
				2176	else {
				2177	PyErr_Format(PyExc_ValueError,
				2178	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2179	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2180	errors);
				2181	return -1;
				2182	}
				2183	}
				2184
				2185	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2186	int size,
				2187	PyObject *mapping,
				2188	const char *errors)
				2189	{
				2190	PyUnicodeObject *v;
				2191	Py_UNICODE *p;
				2192
				2193	if (mapping == NULL) {
				2194	PyErr_BadArgument();
				2195	return NULL;
				2196	}
				2197
				2198	/* Output will never be longer than input */
				2199	v = _PyUnicode_New(size);
				2200	if (v == NULL)
				2201	goto onError;
				2202	if (size == 0)
				2203	goto done;
				2204	p = PyUnicode_AS_UNICODE(v);
				2205	while (size-- > 0) {
				2206	Py_UNICODE ch = *s++;
				2207	PyObject w, x;
				2208
				2209	/* Get mapping */
				2210	w = PyInt_FromLong(ch);
				2211	if (w == NULL)
				2212	goto onError;
				2213	x = PyObject_GetItem(mapping, w);
				2214	Py_DECREF(w);
				2215	if (x == NULL) {
				2216	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2217	/* No mapping found: default to 1-1 mapping */
				2218	PyErr_Clear();
				2219	*p++ = ch;
				2220	continue;
				2221	}
				2222	goto onError;
				2223	}
				2224
				2225	/* Apply mapping */
				2226	if (PyInt_Check(x))
				2227	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2228	else if (x == Py_None) {
				2229	/* undefined mapping */
				2230	if (translate_error(&s, &p, errors,
				2231	"character maps to <undefined>")) {
				2232	Py_DECREF(x);
				2233	goto onError;
				2234	}
				2235	}
				2236	else if (PyUnicode_Check(x)) {
				2237	if (PyUnicode_GET_SIZE(x) != 1) {
				2238	/* 1-n mapping */
				2239	PyErr_SetString(PyExc_NotImplementedError,
				2240	"1-n mappings are currently not implemented");
				2241	Py_DECREF(x);
				2242	goto onError;
				2243	}
				2244	p++ = PyUnicode_AS_UNICODE(x);
				2245	}
				2246	else {
				2247	/* wrong return value */
				2248	PyErr_SetString(PyExc_TypeError,
				2249	"translate mapping must return integer, None or unicode");
				2250	Py_DECREF(x);
				2251	goto onError;
				2252	}
				2253	Py_DECREF(x);
				2254	}
				2255	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2256	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2257	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2258
				2259	done:
				2260	return (PyObject *)v;
				2261
				2262	onError:
				2263	Py_XDECREF(v);
				2264	return NULL;
				2265	}
				2266
				2267	PyObject PyUnicode_Translate(PyObject str,
				2268	PyObject *mapping,
				2269	const char *errors)
				2270	{
				2271	PyObject *result;
				2272
				2273	str = PyUnicode_FromObject(str);
				2274	if (str == NULL)
				2275	goto onError;
				2276	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2277	PyUnicode_GET_SIZE(str),
				2278	mapping,
				2279	errors);
				2280	Py_DECREF(str);
				2281	return result;
				2282
				2283	onError:
				2284	Py_XDECREF(str);
				2285	return NULL;
				2286	}
				2287
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2288	/* --- Decimal Encoder ---------------------------------------------------- */
				2289
				2290	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2291	int length,
				2292	char *output,
				2293	const char *errors)
				2294	{
				2295	Py_UNICODE p, end;
				2296
				2297	if (output == NULL) {
				2298	PyErr_BadArgument();
				2299	return -1;
				2300	}
				2301
				2302	p = s;
				2303	end = s + length;
				2304	while (p < end) {
				2305	register Py_UNICODE ch = *p++;
				2306	int decimal;
				2307
				2308	if (Py_UNICODE_ISSPACE(ch)) {
				2309	*output++ = ' ';
				2310	continue;
				2311	}
				2312	decimal = Py_UNICODE_TODECIMAL(ch);
				2313	if (decimal >= 0) {
				2314	*output++ = '0' + decimal;
				2315	continue;
				2316	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2317	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2318	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2319	continue;
				2320	}
				2321	/* All other characters are considered invalid */
				2322	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2323	PyErr_SetString(PyExc_ValueError,
				2324	"invalid decimal Unicode string");
				2325	goto onError;
				2326	}
				2327	else if (strcmp(errors, "ignore") == 0)
				2328	continue;
				2329	else if (strcmp(errors, "replace") == 0) {
				2330	*output++ = '?';
				2331	continue;
				2332	}
				2333	}
				2334	/* 0-terminate the output string */
				2335	*output++ = '\0';
				2336	return 0;
				2337
				2338	onError:
				2339	return -1;
				2340	}
				2341
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2342	/* --- Helpers ------------------------------------------------------------ */
				2343
				2344	static
				2345	int count(PyUnicodeObject *self,
				2346	int start,
				2347	int end,
				2348	PyUnicodeObject *substring)
				2349	{
				2350	int count = 0;
				2351
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2352	if (start < 0)
				2353	start += self->length;
				2354	if (start < 0)
				2355	start = 0;
				2356	if (end > self->length)
				2357	end = self->length;
				2358	if (end < 0)
				2359	end += self->length;
				2360	if (end < 0)
				2361	end = 0;
				2362
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2363	if (substring->length == 0)
				2364	return (end - start + 1);
				2365
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2366	end -= substring->length;
				2367
				2368	while (start <= end)
				2369	if (Py_UNICODE_MATCH(self, start, substring)) {
				2370	count++;
				2371	start += substring->length;
				2372	} else
				2373	start++;
				2374
				2375	return count;
				2376	}
				2377
				2378	int PyUnicode_Count(PyObject *str,
				2379	PyObject *substr,
				2380	int start,
				2381	int end)
				2382	{
				2383	int result;
				2384
				2385	str = PyUnicode_FromObject(str);
				2386	if (str == NULL)
				2387	return -1;
				2388	substr = PyUnicode_FromObject(substr);
				2389	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2390	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2391	return -1;
				2392	}
				2393
				2394	result = count((PyUnicodeObject *)str,
				2395	start, end,
				2396	(PyUnicodeObject *)substr);
				2397
				2398	Py_DECREF(str);
				2399	Py_DECREF(substr);
				2400	return result;
				2401	}
				2402
				2403	static
				2404	int findstring(PyUnicodeObject *self,
				2405	PyUnicodeObject *substring,
				2406	int start,
				2407	int end,
				2408	int direction)
				2409	{
				2410	if (start < 0)
				2411	start += self->length;
				2412	if (start < 0)
				2413	start = 0;
				2414
				2415	if (substring->length == 0)
				2416	return start;
				2417
				2418	if (end > self->length)
				2419	end = self->length;
				2420	if (end < 0)
				2421	end += self->length;
				2422	if (end < 0)
				2423	end = 0;
				2424
				2425	end -= substring->length;
				2426
				2427	if (direction < 0) {
				2428	for (; end >= start; end--)
				2429	if (Py_UNICODE_MATCH(self, end, substring))
				2430	return end;
				2431	} else {
				2432	for (; start <= end; start++)
				2433	if (Py_UNICODE_MATCH(self, start, substring))
				2434	return start;
				2435	}
				2436
				2437	return -1;
				2438	}
				2439
				2440	int PyUnicode_Find(PyObject *str,
				2441	PyObject *substr,
				2442	int start,
				2443	int end,
				2444	int direction)
				2445	{
				2446	int result;
				2447
				2448	str = PyUnicode_FromObject(str);
				2449	if (str == NULL)
				2450	return -1;
				2451	substr = PyUnicode_FromObject(substr);
				2452	if (substr == NULL) {
				2453	Py_DECREF(substr);
				2454	return -1;
				2455	}
				2456
				2457	result = findstring((PyUnicodeObject *)str,
				2458	(PyUnicodeObject *)substr,
				2459	start, end, direction);
				2460	Py_DECREF(str);
				2461	Py_DECREF(substr);
				2462	return result;
				2463	}
				2464
				2465	static
				2466	int tailmatch(PyUnicodeObject *self,
				2467	PyUnicodeObject *substring,
				2468	int start,
				2469	int end,
				2470	int direction)
				2471	{
				2472	if (start < 0)
				2473	start += self->length;
				2474	if (start < 0)
				2475	start = 0;
				2476
				2477	if (substring->length == 0)
				2478	return 1;
				2479
				2480	if (end > self->length)
				2481	end = self->length;
				2482	if (end < 0)
				2483	end += self->length;
				2484	if (end < 0)
				2485	end = 0;
				2486
				2487	end -= substring->length;
				2488	if (end < start)
				2489	return 0;
				2490
				2491	if (direction > 0) {
				2492	if (Py_UNICODE_MATCH(self, end, substring))
				2493	return 1;
				2494	} else {
				2495	if (Py_UNICODE_MATCH(self, start, substring))
				2496	return 1;
				2497	}
				2498
				2499	return 0;
				2500	}
				2501
				2502	int PyUnicode_Tailmatch(PyObject *str,
				2503	PyObject *substr,
				2504	int start,
				2505	int end,
				2506	int direction)
				2507	{
				2508	int result;
				2509
				2510	str = PyUnicode_FromObject(str);
				2511	if (str == NULL)
				2512	return -1;
				2513	substr = PyUnicode_FromObject(substr);
				2514	if (substr == NULL) {
				2515	Py_DECREF(substr);
				2516	return -1;
				2517	}
				2518
				2519	result = tailmatch((PyUnicodeObject *)str,
				2520	(PyUnicodeObject *)substr,
				2521	start, end, direction);
				2522	Py_DECREF(str);
				2523	Py_DECREF(substr);
				2524	return result;
				2525	}
				2526
				2527	static
				2528	const Py_UNICODE findchar(const Py_UNICODE s,
				2529	int size,
				2530	Py_UNICODE ch)
				2531	{
				2532	/* like wcschr, but doesn't stop at NULL characters */
				2533
				2534	while (size-- > 0) {
				2535	if (*s == ch)
				2536	return s;
				2537	s++;
				2538	}
				2539
				2540	return NULL;
				2541	}
				2542
				2543	/* Apply fixfct filter to the Unicode object self and return a
				2544	reference to the modified object */
				2545
				2546	static
				2547	PyObject fixup(PyUnicodeObject self,
				2548	int (fixfct)(PyUnicodeObject s))
				2549	{
				2550
				2551	PyUnicodeObject *u;
				2552
				2553	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2554	self->length);
				2555	if (u == NULL)
				2556	return NULL;
				2557	if (!fixfct(u)) {
				2558	/* fixfct should return TRUE if it modified the buffer. If
				2559	FALSE, return a reference to the original buffer instead
				2560	(to save space, not time) */
				2561	Py_INCREF(self);
				2562	Py_DECREF(u);
				2563	return (PyObject*) self;
				2564	}
				2565	return (PyObject*) u;
				2566	}
				2567
				2568	static
				2569	int fixupper(PyUnicodeObject *self)
				2570	{
				2571	int len = self->length;
				2572	Py_UNICODE *s = self->str;
				2573	int status = 0;
				2574
				2575	while (len-- > 0) {
				2576	register Py_UNICODE ch;
				2577
				2578	ch = Py_UNICODE_TOUPPER(*s);
				2579	if (ch != *s) {
				2580	status = 1;
				2581	*s = ch;
				2582	}
				2583	s++;
				2584	}
				2585
				2586	return status;
				2587	}
				2588
				2589	static
				2590	int fixlower(PyUnicodeObject *self)
				2591	{
				2592	int len = self->length;
				2593	Py_UNICODE *s = self->str;
				2594	int status = 0;
				2595
				2596	while (len-- > 0) {
				2597	register Py_UNICODE ch;
				2598
				2599	ch = Py_UNICODE_TOLOWER(*s);
				2600	if (ch != *s) {
				2601	status = 1;
				2602	*s = ch;
				2603	}
				2604	s++;
				2605	}
				2606
				2607	return status;
				2608	}
				2609
				2610	static
				2611	int fixswapcase(PyUnicodeObject *self)
				2612	{
				2613	int len = self->length;
				2614	Py_UNICODE *s = self->str;
				2615	int status = 0;
				2616
				2617	while (len-- > 0) {
				2618	if (Py_UNICODE_ISUPPER(*s)) {
				2619	s = Py_UNICODE_TOLOWER(s);
				2620	status = 1;
				2621	} else if (Py_UNICODE_ISLOWER(*s)) {
				2622	s = Py_UNICODE_TOUPPER(s);
				2623	status = 1;
				2624	}
				2625	s++;
				2626	}
				2627
				2628	return status;
				2629	}
				2630
				2631	static
				2632	int fixcapitalize(PyUnicodeObject *self)
				2633	{
				2634	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2635	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2636	return 1;
				2637	}
				2638	return 0;
				2639	}
				2640
				2641	static
				2642	int fixtitle(PyUnicodeObject *self)
				2643	{
				2644	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2645	register Py_UNICODE *e;
				2646	int previous_is_cased;
				2647
				2648	/* Shortcut for single character strings */
				2649	if (PyUnicode_GET_SIZE(self) == 1) {
				2650	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2651	if (*p != ch) {
				2652	*p = ch;
				2653	return 1;
				2654	}
				2655	else
				2656	return 0;
				2657	}
				2658
				2659	e = p + PyUnicode_GET_SIZE(self);
				2660	previous_is_cased = 0;
				2661	for (; p < e; p++) {
				2662	register const Py_UNICODE ch = *p;
				2663
				2664	if (previous_is_cased)
				2665	*p = Py_UNICODE_TOLOWER(ch);
				2666	else
				2667	*p = Py_UNICODE_TOTITLE(ch);
				2668
				2669	if (Py_UNICODE_ISLOWER(ch) \|\|
				2670	Py_UNICODE_ISUPPER(ch) \|\|
				2671	Py_UNICODE_ISTITLE(ch))
				2672	previous_is_cased = 1;
				2673	else
				2674	previous_is_cased = 0;
				2675	}
				2676	return 1;
				2677	}
				2678
				2679	PyObject PyUnicode_Join(PyObject separator,
				2680	PyObject *seq)
				2681	{
				2682	Py_UNICODE *sep;
				2683	int seplen;
				2684	PyUnicodeObject *res = NULL;
				2685	int reslen = 0;
				2686	Py_UNICODE *p;
				2687	int seqlen = 0;
				2688	int sz = 100;
				2689	int i;
				2690
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2691	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2692	if (seqlen < 0 && PyErr_Occurred())
				2693	return NULL;
				2694
				2695	if (separator == NULL) {
				2696	Py_UNICODE blank = ' ';
				2697	sep = &blank;
				2698	seplen = 1;
				2699	}
				2700	else {
				2701	separator = PyUnicode_FromObject(separator);
				2702	if (separator == NULL)
				2703	return NULL;
				2704	sep = PyUnicode_AS_UNICODE(separator);
				2705	seplen = PyUnicode_GET_SIZE(separator);
				2706	}
				2707
				2708	res = _PyUnicode_New(sz);
				2709	if (res == NULL)
				2710	goto onError;
				2711	p = PyUnicode_AS_UNICODE(res);
				2712	reslen = 0;
				2713
				2714	for (i = 0; i < seqlen; i++) {
				2715	int itemlen;
				2716	PyObject *item;
				2717
				2718	item = PySequence_GetItem(seq, i);
				2719	if (item == NULL)
				2720	goto onError;
				2721	if (!PyUnicode_Check(item)) {
				2722	PyObject *v;
				2723	v = PyUnicode_FromObject(item);
				2724	Py_DECREF(item);
				2725	item = v;
				2726	if (item == NULL)
				2727	goto onError;
				2728	}
				2729	itemlen = PyUnicode_GET_SIZE(item);
				2730	while (reslen + itemlen + seplen >= sz) {
				2731	if (_PyUnicode_Resize(res, sz*2))
				2732	goto onError;
				2733	sz *= 2;
				2734	p = PyUnicode_AS_UNICODE(res) + reslen;
				2735	}
				2736	if (i > 0) {
				2737	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2738	p += seplen;
				2739	reslen += seplen;
				2740	}
				2741	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2742	p += itemlen;
				2743	reslen += itemlen;
				2744	Py_DECREF(item);
				2745	}
				2746	if (_PyUnicode_Resize(res, reslen))
				2747	goto onError;
				2748
				2749	Py_XDECREF(separator);
				2750	return (PyObject *)res;
				2751
				2752	onError:
				2753	Py_XDECREF(separator);
				2754	Py_DECREF(res);
				2755	return NULL;
				2756	}
				2757
				2758	static
				2759	PyUnicodeObject pad(PyUnicodeObject self,
				2760	int left,
				2761	int right,
				2762	Py_UNICODE fill)
				2763	{
				2764	PyUnicodeObject *u;
				2765
				2766	if (left < 0)
				2767	left = 0;
				2768	if (right < 0)
				2769	right = 0;
				2770
				2771	if (left == 0 && right == 0) {
				2772	Py_INCREF(self);
				2773	return self;
				2774	}
				2775
				2776	u = _PyUnicode_New(left + self->length + right);
				2777	if (u) {
				2778	if (left)
				2779	Py_UNICODE_FILL(u->str, fill, left);
				2780	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2781	if (right)
				2782	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2783	}
				2784
				2785	return u;
				2786	}
				2787
				/* Append the slice data[left:right] as a new Unicode object to the
				   list being built.

				   The macro relies on three names in the expanding function: a
				   PyObject *str scratch variable, the target list, and an onError
				   label that is jumped to when either the slice cannot be created or
				   the append fails.  The temporary reference in str is released in
				   both the success and the failure case.

				   Wrapped in do { } while (0) so that an invocation followed by a
				   semicolon is exactly one statement. */
				#define SPLIT_APPEND(data, left, right)                                 \
				    do {                                                                \
				        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
				        if (!str)                                                       \
				            goto onError;                                               \
				        if (PyList_Append(list, str)) {                                 \
				            Py_DECREF(str);                                             \
				            goto onError;                                               \
				        }                                                               \
				        else                                                            \
				            Py_DECREF(str);                                             \
				    } while (0)
				2798
				2799	static
				2800	PyObject split_whitespace(PyUnicodeObject self,
				2801	PyObject *list,
				2802	int maxcount)
				2803	{
				2804	register int i;
				2805	register int j;
				2806	int len = self->length;
				2807	PyObject *str;
				2808
				2809	for (i = j = 0; i < len; ) {
				2810	/* find a token */
				2811	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2812	i++;
				2813	j = i;
				2814	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2815	i++;
				2816	if (j < i) {
				2817	if (maxcount-- <= 0)
				2818	break;
				2819	SPLIT_APPEND(self->str, j, i);
				2820	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2821	i++;
				2822	j = i;
				2823	}
				2824	}
				2825	if (j < len) {
				2826	SPLIT_APPEND(self->str, j, len);
				2827	}
				2828	return list;
				2829
				2830	onError:
				2831	Py_DECREF(list);
				2832	return NULL;
				2833	}
				2834
				2835	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2836	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2837	{
				2838	register int i;
				2839	register int j;
				2840	int len;
				2841	PyObject *list;
				2842	PyObject *str;
				2843	Py_UNICODE *data;
				2844
				2845	string = PyUnicode_FromObject(string);
				2846	if (string == NULL)
				2847	return NULL;
				2848	data = PyUnicode_AS_UNICODE(string);
				2849	len = PyUnicode_GET_SIZE(string);
				2850
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2851	list = PyList_New(0);
				2852	if (!list)
				2853	goto onError;
				2854
				2855	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2856	int eol;
				2857
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2858	/* Find a line and append it */
				2859	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2860	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2861
				2862	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2863	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2864	if (i < len) {
				2865	if (data[i] == '\r' && i + 1 < len &&
				2866	data[i+1] == '\n')
				2867	i += 2;
				2868	else
				2869	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2870	if (keepends)
				2871	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2872	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2873	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2874	j = i;
				2875	}
				2876	if (j < len) {
				2877	SPLIT_APPEND(data, j, len);
				2878	}
				2879
				2880	Py_DECREF(string);
				2881	return list;
				2882
				2883	onError:
				2884	Py_DECREF(list);
				2885	Py_DECREF(string);
				2886	return NULL;
				2887	}
				2888
				2889	static
				2890	PyObject split_char(PyUnicodeObject self,
				2891	PyObject *list,
				2892	Py_UNICODE ch,
				2893	int maxcount)
				2894	{
				2895	register int i;
				2896	register int j;
				2897	int len = self->length;
				2898	PyObject *str;
				2899
				2900	for (i = j = 0; i < len; ) {
				2901	if (self->str[i] == ch) {
				2902	if (maxcount-- <= 0)
				2903	break;
				2904	SPLIT_APPEND(self->str, j, i);
				2905	i = j = i + 1;
				2906	} else
				2907	i++;
				2908	}
				2909	if (j <= len) {
				2910	SPLIT_APPEND(self->str, j, len);
				2911	}
				2912	return list;
				2913
				2914	onError:
				2915	Py_DECREF(list);
				2916	return NULL;
				2917	}
				2918
				2919	static
				2920	PyObject split_substring(PyUnicodeObject self,
				2921	PyObject *list,
				2922	PyUnicodeObject *substring,
				2923	int maxcount)
				2924	{
				2925	register int i;
				2926	register int j;
				2927	int len = self->length;
				2928	int sublen = substring->length;
				2929	PyObject *str;
				2930
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2931	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2932	if (Py_UNICODE_MATCH(self, i, substring)) {
				2933	if (maxcount-- <= 0)
				2934	break;
				2935	SPLIT_APPEND(self->str, j, i);
				2936	i = j = i + sublen;
				2937	} else
				2938	i++;
				2939	}
				2940	if (j <= len) {
				2941	SPLIT_APPEND(self->str, j, len);
				2942	}
				2943	return list;
				2944
				2945	onError:
				2946	Py_DECREF(list);
				2947	return NULL;
				2948	}
				2949
				2950	#undef SPLIT_APPEND
				2951
				2952	static
				2953	PyObject split(PyUnicodeObject self,
				2954	PyUnicodeObject *substring,
				2955	int maxcount)
				2956	{
				2957	PyObject *list;
				2958
				2959	if (maxcount < 0)
				2960	maxcount = INT_MAX;
				2961
				2962	list = PyList_New(0);
				2963	if (!list)
				2964	return NULL;
				2965
				2966	if (substring == NULL)
				2967	return split_whitespace(self,list,maxcount);
				2968
				2969	else if (substring->length == 1)
				2970	return split_char(self,list,substring->str[0],maxcount);
				2971
				2972	else if (substring->length == 0) {
				2973	Py_DECREF(list);
				2974	PyErr_SetString(PyExc_ValueError, "empty separator");
				2975	return NULL;
				2976	}
				2977	else
				2978	return split_substring(self,list,substring,maxcount);
				2979	}
				2980
				2981	static
				2982	PyObject strip(PyUnicodeObject self,
				2983	int left,
				2984	int right)
				2985	{
				2986	Py_UNICODE *p = self->str;
				2987	int start = 0;
				2988	int end = self->length;
				2989
				2990	if (left)
				2991	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2992	start++;
				2993
				2994	if (right)
				2995	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2996	end--;
				2997
				2998	if (start == 0 && end == self->length) {
				2999	/* couldn't strip anything off, return original string */
				3000	Py_INCREF(self);
				3001	return (PyObject*) self;
				3002	}
				3003
				3004	return (PyObject*) PyUnicode_FromUnicode(
				3005	self->str + start,
				3006	end - start
				3007	);
				3008	}
				3009
				3010	static
				3011	PyObject replace(PyUnicodeObject self,
				3012	PyUnicodeObject *str1,
				3013	PyUnicodeObject *str2,
				3014	int maxcount)
				3015	{
				3016	PyUnicodeObject *u;
				3017
				3018	if (maxcount < 0)
				3019	maxcount = INT_MAX;
				3020
				3021	if (str1->length == 1 && str2->length == 1) {
				3022	int i;
				3023
				3024	/* replace characters */
				3025	if (!findchar(self->str, self->length, str1->str[0])) {
				3026	/* nothing to replace, return original string */
				3027	Py_INCREF(self);
				3028	u = self;
				3029	} else {
				3030	Py_UNICODE u1 = str1->str[0];
				3031	Py_UNICODE u2 = str2->str[0];
				3032
				3033	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3034	self->str,
				3035	self->length
				3036	);
				3037	if (u)
				3038	for (i = 0; i < u->length; i++)
				3039	if (u->str[i] == u1) {
				3040	if (--maxcount < 0)
				3041	break;
				3042	u->str[i] = u2;
				3043	}
				3044	}
				3045
				3046	} else {
				3047	int n, i;
				3048	Py_UNICODE *p;
				3049
				3050	/* replace strings */
				3051	n = count(self, 0, self->length, str1);
				3052	if (n > maxcount)
				3053	n = maxcount;
				3054	if (n == 0) {
				3055	/* nothing to replace, return original string */
				3056	Py_INCREF(self);
				3057	u = self;
				3058	} else {
				3059	u = _PyUnicode_New(
				3060	self->length + n * (str2->length - str1->length));
				3061	if (u) {
				3062	i = 0;
				3063	p = u->str;
				3064	while (i <= self->length - str1->length)
				3065	if (Py_UNICODE_MATCH(self, i, str1)) {
				3066	/* replace string segment */
				3067	Py_UNICODE_COPY(p, str2->str, str2->length);
				3068	p += str2->length;
				3069	i += str1->length;
				3070	if (--n <= 0) {
				3071	/* copy remaining part */
				3072	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3073	break;
				3074	}
				3075	} else
				3076	*p++ = self->str[i++];
				3077	}
				3078	}
				3079	}
				3080
				3081	return (PyObject *) u;
				3082	}
				3083
				3084	/* --- Unicode Object Methods --------------------------------------------- */
				3085
				3086	static char title__doc__[] =
				3087	"S.title() -> unicode\n\
				3088	\n\
				3089	Return a titlecased version of S, i.e. words start with title case\n\
				3090	characters, all remaining cased characters have lower case.";
				3091
				3092	static PyObject*
				3093	unicode_title(PyUnicodeObject self, PyObject args)
				3094	{
				3095	if (!PyArg_NoArgs(args))
				3096	return NULL;
				3097	return fixup(self, fixtitle);
				3098	}
				3099
				3100	static char capitalize__doc__[] =
				3101	"S.capitalize() -> unicode\n\
				3102	\n\
				3103	Return a capitalized version of S, i.e. make the first character\n\
				3104	have upper case.";
				3105
				3106	static PyObject*
				3107	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3108	{
				3109	if (!PyArg_NoArgs(args))
				3110	return NULL;
				3111	return fixup(self, fixcapitalize);
				3112	}
				3113
				#if 0
				/* Disabled code: kept for reference, never compiled. */
				static char capwords__doc__[] =
				    "S.capwords() -> unicode\n"
				    "\n"
				    "Apply .capitalize() to all words in S and return the result with\n"
				    "normalized whitespace (all whitespace strings are replaced by ' ').";

				/* Method S.capwords(): split self at whitespace, run every word through
				   fixup(..., fixcapitalize) and join the words again with the default
				   separator of PyUnicode_Join().  Returns NULL on failure. */
				static PyObject*
				unicode_capwords(PyUnicodeObject *self, PyObject *args)
				{
				    PyObject *list;
				    PyObject *item;
				    int i;

				    if (!PyArg_NoArgs(args))
				        return NULL;

				    /* Split into words */
				    list = split(self, NULL, -1);
				    if (!list)
				        return NULL;

				    /* Capitalize each word */
				    for (i = 0; i < PyList_GET_SIZE(list); i++) {
				        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				                     fixcapitalize);
				        if (item == NULL)
				            goto onError;
				        /* Swap the old word for the capitalized one in place. */
				        Py_DECREF(PyList_GET_ITEM(list, i));
				        PyList_SET_ITEM(list, i, item);
				    }

				    /* Join the words to form a new string */
				    item = PyUnicode_Join(NULL, list);

				 onError:
				    /* Shared exit: item is NULL here when a word could not be fixed. */
				    Py_DECREF(list);
				    return (PyObject *)item;
				}
				#endif
				3154
				3155	static char center__doc__[] =
				3156	"S.center(width) -> unicode\n\
				3157	\n\
				3158	Return S centered in a Unicode string of length width. Padding is done\n\
				3159	using spaces.";
				3160
				3161	static PyObject *
				3162	unicode_center(PyUnicodeObject self, PyObject args)
				3163	{
				3164	int marg, left;
				3165	int width;
				3166
				3167	if (!PyArg_ParseTuple(args, "i:center", &width))
				3168	return NULL;
				3169
				3170	if (self->length >= width) {
				3171	Py_INCREF(self);
				3172	return (PyObject*) self;
				3173	}
				3174
				3175	marg = width - self->length;
				3176	left = marg / 2 + (marg & width & 1);
				3177
				3178	return (PyObject*) pad(self, left, marg - left, ' ');
				3179	}
				3180
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3181	#if 0
				3182
				3183	/* This code should go into some future Unicode collation support
				3184	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3185	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3186
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3187	/* speedy UTF-16 code point order comparison */
				3188	/* gleaned from: */
				3189	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3190
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3191	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3192	{
				3193	0, 0, 0, 0, 0, 0, 0, 0,
				3194	0, 0, 0, 0, 0, 0, 0, 0,
				3195	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3196	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3197	};
				3198
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3199	static int
				3200	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3201	{
				3202	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3203
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3204	Py_UNICODE *s1 = str1->str;
				3205	Py_UNICODE *s2 = str2->str;
				3206
				3207	len1 = str1->length;
				3208	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3209
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3210	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3211	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3212	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3213
				3214	c1 = *s1++;
				3215	c2 = *s2++;
				3216	if (c1 > (1<<11) * 26)
				3217	c1 += utf16Fixup[c1>>11];
				3218	if (c2 > (1<<11) * 26)
				3219	c2 += utf16Fixup[c2>>11];
				3220
				3221	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3222	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3223	if (diff)
				3224	return (diff < 0) ? -1 : (diff != 0);
				3225	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3226	}
				3227
				3228	return (len1 < len2) ? -1 : (len1 != len2);
				3229	}
				3230
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3231	#else
				3232
				3233	static int
				3234	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3235	{
				3236	register int len1, len2;
				3237
				3238	Py_UNICODE *s1 = str1->str;
				3239	Py_UNICODE *s2 = str2->str;
				3240
				3241	len1 = str1->length;
				3242	len2 = str2->length;
				3243
				3244	while (len1 > 0 && len2 > 0) {
				3245	register long diff;
				3246
				3247	diff = (long)s1++ - (long)s2++;
				3248	if (diff)
				3249	return (diff < 0) ? -1 : (diff != 0);
				3250	len1--; len2--;
				3251	}
				3252
				3253	return (len1 < len2) ? -1 : (len1 != len2);
				3254	}
				3255
				3256	#endif
				3257
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3258	int PyUnicode_Compare(PyObject *left,
				3259	PyObject *right)
				3260	{
				3261	PyUnicodeObject u = NULL, v = NULL;
				3262	int result;
				3263
				3264	/* Coerce the two arguments */
				3265	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3266	if (u == NULL)
				3267	goto onError;
				3268	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3269	if (v == NULL)
				3270	goto onError;
				3271
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3272	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3273	if (v == u) {
				3274	Py_DECREF(u);
				3275	Py_DECREF(v);
				3276	return 0;
				3277	}
				3278
				3279	result = unicode_compare(u, v);
				3280
				3281	Py_DECREF(u);
				3282	Py_DECREF(v);
				3283	return result;
				3284
				3285	onError:
				3286	Py_XDECREF(u);
				3287	Py_XDECREF(v);
				3288	return -1;
				3289	}
				3290
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3291	int PyUnicode_Contains(PyObject *container,
				3292	PyObject *element)
				3293	{
				3294	PyUnicodeObject u = NULL, v = NULL;
				3295	int result;
				3296	register const Py_UNICODE p, e;
				3297	register Py_UNICODE ch;
				3298
				3299	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3300	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3301	if (v == NULL) {
				3302	PyErr_SetString(PyExc_TypeError,
				3303	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3304	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3305	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3306	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3307	if (u == NULL) {
				3308	Py_DECREF(v);
				3309	goto onError;
				3310	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3311
				3312	/* Check v in u */
				3313	if (PyUnicode_GET_SIZE(v) != 1) {
				3314	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3315	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3316	goto onError;
				3317	}
				3318	ch = *PyUnicode_AS_UNICODE(v);
				3319	p = PyUnicode_AS_UNICODE(u);
				3320	e = p + PyUnicode_GET_SIZE(u);
				3321	result = 0;
				3322	while (p < e) {
				3323	if (*p++ == ch) {
				3324	result = 1;
				3325	break;
				3326	}
				3327	}
				3328
				3329	Py_DECREF(u);
				3330	Py_DECREF(v);
				3331	return result;
				3332
				3333	onError:
				3334	Py_XDECREF(u);
				3335	Py_XDECREF(v);
				3336	return -1;
				3337	}
				3338
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3339	/* Concat to string or Unicode object giving a new Unicode object. */
				3340
				3341	PyObject PyUnicode_Concat(PyObject left,
				3342	PyObject *right)
				3343	{
				3344	PyUnicodeObject u = NULL, v = NULL, *w;
				3345
				3346	/* Coerce the two arguments */
				3347	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3348	if (u == NULL)
				3349	goto onError;
				3350	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3351	if (v == NULL)
				3352	goto onError;
				3353
				3354	/* Shortcuts */
				3355	if (v == unicode_empty) {
				3356	Py_DECREF(v);
				3357	return (PyObject *)u;
				3358	}
				3359	if (u == unicode_empty) {
				3360	Py_DECREF(u);
				3361	return (PyObject *)v;
				3362	}
				3363
				3364	/* Concat the two Unicode strings */
				3365	w = _PyUnicode_New(u->length + v->length);
				3366	if (w == NULL)
				3367	goto onError;
				3368	Py_UNICODE_COPY(w->str, u->str, u->length);
				3369	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3370
				3371	Py_DECREF(u);
				3372	Py_DECREF(v);
				3373	return (PyObject *)w;
				3374
				3375	onError:
				3376	Py_XDECREF(u);
				3377	Py_XDECREF(v);
				3378	return NULL;
				3379	}
				3380
				3381	static char count__doc__[] =
				3382	"S.count(sub[, start[, end]]) -> int\n\
				3383	\n\
				3384	Return the number of occurrences of substring sub in Unicode string\n\
				3385	S[start:end]. Optional arguments start and end are\n\
				3386	interpreted as in slice notation.";
				3387
				3388	static PyObject *
				3389	unicode_count(PyUnicodeObject self, PyObject args)
				3390	{
				3391	PyUnicodeObject *substring;
				3392	int start = 0;
				3393	int end = INT_MAX;
				3394	PyObject *result;
				3395
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3396	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3397	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3398	return NULL;
				3399
				3400	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3401	(PyObject *)substring);
				3402	if (substring == NULL)
				3403	return NULL;
				3404
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3405	if (start < 0)
				3406	start += self->length;
				3407	if (start < 0)
				3408	start = 0;
				3409	if (end > self->length)
				3410	end = self->length;
				3411	if (end < 0)
				3412	end += self->length;
				3413	if (end < 0)
				3414	end = 0;
				3415
				3416	result = PyInt_FromLong((long) count(self, start, end, substring));
				3417
				3418	Py_DECREF(substring);
				3419	return result;
				3420	}
				3421
				3422	static char encode__doc__[] =
				3423	"S.encode([encoding[,errors]]) -> string\n\
				3424	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3425	Return an encoded string version of S. Default encoding is the current\n\
				3426	default string encoding. errors may be given to set a different error\n\
				3427	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3428	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3429
				3430	static PyObject *
				3431	unicode_encode(PyUnicodeObject self, PyObject args)
				3432	{
				3433	char *encoding = NULL;
				3434	char *errors = NULL;
				3435	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3436	return NULL;
				3437	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3438	}
				3439
				3440	static char expandtabs__doc__[] =
				3441	"S.expandtabs([tabsize]) -> unicode\n\
				3442	\n\
				3443	Return a copy of S where all tab characters are expanded using spaces.\n\
				3444	If tabsize is not given, a tab size of 8 characters is assumed.";
				3445
				3446	static PyObject*
				3447	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3448	{
				3449	Py_UNICODE *e;
				3450	Py_UNICODE *p;
				3451	Py_UNICODE *q;
				3452	int i, j;
				3453	PyUnicodeObject *u;
				3454	int tabsize = 8;
				3455
				3456	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3457	return NULL;
				3458
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3459	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3460	i = j = 0;
				3461	e = self->str + self->length;
				3462	for (p = self->str; p < e; p++)
				3463	if (*p == '\t') {
				3464	if (tabsize > 0)
				3465	j += tabsize - (j % tabsize);
				3466	}
				3467	else {
				3468	j++;
				3469	if (p == '\n' \|\| p == '\r') {
				3470	i += j;
				3471	j = 0;
				3472	}
				3473	}
				3474
				3475	/* Second pass: create output string and fill it */
				3476	u = _PyUnicode_New(i + j);
				3477	if (!u)
				3478	return NULL;
				3479
				3480	j = 0;
				3481	q = u->str;
				3482
				3483	for (p = self->str; p < e; p++)
				3484	if (*p == '\t') {
				3485	if (tabsize > 0) {
				3486	i = tabsize - (j % tabsize);
				3487	j += i;
				3488	while (i--)
				3489	*q++ = ' ';
				3490	}
				3491	}
				3492	else {
				3493	j++;
				3494	q++ = p;
				3495	if (p == '\n' \|\| p == '\r')
				3496	j = 0;
				3497	}
				3498
				3499	return (PyObject*) u;
				3500	}
				3501
				3502	static char find__doc__[] =
				3503	"S.find(sub [,start [,end]]) -> int\n\
				3504	\n\
				3505	Return the lowest index in S where substring sub is found,\n\
				3506	such that sub is contained within s[start,end]. Optional\n\
				3507	arguments start and end are interpreted as in slice notation.\n\
				3508	\n\
				3509	Return -1 on failure.";
				3510
				3511	static PyObject *
				3512	unicode_find(PyUnicodeObject self, PyObject args)
				3513	{
				3514	PyUnicodeObject *substring;
				3515	int start = 0;
				3516	int end = INT_MAX;
				3517	PyObject *result;
				3518
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3519	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3520	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3521	return NULL;
				3522	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3523	(PyObject *)substring);
				3524	if (substring == NULL)
				3525	return NULL;
				3526
				3527	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3528
				3529	Py_DECREF(substring);
				3530	return result;
				3531	}
				3532
				3533	static PyObject *
				3534	unicode_getitem(PyUnicodeObject *self, int index)
				3535	{
				3536	if (index < 0 \|\| index >= self->length) {
				3537	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3538	return NULL;
				3539	}
				3540
				3541	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3542	}
				3543
				3544	static long
				3545	unicode_hash(PyUnicodeObject *self)
				3546	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3547	/* Since Unicode objects compare equal to their ASCII string
				3548	counterparts, they should use the individual character values
				3549	as basis for their hash value. This is needed to assure that
				3550	strings and Unicode objects behave in the same way as
				3551	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3552
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3553	register int len;
				3554	register Py_UNICODE *p;
				3555	register long x;
				3556
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3557	if (self->hash != -1)
				3558	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3559	len = PyUnicode_GET_SIZE(self);
				3560	p = PyUnicode_AS_UNICODE(self);
				3561	x = *p << 7;
				3562	while (--len >= 0)
				3563	x = (1000003x) ^ p++;
				3564	x ^= PyUnicode_GET_SIZE(self);
				3565	if (x == -1)
				3566	x = -2;
				3567	self->hash = x;
				3568	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3569	}
				3570
				3571	static char index__doc__[] =
				3572	"S.index(sub [,start [,end]]) -> int\n\
				3573	\n\
				3574	Like S.find() but raise ValueError when the substring is not found.";
				3575
				3576	static PyObject *
				3577	unicode_index(PyUnicodeObject self, PyObject args)
				3578	{
				3579	int result;
				3580	PyUnicodeObject *substring;
				3581	int start = 0;
				3582	int end = INT_MAX;
				3583
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3584	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3585	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3586	return NULL;
				3587
				3588	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3589	(PyObject *)substring);
				3590	if (substring == NULL)
				3591	return NULL;
				3592
				3593	result = findstring(self, substring, start, end, 1);
				3594
				3595	Py_DECREF(substring);
				3596	if (result < 0) {
				3597	PyErr_SetString(PyExc_ValueError, "substring not found");
				3598	return NULL;
				3599	}
				3600	return PyInt_FromLong(result);
				3601	}
				3602
				3603	static char islower__doc__[] =
				3604	"S.islower() -> int\n\
				3605	\n\
				3606	Return 1 if all cased characters in S are lowercase and there is\n\
				3607	at least one cased character in S, 0 otherwise.";
				3608
				3609	static PyObject*
				3610	unicode_islower(PyUnicodeObject self, PyObject args)
				3611	{
				3612	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3613	register const Py_UNICODE *e;
				3614	int cased;
				3615
				3616	if (!PyArg_NoArgs(args))
				3617	return NULL;
				3618
				3619	/* Shortcut for single character strings */
				3620	if (PyUnicode_GET_SIZE(self) == 1)
				3621	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3622
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3623	/* Special case for empty strings */
				3624	if (PyString_GET_SIZE(self) == 0)
				3625	return PyInt_FromLong(0);
				3626
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3627	e = p + PyUnicode_GET_SIZE(self);
				3628	cased = 0;
				3629	for (; p < e; p++) {
				3630	register const Py_UNICODE ch = *p;
				3631
				3632	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3633	return PyInt_FromLong(0);
				3634	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3635	cased = 1;
				3636	}
				3637	return PyInt_FromLong(cased);
				3638	}
				3639
				3640	static char isupper__doc__[] =
				3641	"S.isupper() -> int\n\
				3642	\n\
				3643	Return 1 if all cased characters in S are uppercase and there is\n\
				3644	at least one cased character in S, 0 otherwise.";
				3645
				3646	static PyObject*
				3647	unicode_isupper(PyUnicodeObject self, PyObject args)
				3648	{
				3649	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3650	register const Py_UNICODE *e;
				3651	int cased;
				3652
				3653	if (!PyArg_NoArgs(args))
				3654	return NULL;
				3655
				3656	/* Shortcut for single character strings */
				3657	if (PyUnicode_GET_SIZE(self) == 1)
				3658	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3659
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3660	/* Special case for empty strings */
				3661	if (PyString_GET_SIZE(self) == 0)
				3662	return PyInt_FromLong(0);
				3663
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3664	e = p + PyUnicode_GET_SIZE(self);
				3665	cased = 0;
				3666	for (; p < e; p++) {
				3667	register const Py_UNICODE ch = *p;
				3668
				3669	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3670	return PyInt_FromLong(0);
				3671	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3672	cased = 1;
				3673	}
				3674	return PyInt_FromLong(cased);
				3675	}
				3676
				3677	static char istitle__doc__[] =
				3678	"S.istitle() -> int\n\
				3679	\n\
				3680	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3681	may only follow uncased characters and lowercase characters only cased\n\
				3682	ones. Return 0 otherwise.";
				3683
				3684	static PyObject*
				3685	unicode_istitle(PyUnicodeObject self, PyObject args)
				3686	{
				3687	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3688	register const Py_UNICODE *e;
				3689	int cased, previous_is_cased;
				3690
				3691	if (!PyArg_NoArgs(args))
				3692	return NULL;
				3693
				3694	/* Shortcut for single character strings */
				3695	if (PyUnicode_GET_SIZE(self) == 1)
				3696	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3697	(Py_UNICODE_ISUPPER(*p) != 0));
				3698
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3699	/* Special case for empty strings */
				3700	if (PyString_GET_SIZE(self) == 0)
				3701	return PyInt_FromLong(0);
				3702
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3703	e = p + PyUnicode_GET_SIZE(self);
				3704	cased = 0;
				3705	previous_is_cased = 0;
				3706	for (; p < e; p++) {
				3707	register const Py_UNICODE ch = *p;
				3708
				3709	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3710	if (previous_is_cased)
				3711	return PyInt_FromLong(0);
				3712	previous_is_cased = 1;
				3713	cased = 1;
				3714	}
				3715	else if (Py_UNICODE_ISLOWER(ch)) {
				3716	if (!previous_is_cased)
				3717	return PyInt_FromLong(0);
				3718	previous_is_cased = 1;
				3719	cased = 1;
				3720	}
				3721	else
				3722	previous_is_cased = 0;
				3723	}
				3724	return PyInt_FromLong(cased);
				3725	}
				3726
				3727	static char isspace__doc__[] =
				3728	"S.isspace() -> int\n\
				3729	\n\
				3730	Return 1 if there are only whitespace characters in S,\n\
				3731	0 otherwise.";
				3732
				3733	static PyObject*
				3734	unicode_isspace(PyUnicodeObject self, PyObject args)
				3735	{
				3736	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3737	register const Py_UNICODE *e;
				3738
				3739	if (!PyArg_NoArgs(args))
				3740	return NULL;
				3741
				3742	/* Shortcut for single character strings */
				3743	if (PyUnicode_GET_SIZE(self) == 1 &&
				3744	Py_UNICODE_ISSPACE(*p))
				3745	return PyInt_FromLong(1);
				3746
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3747	/* Special case for empty strings */
				3748	if (PyString_GET_SIZE(self) == 0)
				3749	return PyInt_FromLong(0);
				3750
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3751	e = p + PyUnicode_GET_SIZE(self);
				3752	for (; p < e; p++) {
				3753	if (!Py_UNICODE_ISSPACE(*p))
				3754	return PyInt_FromLong(0);
				3755	}
				3756	return PyInt_FromLong(1);
				3757	}
				3758
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3759	static char isalpha__doc__[] =
				3760	"S.isalpha() -> int\n\
				3761	\n\
				3762	Return 1 if all characters in S are alphabetic\n\
				3763	and there is at least one character in S, 0 otherwise.";
				3764
				3765	static PyObject*
				3766	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3767	{
				3768	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3769	register const Py_UNICODE *e;
				3770
				3771	if (!PyArg_NoArgs(args))
				3772	return NULL;
				3773
				3774	/* Shortcut for single character strings */
				3775	if (PyUnicode_GET_SIZE(self) == 1 &&
				3776	Py_UNICODE_ISALPHA(*p))
				3777	return PyInt_FromLong(1);
				3778
				3779	/* Special case for empty strings */
				3780	if (PyString_GET_SIZE(self) == 0)
				3781	return PyInt_FromLong(0);
				3782
				3783	e = p + PyUnicode_GET_SIZE(self);
				3784	for (; p < e; p++) {
				3785	if (!Py_UNICODE_ISALPHA(*p))
				3786	return PyInt_FromLong(0);
				3787	}
				3788	return PyInt_FromLong(1);
				3789	}
				3790
				3791	static char isalnum__doc__[] =
				3792	"S.isalnum() -> int\n\
				3793	\n\
				3794	Return 1 if all characters in S are alphanumeric\n\
				3795	and there is at least one character in S, 0 otherwise.";
				3796
				3797	static PyObject*
				3798	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3799	{
				3800	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3801	register const Py_UNICODE *e;
				3802
				3803	if (!PyArg_NoArgs(args))
				3804	return NULL;
				3805
				3806	/* Shortcut for single character strings */
				3807	if (PyUnicode_GET_SIZE(self) == 1 &&
				3808	Py_UNICODE_ISALNUM(*p))
				3809	return PyInt_FromLong(1);
				3810
				3811	/* Special case for empty strings */
				3812	if (PyString_GET_SIZE(self) == 0)
				3813	return PyInt_FromLong(0);
				3814
				3815	e = p + PyUnicode_GET_SIZE(self);
				3816	for (; p < e; p++) {
				3817	if (!Py_UNICODE_ISALNUM(*p))
				3818	return PyInt_FromLong(0);
				3819	}
				3820	return PyInt_FromLong(1);
				3821	}
				3822
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3823	static char isdecimal__doc__[] =
				3824	"S.isdecimal() -> int\n\
				3825	\n\
				3826	Return 1 if there are only decimal characters in S,\n\
				3827	0 otherwise.";
				3828
				3829	static PyObject*
				3830	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3831	{
				3832	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3833	register const Py_UNICODE *e;
				3834
				3835	if (!PyArg_NoArgs(args))
				3836	return NULL;
				3837
				3838	/* Shortcut for single character strings */
				3839	if (PyUnicode_GET_SIZE(self) == 1 &&
				3840	Py_UNICODE_ISDECIMAL(*p))
				3841	return PyInt_FromLong(1);
				3842
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3843	/* Special case for empty strings */
				3844	if (PyString_GET_SIZE(self) == 0)
				3845	return PyInt_FromLong(0);
				3846
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3847	e = p + PyUnicode_GET_SIZE(self);
				3848	for (; p < e; p++) {
				3849	if (!Py_UNICODE_ISDECIMAL(*p))
				3850	return PyInt_FromLong(0);
				3851	}
				3852	return PyInt_FromLong(1);
				3853	}
				3854
				3855	static char isdigit__doc__[] =
				3856	"S.isdigit() -> int\n\
				3857	\n\
				3858	Return 1 if there are only digit characters in S,\n\
				3859	0 otherwise.";
				3860
				3861	static PyObject*
				3862	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3863	{
				3864	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3865	register const Py_UNICODE *e;
				3866
				3867	if (!PyArg_NoArgs(args))
				3868	return NULL;
				3869
				3870	/* Shortcut for single character strings */
				3871	if (PyUnicode_GET_SIZE(self) == 1 &&
				3872	Py_UNICODE_ISDIGIT(*p))
				3873	return PyInt_FromLong(1);
				3874
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3875	/* Special case for empty strings */
				3876	if (PyString_GET_SIZE(self) == 0)
				3877	return PyInt_FromLong(0);
				3878
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3879	e = p + PyUnicode_GET_SIZE(self);
				3880	for (; p < e; p++) {
				3881	if (!Py_UNICODE_ISDIGIT(*p))
				3882	return PyInt_FromLong(0);
				3883	}
				3884	return PyInt_FromLong(1);
				3885	}
				3886
				3887	static char isnumeric__doc__[] =
				3888	"S.isnumeric() -> int\n\
				3889	\n\
				3890	Return 1 if there are only numeric characters in S,\n\
				3891	0 otherwise.";
				3892
				3893	static PyObject*
				3894	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3895	{
				3896	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3897	register const Py_UNICODE *e;
				3898
				3899	if (!PyArg_NoArgs(args))
				3900	return NULL;
				3901
				3902	/* Shortcut for single character strings */
				3903	if (PyUnicode_GET_SIZE(self) == 1 &&
				3904	Py_UNICODE_ISNUMERIC(*p))
				3905	return PyInt_FromLong(1);
				3906
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3907	/* Special case for empty strings */
				3908	if (PyString_GET_SIZE(self) == 0)
				3909	return PyInt_FromLong(0);
				3910
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3911	e = p + PyUnicode_GET_SIZE(self);
				3912	for (; p < e; p++) {
				3913	if (!Py_UNICODE_ISNUMERIC(*p))
				3914	return PyInt_FromLong(0);
				3915	}
				3916	return PyInt_FromLong(1);
				3917	}
				3918
				3919	static char join__doc__[] =
				3920	"S.join(sequence) -> unicode\n\
				3921	\n\
				3922	Return a string which is the concatenation of the strings in the\n\
				3923	sequence. The separator between elements is S.";
				3924
				3925	static PyObject*
				3926	unicode_join(PyUnicodeObject self, PyObject args)
				3927	{
				3928	PyObject *data;
				3929	if (!PyArg_ParseTuple(args, "O:join", &data))
				3930	return NULL;
				3931
				3932	return PyUnicode_Join((PyObject *)self, data);
				3933	}
				3934
				3935	static int
				3936	unicode_length(PyUnicodeObject *self)
				3937	{
				3938	return self->length;
				3939	}
				3940
				3941	static char ljust__doc__[] =
				3942	"S.ljust(width) -> unicode\n\
				3943	\n\
				3944	Return S left justified in a Unicode string of length width. Padding is\n\
				3945	done using spaces.";
				3946
				3947	static PyObject *
				3948	unicode_ljust(PyUnicodeObject self, PyObject args)
				3949	{
				3950	int width;
				3951	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3952	return NULL;
				3953
				3954	if (self->length >= width) {
				3955	Py_INCREF(self);
				3956	return (PyObject*) self;
				3957	}
				3958
				3959	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3960	}
				3961
				3962	static char lower__doc__[] =
				3963	"S.lower() -> unicode\n\
				3964	\n\
				3965	Return a copy of the string S converted to lowercase.";
				3966
				3967	static PyObject*
				3968	unicode_lower(PyUnicodeObject self, PyObject args)
				3969	{
				3970	if (!PyArg_NoArgs(args))
				3971	return NULL;
				3972	return fixup(self, fixlower);
				3973	}
				3974
				3975	static char lstrip__doc__[] =
				3976	"S.lstrip() -> unicode\n\
				3977	\n\
				3978	Return a copy of the string S with leading whitespace removed.";
				3979
				3980	static PyObject *
				3981	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3982	{
				3983	if (!PyArg_NoArgs(args))
				3984	return NULL;
				3985	return strip(self, 1, 0);
				3986	}
				3987
				3988	static PyObject*
				3989	unicode_repeat(PyUnicodeObject *str, int len)
				3990	{
				3991	PyUnicodeObject *u;
				3992	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3993	int nchars;
				3994	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3995
				3996	if (len < 0)
				3997	len = 0;
				3998
				3999	if (len == 1) {
				4000	/* no repeat, return original string */
				4001	Py_INCREF(str);
				4002	return (PyObject*) str;
				4003	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4004
				4005	/* ensure # of chars needed doesn't overflow int and # of bytes
				4006	* needed doesn't overflow size_t
				4007	*/
				4008	nchars = len * str->length;
				4009	if (len && nchars / len != str->length) {
				4010	PyErr_SetString(PyExc_OverflowError,
				4011	"repeated string is too long");
				4012	return NULL;
				4013	}
				4014	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4015	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4016	PyErr_SetString(PyExc_OverflowError,
				4017	"repeated string is too long");
				4018	return NULL;
				4019	}
				4020	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4021	if (!u)
				4022	return NULL;
				4023
				4024	p = u->str;
				4025
				4026	while (len-- > 0) {
				4027	Py_UNICODE_COPY(p, str->str, str->length);
				4028	p += str->length;
				4029	}
				4030
				4031	return (PyObject*) u;
				4032	}
				4033
				4034	PyObject PyUnicode_Replace(PyObject obj,
				4035	PyObject *subobj,
				4036	PyObject *replobj,
				4037	int maxcount)
				4038	{
				4039	PyObject *self;
				4040	PyObject *str1;
				4041	PyObject *str2;
				4042	PyObject *result;
				4043
				4044	self = PyUnicode_FromObject(obj);
				4045	if (self == NULL)
				4046	return NULL;
				4047	str1 = PyUnicode_FromObject(subobj);
				4048	if (str1 == NULL) {
				4049	Py_DECREF(self);
				4050	return NULL;
				4051	}
				4052	str2 = PyUnicode_FromObject(replobj);
				4053	if (str2 == NULL) {
				4054	Py_DECREF(self);
				4055	Py_DECREF(str1);
				4056	return NULL;
				4057	}
				4058	result = replace((PyUnicodeObject *)self,
				4059	(PyUnicodeObject *)str1,
				4060	(PyUnicodeObject *)str2,
				4061	maxcount);
				4062	Py_DECREF(self);
				4063	Py_DECREF(str1);
				4064	Py_DECREF(str2);
				4065	return result;
				4066	}
				4067
				4068	static char replace__doc__[] =
				4069	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4070	\n\
				4071	Return a copy of S with all occurrences of substring\n\
				4072	old replaced by new. If the optional argument maxsplit is\n\
				4073	given, only the first maxsplit occurrences are replaced.";
				4074
				4075	static PyObject*
				4076	unicode_replace(PyUnicodeObject self, PyObject args)
				4077	{
				4078	PyUnicodeObject *str1;
				4079	PyUnicodeObject *str2;
				4080	int maxcount = -1;
				4081	PyObject *result;
				4082
				4083	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4084	return NULL;
				4085	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4086	if (str1 == NULL)
				4087	return NULL;
				4088	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4089	if (str2 == NULL)
				4090	return NULL;
				4091
				4092	result = replace(self, str1, str2, maxcount);
				4093
				4094	Py_DECREF(str1);
				4095	Py_DECREF(str2);
				4096	return result;
				4097	}
				4098
				4099	static
				4100	PyObject unicode_repr(PyObject unicode)
				4101	{
				4102	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4103	PyUnicode_GET_SIZE(unicode),
				4104	1);
				4105	}
				4106
				4107	static char rfind__doc__[] =
				4108	"S.rfind(sub [,start [,end]]) -> int\n\
				4109	\n\
				4110	Return the highest index in S where substring sub is found,\n\
				4111	such that sub is contained within s[start,end]. Optional\n\
				4112	arguments start and end are interpreted as in slice notation.\n\
				4113	\n\
				4114	Return -1 on failure.";
				4115
				4116	static PyObject *
				4117	unicode_rfind(PyUnicodeObject self, PyObject args)
				4118	{
				4119	PyUnicodeObject *substring;
				4120	int start = 0;
				4121	int end = INT_MAX;
				4122	PyObject *result;
				4123
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4124	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4125	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4126	return NULL;
				4127	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4128	(PyObject *)substring);
				4129	if (substring == NULL)
				4130	return NULL;
				4131
				4132	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4133
				4134	Py_DECREF(substring);
				4135	return result;
				4136	}
				4137
				4138	static char rindex__doc__[] =
				4139	"S.rindex(sub [,start [,end]]) -> int\n\
				4140	\n\
				4141	Like S.rfind() but raise ValueError when the substring is not found.";
				4142
				4143	static PyObject *
				4144	unicode_rindex(PyUnicodeObject self, PyObject args)
				4145	{
				4146	int result;
				4147	PyUnicodeObject *substring;
				4148	int start = 0;
				4149	int end = INT_MAX;
				4150
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4151	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4152	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4153	return NULL;
				4154	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4155	(PyObject *)substring);
				4156	if (substring == NULL)
				4157	return NULL;
				4158
				4159	result = findstring(self, substring, start, end, -1);
				4160
				4161	Py_DECREF(substring);
				4162	if (result < 0) {
				4163	PyErr_SetString(PyExc_ValueError, "substring not found");
				4164	return NULL;
				4165	}
				4166	return PyInt_FromLong(result);
				4167	}
				4168
				4169	static char rjust__doc__[] =
				4170	"S.rjust(width) -> unicode\n\
				4171	\n\
				4172	Return S right justified in a Unicode string of length width. Padding is\n\
				4173	done using spaces.";
				4174
				4175	static PyObject *
				4176	unicode_rjust(PyUnicodeObject self, PyObject args)
				4177	{
				4178	int width;
				4179	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4180	return NULL;
				4181
				4182	if (self->length >= width) {
				4183	Py_INCREF(self);
				4184	return (PyObject*) self;
				4185	}
				4186
				4187	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4188	}
				4189
				4190	static char rstrip__doc__[] =
				4191	"S.rstrip() -> unicode\n\
				4192	\n\
				4193	Return a copy of the string S with trailing whitespace removed.";
				4194
				4195	static PyObject *
				4196	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4197	{
				4198	if (!PyArg_NoArgs(args))
				4199	return NULL;
				4200	return strip(self, 0, 1);
				4201	}
				4202
				4203	static PyObject*
				4204	unicode_slice(PyUnicodeObject *self, int start, int end)
				4205	{
				4206	/* standard clamping */
				4207	if (start < 0)
				4208	start = 0;
				4209	if (end < 0)
				4210	end = 0;
				4211	if (end > self->length)
				4212	end = self->length;
				4213	if (start == 0 && end == self->length) {
				4214	/* full slice, return original string */
				4215	Py_INCREF(self);
				4216	return (PyObject*) self;
				4217	}
				4218	if (start > end)
				4219	start = end;
				4220	/* copy slice */
				4221	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4222	end - start);
				4223	}
				4224
				4225	PyObject PyUnicode_Split(PyObject s,
				4226	PyObject *sep,
				4227	int maxsplit)
				4228	{
				4229	PyObject *result;
				4230
				4231	s = PyUnicode_FromObject(s);
				4232	if (s == NULL)
				4233	return NULL;
				4234	if (sep != NULL) {
				4235	sep = PyUnicode_FromObject(sep);
				4236	if (sep == NULL) {
				4237	Py_DECREF(s);
				4238	return NULL;
				4239	}
				4240	}
				4241
				4242	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4243
				4244	Py_DECREF(s);
				4245	Py_XDECREF(sep);
				4246	return result;
				4247	}
				4248
				4249	static char split__doc__[] =
				4250	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4251	\n\
				4252	Return a list of the words in S, using sep as the\n\
				4253	delimiter string. If maxsplit is given, at most maxsplit\n\
				4254	splits are done. If sep is not specified, any whitespace string\n\
				4255	is a separator.";
				4256
				4257	static PyObject*
				4258	unicode_split(PyUnicodeObject self, PyObject args)
				4259	{
				4260	PyObject *substring = Py_None;
				4261	int maxcount = -1;
				4262
				4263	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4264	return NULL;
				4265
				4266	if (substring == Py_None)
				4267	return split(self, NULL, maxcount);
				4268	else if (PyUnicode_Check(substring))
				4269	return split(self, (PyUnicodeObject *)substring, maxcount);
				4270	else
				4271	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4272	}
				4273
				4274	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4275	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4276	\n\
				4277	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4278	Line breaks are not included in the resulting list unless keepends\n\
				4279	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4280
				4281	static PyObject*
				4282	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4283	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4284	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4285
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4286	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4287	return NULL;
				4288
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4289	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4290	}
				4291
				4292	static
				4293	PyObject unicode_str(PyUnicodeObject self)
				4294	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4295	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4296	}
				4297
				4298	static char strip__doc__[] =
				4299	"S.strip() -> unicode\n\
				4300	\n\
				4301	Return a copy of S with leading and trailing whitespace removed.";
				4302
				4303	static PyObject *
				4304	unicode_strip(PyUnicodeObject self, PyObject args)
				4305	{
				4306	if (!PyArg_NoArgs(args))
				4307	return NULL;
				4308	return strip(self, 1, 1);
				4309	}
				4310
				4311	static char swapcase__doc__[] =
				4312	"S.swapcase() -> unicode\n\
				4313	\n\
				4314	Return a copy of S with uppercase characters converted to lowercase\n\
				4315	and vice versa.";
				4316
				4317	static PyObject*
				4318	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4319	{
				4320	if (!PyArg_NoArgs(args))
				4321	return NULL;
				4322	return fixup(self, fixswapcase);
				4323	}
				4324
				4325	static char translate__doc__[] =
				4326	"S.translate(table) -> unicode\n\
				4327	\n\
				4328	Return a copy of the string S, where all characters have been mapped\n\
				4329	through the given translation table, which must be a mapping of\n\
				4330	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4331	are left untouched. Characters mapped to None are deleted.";
				4332
				4333	static PyObject*
				4334	unicode_translate(PyUnicodeObject self, PyObject args)
				4335	{
				4336	PyObject *table;
				4337
				4338	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4339	return NULL;
				4340	return PyUnicode_TranslateCharmap(self->str,
				4341	self->length,
				4342	table,
				4343	"ignore");
				4344	}
				4345
				4346	static char upper__doc__[] =
				4347	"S.upper() -> unicode\n\
				4348	\n\
				4349	Return a copy of S converted to uppercase.";
				4350
				4351	static PyObject*
				4352	unicode_upper(PyUnicodeObject self, PyObject args)
				4353	{
				4354	if (!PyArg_NoArgs(args))
				4355	return NULL;
				4356	return fixup(self, fixupper);
				4357	}
				4358
				4359	#if 0
				4360	static char zfill__doc__[] =
				4361	"S.zfill(width) -> unicode\n\
				4362	\n\
				4363	Pad a numeric string x with zeros on the left, to fill a field\n\
				4364	of the specified width. The string x is never truncated.";
				4365
				4366	static PyObject *
				4367	unicode_zfill(PyUnicodeObject self, PyObject args)
				4368	{
				4369	int fill;
				4370	PyUnicodeObject *u;
				4371
				4372	int width;
				4373	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4374	return NULL;
				4375
				4376	if (self->length >= width) {
				4377	Py_INCREF(self);
				4378	return (PyObject*) self;
				4379	}
				4380
				4381	fill = width - self->length;
				4382
				4383	u = pad(self, fill, 0, '0');
				4384
				4385	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4386	/* move sign to beginning of string */
				4387	u->str[0] = u->str[fill];
				4388	u->str[fill] = '0';
				4389	}
				4390
				4391	return (PyObject*) u;
				4392	}
				4393	#endif
				4394
				4395	#if 0
				4396	static PyObject*
				4397	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4398	{
				4399	if (!PyArg_NoArgs(args))
				4400	return NULL;
				4401	return PyInt_FromLong(unicode_freelist_size);
				4402	}
				4403	#endif
				4404
				4405	static char startswith__doc__[] =
				4406	"S.startswith(prefix[, start[, end]]) -> int\n\
				4407	\n\
				4408	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4409	optional start, test S beginning at that position. With optional end, stop\n\
				4410	comparing S at that position.";
				4411
				4412	static PyObject *
				4413	unicode_startswith(PyUnicodeObject *self,
				4414	PyObject *args)
				4415	{
				4416	PyUnicodeObject *substring;
				4417	int start = 0;
				4418	int end = INT_MAX;
				4419	PyObject *result;
				4420
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4421	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4422	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4423	return NULL;
				4424	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4425	(PyObject *)substring);
				4426	if (substring == NULL)
				4427	return NULL;
				4428
				4429	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4430
				4431	Py_DECREF(substring);
				4432	return result;
				4433	}
				4434
				4435
				4436	static char endswith__doc__[] =
				4437	"S.endswith(suffix[, start[, end]]) -> int\n\
				4438	\n\
				4439	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4440	optional start, test S beginning at that position. With optional end, stop\n\
				4441	comparing S at that position.";
				4442
				4443	static PyObject *
				4444	unicode_endswith(PyUnicodeObject *self,
				4445	PyObject *args)
				4446	{
				4447	PyUnicodeObject *substring;
				4448	int start = 0;
				4449	int end = INT_MAX;
				4450	PyObject *result;
				4451
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4452	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4453	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4454	return NULL;
				4455	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4456	(PyObject *)substring);
				4457	if (substring == NULL)
				4458	return NULL;
				4459
				4460	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4461
				4462	Py_DECREF(substring);
				4463	return result;
				4464	}
				4465
				4466
				4467	static PyMethodDef unicode_methods[] = {
				4468
				4469	/* Order is according to common usage: often used methods should
				4470	appear first, since lookup is done sequentially. */
				4471
				4472	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4473	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4474	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4475	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4476	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4477	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4478	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4479	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4480	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4481	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4482	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4483	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4484	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4485	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4486	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4487	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4488	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4489	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4490	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4491	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4492	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4493	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4494	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4495	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4496	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4497	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4498	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4499	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4500	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4501	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4502	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4503	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4504	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4505	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4506	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4507	#if 0
				4508	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4509	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4510	#endif
				4511
				4512	#if 0
				4513	/* This one is just used for debugging the implementation. */
				4514	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4515	#endif
				4516
				4517	{NULL, NULL}
				4518	};
				4519
				4520	static PyObject *
				4521	unicode_getattr(PyUnicodeObject self, char name)
				4522	{
				4523	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4524	}
				4525
				4526	static PySequenceMethods unicode_as_sequence = {
				4527	(inquiry) unicode_length, /* sq_length */
				4528	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4529	(intargfunc) unicode_repeat, /* sq_repeat */
				4530	(intargfunc) unicode_getitem, /* sq_item */
				4531	(intintargfunc) unicode_slice, /* sq_slice */
				4532	0, /* sq_ass_item */
				4533	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4534	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4535	};
				4536
				4537	static int
				4538	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4539	int index,
				4540	const void **ptr)
				4541	{
				4542	if (index != 0) {
				4543	PyErr_SetString(PyExc_SystemError,
				4544	"accessing non-existent unicode segment");
				4545	return -1;
				4546	}
				4547	ptr = (void ) self->str;
				4548	return PyUnicode_GET_DATA_SIZE(self);
				4549	}
				4550
				4551	static int
				4552	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4553	const void **ptr)
				4554	{
				4555	PyErr_SetString(PyExc_TypeError,
				4556	"cannot use unicode as modifyable buffer");
				4557	return -1;
				4558	}
				4559
				4560	static int
				4561	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4562	int *lenp)
				4563	{
				4564	if (lenp)
				4565	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4566	return 1;
				4567	}
				4568
				4569	static int
				4570	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4571	int index,
				4572	const void **ptr)
				4573	{
				4574	PyObject *str;
				4575
				4576	if (index != 0) {
				4577	PyErr_SetString(PyExc_SystemError,
				4578	"accessing non-existent unicode segment");
				4579	return -1;
				4580	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4581	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4582	if (str == NULL)
				4583	return -1;
				4584	ptr = (void ) PyString_AS_STRING(str);
				4585	return PyString_GET_SIZE(str);
				4586	}
				4587
				4588	/* Helpers for PyUnicode_Format() */
				4589
				4590	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4591	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4592	{
				4593	int argidx = *p_argidx;
				4594	if (argidx < arglen) {
				4595	(*p_argidx)++;
				4596	if (arglen < 0)
				4597	return args;
				4598	else
				4599	return PyTuple_GetItem(args, argidx);
				4600	}
				4601	PyErr_SetString(PyExc_TypeError,
				4602	"not enough arguments for format string");
				4603	return NULL;
				4604	}
				4605
				4606	#define F_LJUST (1<<0)
				4607	#define F_SIGN (1<<1)
				4608	#define F_BLANK (1<<2)
				4609	#define F_ALT (1<<3)
				4610	#define F_ZERO (1<<4)
				4611
				4612	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4613	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4614	{
				4615	register int i;
				4616	int len;
				4617	va_list va;
				4618	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4619	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4620
				4621	/* First, format the string as char array, then expand to Py_UNICODE
				4622	array. */
				4623	charbuffer = (char *)buffer;
				4624	len = vsprintf(charbuffer, format, va);
				4625	for (i = len - 1; i >= 0; i--)
				4626	buffer[i] = (Py_UNICODE) charbuffer[i];
				4627
				4628	va_end(va);
				4629	return len;
				4630	}
				4631
				4632	static int
				4633	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4634	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4635	int flags,
				4636	int prec,
				4637	int type,
				4638	PyObject *v)
				4639	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4640	/* fmt = '%#.' + `prec` + `type`
				4641	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4642	char fmt[20];
				4643	double x;
				4644
				4645	x = PyFloat_AsDouble(v);
				4646	if (x == -1.0 && PyErr_Occurred())
				4647	return -1;
				4648	if (prec < 0)
				4649	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4650	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4651	type = 'g';
				4652	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4653	/* worst case length calc to ensure no buffer overrun:
				4654	fmt = %#.<prec>g
				4655	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4656	for any double rep.)
				4657	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4658	If prec=0 the effective precision is 1 (the leading digit is
				4659	always given), therefore increase by one to 10+prec. */
				4660	if (buflen <= (size_t)10 + (size_t)prec) {
				4661	PyErr_SetString(PyExc_OverflowError,
				4662	"formatted float is too long (precision too long?)");
				4663	return -1;
				4664	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4665	return usprintf(buf, fmt, x);
				4666	}
				4667
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4668	static PyObject*
				4669	formatlong(PyObject *val, int flags, int prec, int type)
				4670	{
				4671	char *buf;
				4672	int i, len;
				4673	PyObject str; / temporary string object. */
				4674	PyUnicodeObject *result;
				4675
				4676	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4677	if (!str)
				4678	return NULL;
				4679	result = _PyUnicode_New(len);
				4680	for (i = 0; i < len; i++)
				4681	result->str[i] = buf[i];
				4682	result->str[len] = 0;
				4683	Py_DECREF(str);
				4684	return (PyObject*)result;
				4685	}
				4686
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4687	static int
				4688	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4689	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4690	int flags,
				4691	int prec,
				4692	int type,
				4693	PyObject *v)
				4694	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4695	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4696	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4697	+ 1 + 1 = 24*/
				4698	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4699	long x;
				4700
				4701	x = PyInt_AsLong(v);
				4702	if (x == -1 && PyErr_Occurred())
				4703	return -1;
				4704	if (prec < 0)
				4705	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4706	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4707	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4708	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4709	PyErr_SetString(PyExc_OverflowError,
				4710	"formatted integer is too long (precision too long?)");
				4711	return -1;
				4712	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4713	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4714	return usprintf(buf, fmt, x);
				4715	}
				4716
				4717	static int
				4718	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4719	size_t buflen,
				4720	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4721	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4722	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4723	if (PyUnicode_Check(v)) {
				4724	if (PyUnicode_GET_SIZE(v) != 1)
				4725	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4726	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4727	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4728
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4729	else if (PyString_Check(v)) {
				4730	if (PyString_GET_SIZE(v) != 1)
				4731	goto onError;
				4732	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4733	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4734
				4735	else {
				4736	/* Integer input truncated to a character */
				4737	long x;
				4738	x = PyInt_AsLong(v);
				4739	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4740	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4741	buf[0] = (char) x;
				4742	}
				4743	buf[1] = '\0';
				4744	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4745
				4746	onError:
				4747	PyErr_SetString(PyExc_TypeError,
				4748	"%c requires int or char");
				4749	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4750	}
				4751
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4752	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4753
				4754	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4755	chars are formatted. XXX This is a magic number. Each formatting
				4756	routine does bounds checking to ensure no overflow, but a better
				4757	solution may be to malloc a buffer of appropriate size for each
				4758	format. For now, the current solution is sufficient.
				4759	*/
				4760	#define FORMATBUFLEN (size_t)120
				4761
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4762	PyObject PyUnicode_Format(PyObject format,
				4763	PyObject *args)
				4764	{
				4765	Py_UNICODE fmt, res;
				4766	int fmtcnt, rescnt, reslen, arglen, argidx;
				4767	int args_owned = 0;
				4768	PyUnicodeObject *result = NULL;
				4769	PyObject *dict = NULL;
				4770	PyObject *uformat;
				4771
				4772	if (format == NULL \|\| args == NULL) {
				4773	PyErr_BadInternalCall();
				4774	return NULL;
				4775	}
				4776	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4777	if (uformat == NULL)
				4778	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4779	fmt = PyUnicode_AS_UNICODE(uformat);
				4780	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4781
				4782	reslen = rescnt = fmtcnt + 100;
				4783	result = _PyUnicode_New(reslen);
				4784	if (result == NULL)
				4785	goto onError;
				4786	res = PyUnicode_AS_UNICODE(result);
				4787
				4788	if (PyTuple_Check(args)) {
				4789	arglen = PyTuple_Size(args);
				4790	argidx = 0;
				4791	}
				4792	else {
				4793	arglen = -1;
				4794	argidx = -2;
				4795	}
				4796	if (args->ob_type->tp_as_mapping)
				4797	dict = args;
				4798
				4799	while (--fmtcnt >= 0) {
				4800	if (*fmt != '%') {
				4801	if (--rescnt < 0) {
				4802	rescnt = fmtcnt + 100;
				4803	reslen += rescnt;
				4804	if (_PyUnicode_Resize(result, reslen) < 0)
				4805	return NULL;
				4806	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4807	--rescnt;
				4808	}
				4809	res++ = fmt++;
				4810	}
				4811	else {
				4812	/* Got a format specifier */
				4813	int flags = 0;
				4814	int width = -1;
				4815	int prec = -1;
				4816	int size = 0;
				4817	Py_UNICODE c = '\0';
				4818	Py_UNICODE fill;
				4819	PyObject *v = NULL;
				4820	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4821	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4822	Py_UNICODE sign;
				4823	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4824	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4825
				4826	fmt++;
				4827	if (*fmt == '(') {
				4828	Py_UNICODE *keystart;
				4829	int keylen;
				4830	PyObject *key;
				4831	int pcount = 1;
				4832
				4833	if (dict == NULL) {
				4834	PyErr_SetString(PyExc_TypeError,
				4835	"format requires a mapping");
				4836	goto onError;
				4837	}
				4838	++fmt;
				4839	--fmtcnt;
				4840	keystart = fmt;
				4841	/* Skip over balanced parentheses */
				4842	while (pcount > 0 && --fmtcnt >= 0) {
				4843	if (*fmt == ')')
				4844	--pcount;
				4845	else if (*fmt == '(')
				4846	++pcount;
				4847	fmt++;
				4848	}
				4849	keylen = fmt - keystart - 1;
				4850	if (fmtcnt < 0 \|\| pcount > 0) {
				4851	PyErr_SetString(PyExc_ValueError,
				4852	"incomplete format key");
				4853	goto onError;
				4854	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4855	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4856	then looked up since Python uses strings to hold
				4857	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4858	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4859	key = PyUnicode_EncodeUTF8(keystart,
				4860	keylen,
				4861	NULL);
				4862	if (key == NULL)
				4863	goto onError;
				4864	if (args_owned) {
				4865	Py_DECREF(args);
				4866	args_owned = 0;
				4867	}
				4868	args = PyObject_GetItem(dict, key);
				4869	Py_DECREF(key);
				4870	if (args == NULL) {
				4871	goto onError;
				4872	}
				4873	args_owned = 1;
				4874	arglen = -1;
				4875	argidx = -2;
				4876	}
				4877	while (--fmtcnt >= 0) {
				4878	switch (c = *fmt++) {
				4879	case '-': flags \|= F_LJUST; continue;
				4880	case '+': flags \|= F_SIGN; continue;
				4881	case ' ': flags \|= F_BLANK; continue;
				4882	case '#': flags \|= F_ALT; continue;
				4883	case '0': flags \|= F_ZERO; continue;
				4884	}
				4885	break;
				4886	}
				4887	if (c == '*') {
				4888	v = getnextarg(args, arglen, &argidx);
				4889	if (v == NULL)
				4890	goto onError;
				4891	if (!PyInt_Check(v)) {
				4892	PyErr_SetString(PyExc_TypeError,
				4893	"* wants int");
				4894	goto onError;
				4895	}
				4896	width = PyInt_AsLong(v);
				4897	if (width < 0) {
				4898	flags \|= F_LJUST;
				4899	width = -width;
				4900	}
				4901	if (--fmtcnt >= 0)
				4902	c = *fmt++;
				4903	}
				4904	else if (c >= '0' && c <= '9') {
				4905	width = c - '0';
				4906	while (--fmtcnt >= 0) {
				4907	c = *fmt++;
				4908	if (c < '0' \|\| c > '9')
				4909	break;
				4910	if ((width*10) / 10 != width) {
				4911	PyErr_SetString(PyExc_ValueError,
				4912	"width too big");
				4913	goto onError;
				4914	}
				4915	width = width*10 + (c - '0');
				4916	}
				4917	}
				4918	if (c == '.') {
				4919	prec = 0;
				4920	if (--fmtcnt >= 0)
				4921	c = *fmt++;
				4922	if (c == '*') {
				4923	v = getnextarg(args, arglen, &argidx);
				4924	if (v == NULL)
				4925	goto onError;
				4926	if (!PyInt_Check(v)) {
				4927	PyErr_SetString(PyExc_TypeError,
				4928	"* wants int");
				4929	goto onError;
				4930	}
				4931	prec = PyInt_AsLong(v);
				4932	if (prec < 0)
				4933	prec = 0;
				4934	if (--fmtcnt >= 0)
				4935	c = *fmt++;
				4936	}
				4937	else if (c >= '0' && c <= '9') {
				4938	prec = c - '0';
				4939	while (--fmtcnt >= 0) {
				4940	c = Py_CHARMASK(*fmt++);
				4941	if (c < '0' \|\| c > '9')
				4942	break;
				4943	if ((prec*10) / 10 != prec) {
				4944	PyErr_SetString(PyExc_ValueError,
				4945	"prec too big");
				4946	goto onError;
				4947	}
				4948	prec = prec*10 + (c - '0');
				4949	}
				4950	}
				4951	} /* prec */
				4952	if (fmtcnt >= 0) {
				4953	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4954	size = c;
				4955	if (--fmtcnt >= 0)
				4956	c = *fmt++;
				4957	}
				4958	}
				4959	if (fmtcnt < 0) {
				4960	PyErr_SetString(PyExc_ValueError,
				4961	"incomplete format");
				4962	goto onError;
				4963	}
				4964	if (c != '%') {
				4965	v = getnextarg(args, arglen, &argidx);
				4966	if (v == NULL)
				4967	goto onError;
				4968	}
				4969	sign = 0;
				4970	fill = ' ';
				4971	switch (c) {
				4972
				4973	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4974	pbuf = formatbuf;
				4975	/* presume that buffer length is at least 1 */
				4976	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4977	len = 1;
				4978	break;
				4979
				4980	case 's':
				4981	case 'r':
				4982	if (PyUnicode_Check(v) && c == 's') {
				4983	temp = v;
				4984	Py_INCREF(temp);
				4985	}
				4986	else {
				4987	PyObject *unicode;
				4988	if (c == 's')
				4989	temp = PyObject_Str(v);
				4990	else
				4991	temp = PyObject_Repr(v);
				4992	if (temp == NULL)
				4993	goto onError;
				4994	if (!PyString_Check(temp)) {
				4995	/* XXX Note: this should never happen, since
				4996	PyObject_Repr() and PyObject_Str() assure
				4997	this */
				4998	Py_DECREF(temp);
				4999	PyErr_SetString(PyExc_TypeError,
				5000	"%s argument has non-string str()");
				5001	goto onError;
				5002	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5003	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5004	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5005	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5006	"strict");
				5007	Py_DECREF(temp);
				5008	temp = unicode;
				5009	if (temp == NULL)
				5010	goto onError;
				5011	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5012	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5013	len = PyUnicode_GET_SIZE(temp);
				5014	if (prec >= 0 && len > prec)
				5015	len = prec;
				5016	break;
				5017
				5018	case 'i':
				5019	case 'd':
				5020	case 'u':
				5021	case 'o':
				5022	case 'x':
				5023	case 'X':
				5024	if (c == 'i')
				5025	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5026	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5027	temp = formatlong(v, flags, prec, c);
				5028	if (!temp)
				5029	goto onError;
				5030	pbuf = PyUnicode_AS_UNICODE(temp);
				5031	len = PyUnicode_GET_SIZE(temp);
				5032	/* unbounded ints can always produce
				5033	a sign character! */
				5034	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5035	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5036	else {
				5037	pbuf = formatbuf;
				5038	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5039	flags, prec, c, v);
				5040	if (len < 0)
				5041	goto onError;
				5042	/* only d conversion is signed */
				5043	sign = c == 'd';
				5044	}
				5045	if (flags & F_ZERO)
				5046	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5047	break;
				5048
				5049	case 'e':
				5050	case 'E':
				5051	case 'f':
				5052	case 'g':
				5053	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5054	pbuf = formatbuf;
				5055	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5056	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5057	if (len < 0)
				5058	goto onError;
				5059	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5060	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5061	fill = '0';
				5062	break;
				5063
				5064	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5065	pbuf = formatbuf;
				5066	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5067	if (len < 0)
				5068	goto onError;
				5069	break;
				5070
				5071	default:
				5072	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5073	"unsupported format character '%c' (0x%x) "
				5074	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5075	(31<=c && c<=126) ? c : '?',
				5076	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5077	goto onError;
				5078	}
				5079	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5080	if (pbuf == '-' \|\| pbuf == '+') {
				5081	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5082	len--;
				5083	}
				5084	else if (flags & F_SIGN)
				5085	sign = '+';
				5086	else if (flags & F_BLANK)
				5087	sign = ' ';
				5088	else
				5089	sign = 0;
				5090	}
				5091	if (width < len)
				5092	width = len;
				5093	if (rescnt < width + (sign != 0)) {
				5094	reslen -= rescnt;
				5095	rescnt = width + fmtcnt + 100;
				5096	reslen += rescnt;
				5097	if (_PyUnicode_Resize(result, reslen) < 0)
				5098	return NULL;
				5099	res = PyUnicode_AS_UNICODE(result)
				5100	+ reslen - rescnt;
				5101	}
				5102	if (sign) {
				5103	if (fill != ' ')
				5104	*res++ = sign;
				5105	rescnt--;
				5106	if (width > len)
				5107	width--;
				5108	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5109	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5110	assert(pbuf[0] == '0');
				5111	assert(pbuf[1] == c);
				5112	if (fill != ' ') {
				5113	res++ = pbuf++;
				5114	res++ = pbuf++;
				5115	}
				5116	rescnt -= 2;
				5117	width -= 2;
				5118	if (width < 0)
				5119	width = 0;
				5120	len -= 2;
				5121	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5122	if (width > len && !(flags & F_LJUST)) {
				5123	do {
				5124	--rescnt;
				5125	*res++ = fill;
				5126	} while (--width > len);
				5127	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5128	if (fill == ' ') {
				5129	if (sign)
				5130	*res++ = sign;
				5131	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5132	assert(pbuf[0] == '0');
				5133	assert(pbuf[1] == c);
				5134	res++ = pbuf++;
				5135	res++ = pbuf++;
				5136	}
				5137	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5138	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5139	res += len;
				5140	rescnt -= len;
				5141	while (--width >= len) {
				5142	--rescnt;
				5143	*res++ = ' ';
				5144	}
				5145	if (dict && (argidx < arglen) && c != '%') {
				5146	PyErr_SetString(PyExc_TypeError,
				5147	"not all arguments converted");
				5148	goto onError;
				5149	}
				5150	Py_XDECREF(temp);
				5151	} /* '%' */
				5152	} /* until end */
				5153	if (argidx < arglen && !dict) {
				5154	PyErr_SetString(PyExc_TypeError,
				5155	"not all arguments converted");
				5156	goto onError;
				5157	}
				5158
				5159	if (args_owned) {
				5160	Py_DECREF(args);
				5161	}
				5162	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5163	if (_PyUnicode_Resize(result, reslen - rescnt))
				5164	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5165	return (PyObject *)result;
				5166
				5167	onError:
				5168	Py_XDECREF(result);
				5169	Py_DECREF(uformat);
				5170	if (args_owned) {
				5171	Py_DECREF(args);
				5172	}
				5173	return NULL;
				5174	}
				5175
				5176	static PyBufferProcs unicode_as_buffer = {
				5177	(getreadbufferproc) unicode_buffer_getreadbuf,
				5178	(getwritebufferproc) unicode_buffer_getwritebuf,
				5179	(getsegcountproc) unicode_buffer_getsegcount,
				5180	(getcharbufferproc) unicode_buffer_getcharbuf,
				5181	};
				5182
				5183	PyTypeObject PyUnicode_Type = {
				5184	PyObject_HEAD_INIT(&PyType_Type)
				5185	0, /* ob_size */
				5186	"unicode", /* tp_name */
				5187	sizeof(PyUnicodeObject), /* tp_size */
				5188	0, /* tp_itemsize */
				5189	/* Slots */
				5190	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5191	0, /* tp_print */
				5192	(getattrfunc)unicode_getattr, /* tp_getattr */
				5193	0, /* tp_setattr */
				5194	(cmpfunc) unicode_compare, /* tp_compare */
				5195	(reprfunc) unicode_repr, /* tp_repr */
				5196	0, /* tp_as_number */
				5197	&unicode_as_sequence, /* tp_as_sequence */
				5198	0, /* tp_as_mapping */
				5199	(hashfunc) unicode_hash, /* tp_hash*/
				5200	0, /* tp_call*/
				5201	(reprfunc) unicode_str, /* tp_str */
				5202	(getattrofunc) NULL, /* tp_getattro */
				5203	(setattrofunc) NULL, /* tp_setattro */
				5204	&unicode_as_buffer, /* tp_as_buffer */
				5205	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5206	};
				5207
				5208	/* Initialize the Unicode implementation */
				5209
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5210	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5211	{
				5212	/* Doublecheck the configuration... */
				5213	if (sizeof(Py_UNICODE) != 2)
				5214	Py_FatalError("Unicode configuration error: "
				5215	"sizeof(Py_UNICODE) != 2 bytes");
				5216
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5217	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5218	unicode_freelist = NULL;
				5219	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5220	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5221	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5222	}
				5223
				5224	/* Finalize the Unicode implementation */
				5225
				5226	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5227	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5228	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5229	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5230
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5231	Py_XDECREF(unicode_empty);
				5232	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5233
				5234	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5235	PyUnicodeObject *v = u;
				5236	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5237	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5238	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5239	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5240	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5241	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5242	unicode_freelist = NULL;
				5243	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5244	}