Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: c1f3d5414f0cfc4112d576613e9c975246bea338 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	70	#ifdef MS_WIN32
				71	#include <windows.h>
				72	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	73
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	74	/* Limit for the Unicode object free list */
				75
				76	#define MAX_UNICODE_FREELIST_SIZE 1024
				77
				78	/* Limit for the Unicode object free list stay alive optimization.
				79
				80	The implementation will keep allocated Unicode memory intact for
				81	all objects on the free list having a size less than this
				82	limit. This reduces malloc() overhead for small Unicode objects.
				83
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	84	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	85	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	malloc()-overhead) bytes of unused garbage.
				87
				88	Setting the limit to 0 effectively turns the feature off.
				89
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	90	Note: This is an experimental feature ! If you get core dumps when
				91	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
				93	*/
				94
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	95	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	96
				97	/* Endianness switches; defaults to little endian */
				98
				99	#ifdef WORDS_BIGENDIAN
				100	# define BYTEORDER_IS_BIG_ENDIAN
				101	#else
				102	# define BYTEORDER_IS_LITTLE_ENDIAN
				103	#endif
				104
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	105	/* --- Globals ------------------------------------------------------------
				106
				107	The globals are initialized by the _PyUnicode_Init() API and should
				108	not be used before calling that API.
				109
				110	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	111
				112	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	113	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	116	static PyUnicodeObject *unicode_freelist;
				117	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	119	/* Default encoding to use and assume when NULL is passed as encoding
				120	parameter; it is initialized by _PyUnicode_Init().
				121
				122	Always use the PyUnicode_SetDefaultEncoding() and
				123	PyUnicode_GetDefaultEncoding() APIs to access this global.
				124
				125	*/
				126
				127	static char unicode_default_encoding[100];
				128
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	/* --- Unicode Object ----------------------------------------------------- */
				130
				131	static
				132	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				133	int length)
				134	{
				135	void *oldstr;
				136
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	137	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	138	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	139	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	140
				141	/* Resizing unicode_empty is not allowed. */
				142	if (unicode == unicode_empty) {
				143	PyErr_SetString(PyExc_SystemError,
				144	"can't resize empty unicode object");
				145	return -1;
				146	}
				147
				148	/* We allocate one more byte to make sure the string is
				149	Ux0000 terminated -- XXX is this needed ? */
				150	oldstr = unicode->str;
				151	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				152	if (!unicode->str) {
				153	unicode->str = oldstr;
				154	PyErr_NoMemory();
				155	return -1;
				156	}
				157	unicode->str[length] = 0;
				158	unicode->length = length;
				159
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	160	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	161	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	162	if (unicode->defenc) {
				163	Py_DECREF(unicode->defenc);
				164	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	165	}
				166	unicode->hash = -1;
				167
				168	return 0;
				169	}
				170
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	171	int PyUnicode_Resize(PyObject **unicode,
				172	int length)
				173	{
				174	PyUnicodeObject *v;
				175
				176	if (unicode == NULL) {
				177	PyErr_BadInternalCall();
				178	return -1;
				179	}
				180	v = (PyUnicodeObject )unicode;
				181	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				182	PyErr_BadInternalCall();
				183	return -1;
				184	}
				185	return _PyUnicode_Resize(v, length);
				186	}
				187
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	/* We allocate one more byte to make sure the string is
				189	Ux0000 terminated -- XXX is this needed ?
				190
				191	XXX This allocator could further be enhanced by assuring that the
				192	free list never reduces its size below 1.
				193
				194	*/
				195
				196	static
				197	PyUnicodeObject *_PyUnicode_New(int length)
				198	{
				199	register PyUnicodeObject *unicode;
				200
				201	/* Optimization for empty strings */
				202	if (length == 0 && unicode_empty != NULL) {
				203	Py_INCREF(unicode_empty);
				204	return unicode_empty;
				205	}
				206
				207	/* Unicode freelist & memory allocation */
				208	if (unicode_freelist) {
				209	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	210	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	212	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	213	/* Keep-Alive optimization: we only upsize the buffer,
				214	never downsize it. */
				215	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	216	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	217	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	218	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	}
				220	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	221	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	223	}
				224	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	else {
				227	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				228	if (unicode == NULL)
				229	return NULL;
				230	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				231	}
				232
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	233	if (!unicode->str) {
				234	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	235	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	236	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str[length] = 0;
				238	unicode->length = length;
				239	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	240	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242
				243	onError:
				244	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	246	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	}
				248
				249	static
				250	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				251	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	253	/* Keep-Alive optimization */
				254	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	255	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	256	unicode->str = NULL;
				257	unicode->length = 0;
				258	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	259	if (unicode->defenc) {
				260	Py_DECREF(unicode->defenc);
				261	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	262	}
				263	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	264	(PyUnicodeObject *)unicode = unicode_freelist;
				265	unicode_freelist = unicode;
				266	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	267	}
				268	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	269	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	270	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	271	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	}
				274
				275	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				276	int size)
				277	{
				278	PyUnicodeObject *unicode;
				279
				280	unicode = _PyUnicode_New(size);
				281	if (!unicode)
				282	return NULL;
				283
				284	/* Copy the Unicode data into the new object */
				285	if (u != NULL)
				286	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				287
				288	return (PyObject *)unicode;
				289	}
				290
				291	#ifdef HAVE_WCHAR_H
				292
				293	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				294	int size)
				295	{
				296	PyUnicodeObject *unicode;
				297
				298	if (w == NULL) {
				299	PyErr_BadInternalCall();
				300	return NULL;
				301	}
				302
				303	unicode = _PyUnicode_New(size);
				304	if (!unicode)
				305	return NULL;
				306
				307	/* Copy the wchar_t data into the new object */
				308	#ifdef HAVE_USABLE_WCHAR_T
				309	memcpy(unicode->str, w, size * sizeof(wchar_t));
				310	#else
				311	{
				312	register Py_UNICODE *u;
				313	register int i;
				314	u = PyUnicode_AS_UNICODE(unicode);
				315	for (i = size; i >= 0; i--)
				316	u++ = w++;
				317	}
				318	#endif
				319
				320	return (PyObject *)unicode;
				321	}
				322
				323	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				324	register wchar_t *w,
				325	int size)
				326	{
				327	if (unicode == NULL) {
				328	PyErr_BadInternalCall();
				329	return -1;
				330	}
				331	if (size > PyUnicode_GET_SIZE(unicode))
				332	size = PyUnicode_GET_SIZE(unicode);
				333	#ifdef HAVE_USABLE_WCHAR_T
				334	memcpy(w, unicode->str, size * sizeof(wchar_t));
				335	#else
				336	{
				337	register Py_UNICODE *u;
				338	register int i;
				339	u = PyUnicode_AS_UNICODE(unicode);
				340	for (i = size; i >= 0; i--)
				341	w++ = u++;
				342	}
				343	#endif
				344
				345	return size;
				346	}
				347
				348	#endif
				349
				350	PyObject PyUnicode_FromObject(register PyObject obj)
				351	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	352	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				353	}
				354
				355	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				356	const char *encoding,
				357	const char *errors)
				358	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	const char *s;
				360	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	361	int owned = 0;
				362	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	363
				364	if (obj == NULL) {
				365	PyErr_BadInternalCall();
				366	return NULL;
				367	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368
				369	/* Coerce object */
				370	if (PyInstance_Check(obj)) {
				371	PyObject *func;
				372	func = PyObject_GetAttrString(obj, "__str__");
				373	if (func == NULL) {
				374	PyErr_SetString(PyExc_TypeError,
				375	"coercing to Unicode: instance doesn't define __str__");
				376	return NULL;
				377	}
				378	obj = PyEval_CallObject(func, NULL);
				379	Py_DECREF(func);
				380	if (obj == NULL)
				381	return NULL;
				382	owned = 1;
				383	}
				384	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	385	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = obj;
				387	if (encoding) {
				388	PyErr_SetString(PyExc_TypeError,
				389	"decoding Unicode is not supported");
				390	return NULL;
				391	}
				392	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	393	}
				394	else if (PyString_Check(obj)) {
				395	s = PyString_AS_STRING(obj);
				396	len = PyString_GET_SIZE(obj);
				397	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	398	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				399	/* Overwrite the error message with something more useful in
				400	case of a TypeError. */
				401	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	402	PyErr_Format(PyExc_TypeError,
				403	"coercing to Unicode: need string or buffer, "
				404	"%.80s found",
				405	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	406	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	407	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	408
				409	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	410	if (len == 0) {
				411	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414	else
				415	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	416
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	417	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	418	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	419	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	420	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	421	return v;
				422
				423	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	424	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	425	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	426	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	427	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	428	}
				429
				430	PyObject PyUnicode_Decode(const char s,
				431	int size,
				432	const char *encoding,
				433	const char *errors)
				434	{
				435	PyObject buffer = NULL, unicode;
				436
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	437	if (encoding == NULL)
				438	encoding = PyUnicode_GetDefaultEncoding();
				439
				440	/* Shortcuts for common default encodings */
				441	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	442	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	443	else if (strcmp(encoding, "latin-1") == 0)
				444	return PyUnicode_DecodeLatin1(s, size, errors);
				445	else if (strcmp(encoding, "ascii") == 0)
				446	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	447
				448	/* Decode via the codec registry */
				449	buffer = PyBuffer_FromMemory((void *)s, size);
				450	if (buffer == NULL)
				451	goto onError;
				452	unicode = PyCodec_Decode(buffer, encoding, errors);
				453	if (unicode == NULL)
				454	goto onError;
				455	if (!PyUnicode_Check(unicode)) {
				456	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	457	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	458	unicode->ob_type->tp_name);
				459	Py_DECREF(unicode);
				460	goto onError;
				461	}
				462	Py_DECREF(buffer);
				463	return unicode;
				464
				465	onError:
				466	Py_XDECREF(buffer);
				467	return NULL;
				468	}
				469
				470	PyObject PyUnicode_Encode(const Py_UNICODE s,
				471	int size,
				472	const char *encoding,
				473	const char *errors)
				474	{
				475	PyObject v, unicode;
				476
				477	unicode = PyUnicode_FromUnicode(s, size);
				478	if (unicode == NULL)
				479	return NULL;
				480	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				481	Py_DECREF(unicode);
				482	return v;
				483	}
				484
				485	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				486	const char *encoding,
				487	const char *errors)
				488	{
				489	PyObject *v;
				490
				491	if (!PyUnicode_Check(unicode)) {
				492	PyErr_BadArgument();
				493	goto onError;
				494	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	495
				496	if (encoding == NULL)
				497	encoding = PyUnicode_GetDefaultEncoding();
				498
				499	/* Shortcuts for common default encodings */
				500	if (errors == NULL) {
				501	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	502	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	503	else if (strcmp(encoding, "latin-1") == 0)
				504	return PyUnicode_AsLatin1String(unicode);
				505	else if (strcmp(encoding, "ascii") == 0)
				506	return PyUnicode_AsASCIIString(unicode);
				507	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	508
				509	/* Encode via the codec registry */
				510	v = PyCodec_Encode(unicode, encoding, errors);
				511	if (v == NULL)
				512	goto onError;
				513	/* XXX Should we really enforce this ? */
				514	if (!PyString_Check(v)) {
				515	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	516	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	517	v->ob_type->tp_name);
				518	Py_DECREF(v);
				519	goto onError;
				520	}
				521	return v;
				522
				523	onError:
				524	return NULL;
				525	}
				526
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	527	/* Return a Python string holding the default encoded value of the
				528	Unicode object.
				529
				530	The resulting string is cached in the Unicode object for subsequent
				531	usage by this function. The cached version is needed to implement
				532	the character buffer interface and will live (at least) as long as
				533	the Unicode object itself.
				534
				535	The refcount of the string is not incremented.
				536
				537	* Exported for internal use by the interpreter only !!! *
				538
				539	*/
				540
				541	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				542	const char *errors)
				543	{
				544	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				545
				546	if (v)
				547	return v;
				548	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				549	if (v && errors == NULL)
				550	((PyUnicodeObject *)unicode)->defenc = v;
				551	return v;
				552	}
				553
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	554	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				555	{
				556	if (!PyUnicode_Check(unicode)) {
				557	PyErr_BadArgument();
				558	goto onError;
				559	}
				560	return PyUnicode_AS_UNICODE(unicode);
				561
				562	onError:
				563	return NULL;
				564	}
				565
				566	int PyUnicode_GetSize(PyObject *unicode)
				567	{
				568	if (!PyUnicode_Check(unicode)) {
				569	PyErr_BadArgument();
				570	goto onError;
				571	}
				572	return PyUnicode_GET_SIZE(unicode);
				573
				574	onError:
				575	return -1;
				576	}
				577
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	578	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	579	{
				580	return unicode_default_encoding;
				581	}
				582
				583	int PyUnicode_SetDefaultEncoding(const char *encoding)
				584	{
				585	PyObject *v;
				586
				587	/* Make sure the encoding is valid. As side effect, this also
				588	loads the encoding into the codec registry cache. */
				589	v = _PyCodec_Lookup(encoding);
				590	if (v == NULL)
				591	goto onError;
				592	Py_DECREF(v);
				593	strncpy(unicode_default_encoding,
				594	encoding,
				595	sizeof(unicode_default_encoding));
				596	return 0;
				597
				598	onError:
				599	return -1;
				600	}
				601
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details. */

static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
				625
				626	static
				627	int utf8_decoding_error(const char **source,
				628	Py_UNICODE **dest,
				629	const char *errors,
				630	const char *details)
				631	{
				632	if ((errors == NULL) \|\|
				633	(strcmp(errors,"strict") == 0)) {
				634	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	635	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	636	details);
				637	return -1;
				638	}
				639	else if (strcmp(errors,"ignore") == 0) {
				640	(*source)++;
				641	return 0;
				642	}
				643	else if (strcmp(errors,"replace") == 0) {
				644	(*source)++;
				645	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				646	(*dest)++;
				647	return 0;
				648	}
				649	else {
				650	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	651	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	652	errors);
				653	return -1;
				654	}
				655	}
				656
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	657	PyObject PyUnicode_DecodeUTF8(const char s,
				658	int size,
				659	const char *errors)
				660	{
				661	int n;
				662	const char *e;
				663	PyUnicodeObject *unicode;
				664	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	665	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	666
				667	/* Note: size will always be longer than the resulting Unicode
				668	character count */
				669	unicode = _PyUnicode_New(size);
				670	if (!unicode)
				671	return NULL;
				672	if (size == 0)
				673	return (PyObject *)unicode;
				674
				675	/* Unpack UTF-8 encoded data */
				676	p = unicode->str;
				677	e = s + size;
				678
				679	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	680	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	681
				682	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	683	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	684	s++;
				685	continue;
				686	}
				687
				688	n = utf8_code_length[ch];
				689
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	690	if (s + n > e) {
				691	errmsg = "unexpected end of data";
				692	goto utf8Error;
				693	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	694
				695	switch (n) {
				696
				697	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	698	errmsg = "unexpected code byte";
				699	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	700	break;
				701
				702	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	703	errmsg = "internal error";
				704	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	705	break;
				706
				707	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	708	if ((s[1] & 0xc0) != 0x80) {
				709	errmsg = "invalid data";
				710	goto utf8Error;
				711	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	712	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	713	if (ch < 0x80) {
				714	errmsg = "illegal encoding";
				715	goto utf8Error;
				716	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	717	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	719	break;
				720
				721	case 3:
				722	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	723	(s[2] & 0xc0) != 0x80) {
				724	errmsg = "invalid data";
				725	goto utf8Error;
				726	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	727	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	728	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				729	errmsg = "illegal encoding";
				730	goto utf8Error;
				731	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	732	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	733	*p++ = (Py_UNICODE)ch;
				734	break;
				735
				736	case 4:
				737	if ((s[1] & 0xc0) != 0x80 \|\|
				738	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	739	(s[3] & 0xc0) != 0x80) {
				740	errmsg = "invalid data";
				741	goto utf8Error;
				742	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	743	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				744	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				745	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				747	byte encoding */
				748	(ch > 0x10ffff)) { /* maximum value allowed for
				749	UTF-16 */
				750	errmsg = "illegal encoding";
				751	goto utf8Error;
				752	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	753	/* compute and append the two surrogates: */
				754
				755	/* translate from 10000..10FFFF to 0..FFFF */
				756	ch -= 0x10000;
				757
				758	/* high surrogate = top 10 bits added to D800 */
				759	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				760
				761	/* low surrogate = bottom 10 bits added to DC00 */
				762	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	763	break;
				764
				765	default:
				766	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	767	errmsg = "unsupported Unicode code range";
				768	goto utf8Error;
				769	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	770	}
				771	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	772	continue;
				773
				774	utf8Error:
				775	if (utf8_decoding_error(&s, &p, errors, errmsg))
				776	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	777	}
				778
				779	/* Adjust length */
				780	if (_PyUnicode_Resize(unicode, p - unicode->str))
				781	goto onError;
				782
				783	return (PyObject *)unicode;
				784
				785	onError:
				786	Py_DECREF(unicode);
				787	return NULL;
				788	}
				789
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	790	/* Not used anymore, now that the encoder supports UTF-16
				791	surrogates. */
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	792	#if 0
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	793	static
				794	int utf8_encoding_error(const Py_UNICODE **source,
				795	char **dest,
				796	const char *errors,
				797	const char *details)
				798	{
				799	if ((errors == NULL) \|\|
				800	(strcmp(errors,"strict") == 0)) {
				801	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	802	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	803	details);
				804	return -1;
				805	}
				806	else if (strcmp(errors,"ignore") == 0) {
				807	return 0;
				808	}
				809	else if (strcmp(errors,"replace") == 0) {
				810	**dest = '?';
				811	(*dest)++;
				812	return 0;
				813	}
				814	else {
				815	PyErr_Format(PyExc_ValueError,
				816	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	817	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	818	errors);
				819	return -1;
				820	}
				821	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	822	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	823
				824	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				825	int size,
				826	const char *errors)
				827	{
				828	PyObject *v;
				829	char *p;
				830	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	831	Py_UCS4 ch2;
				832	unsigned int cbAllocated = 3 * size;
				833	unsigned int cbWritten = 0;
				834	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	835
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	836	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	837	if (v == NULL)
				838	return NULL;
				839	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	840	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	841
				842	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	843	while (i < size) {
				844	Py_UCS4 ch = s[i++];
				845	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	846	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	847	cbWritten++;
				848	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	849	else if (ch < 0x0800) {
				850	*p++ = 0xc0 \| (ch >> 6);
				851	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	852	cbWritten += 2;
				853	}
				854	else {
				855	/* Check for high surrogate */
				856	if (0xD800 <= ch && ch <= 0xDBFF) {
				857	if (i != size) {
				858	ch2 = s[i];
				859	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				860
				861	if (cbWritten >= (cbAllocated - 4)) {
				862	/* Provide enough room for some more
				863	surrogates */
				864	cbAllocated += 4*10;
				865	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	866	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	867	}
				868
				869	/* combine the two values */
				870	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				871
				872	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	873	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	874	i++;
				875	cbWritten += 4;
				876	}
				877	}
				878	}
				879	else {
				880	*p++ = (char)(0xe0 \| (ch >> 12));
				881	cbWritten += 3;
				882	}
				883	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				884	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	885	}
				886	}
				887	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	888	if (_PyString_Resize(&v, p - q))
				889	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	return v;
				891
				892	onError:
				893	Py_DECREF(v);
				894	return NULL;
				895	}
				896
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	897	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				898	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	899	if (!PyUnicode_Check(unicode)) {
				900	PyErr_BadArgument();
				901	return NULL;
				902	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	903	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				904	PyUnicode_GET_SIZE(unicode),
				905	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	906	}
				907
				908	/* --- UTF-16 Codec ------------------------------------------------------- */
				909
				910	static
				911	int utf16_decoding_error(const Py_UNICODE **source,
				912	Py_UNICODE **dest,
				913	const char *errors,
				914	const char *details)
				915	{
				916	if ((errors == NULL) \|\|
				917	(strcmp(errors,"strict") == 0)) {
				918	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	919	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	920	details);
				921	return -1;
				922	}
				923	else if (strcmp(errors,"ignore") == 0) {
				924	return 0;
				925	}
				926	else if (strcmp(errors,"replace") == 0) {
				927	if (dest) {
				928	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				929	(*dest)++;
				930	}
				931	return 0;
				932	}
				933	else {
				934	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	935	"UTF-16 decoding error; "
				936	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	937	errors);
				938	return -1;
				939	}
				940	}
				941
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	942	PyObject PyUnicode_DecodeUTF16(const char s,
				943	int size,
				944	const char *errors,
				945	int *byteorder)
				946	{
				947	PyUnicodeObject *unicode;
				948	Py_UNICODE *p;
				949	const Py_UNICODE q, e;
				950	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	951	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	952
				953	/* size should be an even number */
				954	if (size % sizeof(Py_UNICODE) != 0) {
				955	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				956	return NULL;
				957	/* The remaining input chars are ignored if we fall through
				958	here... */
				959	}
				960
				961	/* Note: size will always be longer than the resulting Unicode
				962	character count */
				963	unicode = _PyUnicode_New(size);
				964	if (!unicode)
				965	return NULL;
				966	if (size == 0)
				967	return (PyObject *)unicode;
				968
				969	/* Unpack UTF-16 encoded data */
				970	p = unicode->str;
				971	q = (Py_UNICODE *)s;
				972	e = q + (size / sizeof(Py_UNICODE));
				973
				974	if (byteorder)
				975	bo = *byteorder;
				976
				977	while (q < e) {
				978	register Py_UNICODE ch = *q++;
				979
				980	/* Check for BOM marks (U+FEFF) in the input and adjust
				981	current byte order setting accordingly. Swap input
				982	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				983	!) */
				984	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				985	if (ch == 0xFEFF) {
				986	bo = -1;
				987	continue;
				988	} else if (ch == 0xFFFE) {
				989	bo = 1;
				990	continue;
				991	}
				992	if (bo == 1)
				993	ch = (ch >> 8) \| (ch << 8);
				994	#else
				995	if (ch == 0xFEFF) {
				996	bo = 1;
				997	continue;
				998	} else if (ch == 0xFFFE) {
				999	bo = -1;
				1000	continue;
				1001	}
				1002	if (bo == -1)
				1003	ch = (ch >> 8) \| (ch << 8);
				1004	#endif
				1005	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1006	*p++ = ch;
				1007	continue;
				1008	}
				1009
				1010	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1011	if (q >= e) {
				1012	errmsg = "unexpected end of data";
				1013	goto utf16Error;
				1014	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1015	if (0xDC00 <= q && q <= 0xDFFF) {
				1016	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1017	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1018	/* This is valid data (a UTF-16 surrogate pair), but
				1019	we are not able to store this information since our
				1020	Py_UNICODE type only has 16 bits... this might
				1021	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1022	errmsg = "code pairs are not supported";
				1023	goto utf16Error;
				1024	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1025	else
				1026	continue;
				1027	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1028	errmsg = "illegal encoding";
				1029	/* Fall through to report the error */
				1030
				1031	utf16Error:
				1032	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1033	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1034	}
				1035
				1036	if (byteorder)
				1037	*byteorder = bo;
				1038
				1039	/* Adjust length */
				1040	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1041	goto onError;
				1042
				1043	return (PyObject *)unicode;
				1044
				1045	onError:
				1046	Py_DECREF(unicode);
				1047	return NULL;
				1048	}
				1049
				1050	#undef UTF16_ERROR
				1051
				1052	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1053	int size,
				1054	const char *errors,
				1055	int byteorder)
				1056	{
				1057	PyObject *v;
				1058	Py_UNICODE *p;
				1059	char *q;
				1060
				1061	/* We don't create UTF-16 pairs... */
				1062	v = PyString_FromStringAndSize(NULL,
				1063	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1064	if (v == NULL)
				1065	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1066
				1067	q = PyString_AS_STRING(v);
				1068	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1069	if (byteorder == 0)
				1070	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1071	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1072	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1073	if (byteorder == 0 \|\|
				1074	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1075	byteorder == -1
				1076	#else
				1077	byteorder == 1
				1078	#endif
				1079	)
				1080	memcpy(p, s, size * sizeof(Py_UNICODE));
				1081	else
				1082	while (size-- > 0) {
				1083	Py_UNICODE ch = *s++;
				1084	*p++ = (ch >> 8) \| (ch << 8);
				1085	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1086	return v;
				1087	}
				1088
				1089	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1090	{
				1091	if (!PyUnicode_Check(unicode)) {
				1092	PyErr_BadArgument();
				1093	return NULL;
				1094	}
				1095	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1096	PyUnicode_GET_SIZE(unicode),
				1097	NULL,
				1098	0);
				1099	}
				1100
				1101	/* --- Unicode Escape Codec ----------------------------------------------- */
				1102
				1103	static
				1104	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1105	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1106	const char *errors,
				1107	const char *details)
				1108	{
				1109	if ((errors == NULL) \|\|
				1110	(strcmp(errors,"strict") == 0)) {
				1111	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1112	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1113	details);
				1114	return -1;
				1115	}
				1116	else if (strcmp(errors,"ignore") == 0) {
				1117	return 0;
				1118	}
				1119	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1120	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1121	return 0;
				1122	}
				1123	else {
				1124	PyErr_Format(PyExc_ValueError,
				1125	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1126	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1127	errors);
				1128	return -1;
				1129	}
				1130	}
				1131
/* Cached pointer to the Unicode character name hash API.  Starts out
   NULL; the \N{...} handling in PyUnicode_DecodeUnicodeEscape() fills it
   in from the "ucnhashAPI" attribute of the "ucnhash" module the first
   time it is needed. */
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1132	static _Py_UCNHashAPI *pucnHash = NULL;
				1133
				1134	static
				1135	int mystrnicmp(const char s1, const char s2, size_t count)
				1136	{
				1137	char c1, c2;
				1138
				1139	if (count)
				1140	{
				1141	do
				1142	{
				1143	c1 = tolower(*(s1++));
				1144	c2 = tolower(*(s2++));
				1145	}
				1146	while(--count && c1 == c2);
				1147
				1148	return c1 - c2;
				1149	}
				1150
				1151	return 0;
				1152	}
				1153
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1154	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1155	int size,
				1156	const char *errors)
				1157	{
				1158	PyUnicodeObject *v;
				1159	Py_UNICODE p = NULL, buf = NULL;
				1160	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1161	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1162
				1163	/* Escaped strings will always be longer than the resulting
				1164	Unicode string, so we start with size here and then reduce the
				1165	length after conversion to the true value. */
				1166	v = _PyUnicode_New(size);
				1167	if (v == NULL)
				1168	goto onError;
				1169	if (size == 0)
				1170	return (PyObject *)v;
				1171	p = buf = PyUnicode_AS_UNICODE(v);
				1172	end = s + size;
				1173	while (s < end) {
				1174	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1175	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1176	int i;
				1177
				1178	/* Non-escape characters are interpreted as Unicode ordinals */
				1179	if (*s != '\\') {
				1180	p++ = (unsigned char)s++;
				1181	continue;
				1182	}
				1183
				1184	/* \ - Escapes */
				1185	s++;
				1186	switch (*s++) {
				1187
				1188	/* \x escapes */
				1189	case '\n': break;
				1190	case '\\': *p++ = '\\'; break;
				1191	case '\'': *p++ = '\''; break;
				1192	case '\"': *p++ = '\"'; break;
				1193	case 'b': *p++ = '\b'; break;
				1194	case 'f': p++ = '\014'; break; / FF */
				1195	case 't': *p++ = '\t'; break;
				1196	case 'n': *p++ = '\n'; break;
				1197	case 'r': *p++ = '\r'; break;
				1198	case 'v': p++ = '\013'; break; / VT */
				1199	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1200
				1201	/* \OOO (octal) escapes */
				1202	case '0': case '1': case '2': case '3':
				1203	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1204	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1205	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1206	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1207	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1208	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1209	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1210	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1211	break;
				1212
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1213	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1214	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1215	for (x = 0, i = 0; i < 2; i++) {
				1216	c = (unsigned char)s[i];
				1217	if (!isxdigit(c)) {
				1218	if (unicodeescape_decoding_error(&s, &x, errors,
				1219	"truncated \\xXX"))
				1220	goto onError;
				1221	i++;
				1222	break;
				1223	}
				1224	x = (x<<4) & ~0xF;
				1225	if (c >= '0' && c <= '9')
				1226	x += c - '0';
				1227	else if (c >= 'a' && c <= 'f')
				1228	x += 10 + c - 'a';
				1229	else
				1230	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1231	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1232	s += i;
				1233	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1234	break;
				1235
				1236	/* \uXXXX with 4 hex digits */
				1237	case 'u':
				1238	for (x = 0, i = 0; i < 4; i++) {
				1239	c = (unsigned char)s[i];
				1240	if (!isxdigit(c)) {
				1241	if (unicodeescape_decoding_error(&s, &x, errors,
				1242	"truncated \\uXXXX"))
				1243	goto onError;
				1244	i++;
				1245	break;
				1246	}
				1247	x = (x<<4) & ~0xF;
				1248	if (c >= '0' && c <= '9')
				1249	x += c - '0';
				1250	else if (c >= 'a' && c <= 'f')
				1251	x += 10 + c - 'a';
				1252	else
				1253	x += 10 + c - 'A';
				1254	}
				1255	s += i;
				1256	*p++ = x;
				1257	break;
				1258
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1259	/* \UXXXXXXXX with 8 hex digits */
				1260	case 'U':
				1261	for (chr = 0, i = 0; i < 8; i++) {
				1262	c = (unsigned char)s[i];
				1263	if (!isxdigit(c)) {
				1264	if (unicodeescape_decoding_error(&s, &x, errors,
				1265	"truncated \\uXXXX"))
				1266	goto onError;
				1267	i++;
				1268	break;
				1269	}
				1270	chr = (chr<<4) & ~0xF;
				1271	if (c >= '0' && c <= '9')
				1272	chr += c - '0';
				1273	else if (c >= 'a' && c <= 'f')
				1274	chr += 10 + c - 'a';
				1275	else
				1276	chr += 10 + c - 'A';
				1277	}
				1278	s += i;
				1279	goto store;
				1280
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1281	case 'N':
				1282	/* Ok, we need to deal with Unicode Character Names now,
				1283	* make sure we've imported the hash table data...
				1284	*/
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1285	if (pucnHash == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1286	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1287	mod = PyImport_ImportModule("ucnhash");
				1288	if (mod == NULL)
				1289	goto onError;
				1290	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1291	Py_DECREF(mod);
				1292	if (v == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1293	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1294	pucnHash = PyCObject_AsVoidPtr(v);
				1295	Py_DECREF(v);
				1296	if (pucnHash == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1297	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1298	}
				1299
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1300	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1301	const char *start = s + 1;
				1302	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1303	unsigned long j;
				1304
				1305	/* look for either the closing brace, or we
				1306	* exceed the maximum length of the unicode character names
				1307	*/
				1308	while (*endBrace != '}' &&
				1309	(unsigned int)(endBrace - start) <=
				1310	pucnHash->cchMax &&
				1311	endBrace < end)
				1312	{
				1313	endBrace++;
				1314	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1315	if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1316	j = pucnHash->hash(start, endBrace - start);
				1317	if (j > pucnHash->cKeys \|\|
				1318	mystrnicmp(
				1319	start,
				1320	((_Py_UnicodeCharacterName *)
				1321	(pucnHash->getValue(j)))->pszUCN,
				1322	(int)(endBrace - start)) != 0)
				1323	{
				1324	if (unicodeescape_decoding_error(
				1325	&s, &x, errors,
				1326	"Invalid Unicode Character Name"))
				1327	{
				1328	goto onError;
				1329	}
				1330	goto ucnFallthrough;
				1331	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1332	chr = ((_Py_UnicodeCharacterName *)
				1333	(pucnHash->getValue(j)))->value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1334	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1335	goto store;
				1336	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1337	if (unicodeescape_decoding_error(
				1338	&s, &x, errors,
				1339	"Unicode name missing closing brace"))
				1340	goto onError;
				1341	goto ucnFallthrough;
				1342	}
				1343	break;
				1344	}
				1345	if (unicodeescape_decoding_error(
				1346	&s, &x, errors,
				1347	"Missing opening brace for Unicode Character Name escape"))
				1348	goto onError;
				1349	ucnFallthrough:
				1350	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1351	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1352	*p++ = '\\';
				1353	*p++ = (unsigned char)s[-1];
				1354	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1355	store:
				1356	/* when we get here, chr is a 32-bit unicode character */
				1357	if (chr <= 0xffff)
				1358	/* UCS-2 character */
				1359	*p++ = (Py_UNICODE) chr;
				1360	else if (chr <= 0x10ffff) {
				1361	/* UCS-4 character. store as two surrogate characters */
				1362	chr -= 0x10000L;
				1363	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1364	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1365	} else {
				1366	if (unicodeescape_decoding_error(
				1367	&s, &x, errors,
				1368	"Illegal Unicode character")
				1369	)
				1370	goto onError;
				1371	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1372	}
				1373	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1374	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1375	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1376	return (PyObject *)v;
				1377
				1378	onError:
				1379	Py_XDECREF(v);
				1380	return NULL;
				1381	}
				1382
				1383	/* Return a Unicode-Escape string version of the Unicode object.
				1384
				1385	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1386	appropriate.
				1387
				1388	*/
				1389
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1390	static const Py_UNICODE findchar(const Py_UNICODE s,
				1391	int size,
				1392	Py_UNICODE ch);
				1393
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1394	static
				1395	PyObject unicodeescape_string(const Py_UNICODE s,
				1396	int size,
				1397	int quotes)
				1398	{
				1399	PyObject *repr;
				1400	char *p;
				1401	char *q;
				1402
				1403	static const char *hexdigit = "0123456789ABCDEF";
				1404
				1405	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1406	if (repr == NULL)
				1407	return NULL;
				1408
				1409	p = q = PyString_AS_STRING(repr);
				1410
				1411	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1412	*p++ = 'u';
				1413	*p++ = (findchar(s, size, '\'') &&
				1414	!findchar(s, size, '"')) ? '"' : '\'';
				1415	}
				1416	while (size-- > 0) {
				1417	Py_UNICODE ch = *s++;
				1418	/* Escape quotes */
				1419	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1420	*p++ = '\\';
				1421	*p++ = (char) ch;
				1422	}
				1423	/* Map 16-bit characters to '\uxxxx' */
				1424	else if (ch >= 256) {
				1425	*p++ = '\\';
				1426	*p++ = 'u';
				1427	*p++ = hexdigit[(ch >> 12) & 0xf];
				1428	*p++ = hexdigit[(ch >> 8) & 0xf];
				1429	*p++ = hexdigit[(ch >> 4) & 0xf];
				1430	*p++ = hexdigit[ch & 15];
				1431	}
				1432	/* Map non-printable US ASCII to '\ooo' */
				1433	else if (ch < ' ' \|\| ch >= 128) {
				1434	*p++ = '\\';
				1435	*p++ = hexdigit[(ch >> 6) & 7];
				1436	*p++ = hexdigit[(ch >> 3) & 7];
				1437	*p++ = hexdigit[ch & 7];
				1438	}
				1439	/* Copy everything else as-is */
				1440	else
				1441	*p++ = (char) ch;
				1442	}
				1443	if (quotes)
				1444	*p++ = q[1];
				1445
				1446	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1447	if (_PyString_Resize(&repr, p - q))
				1448	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1449
				1450	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1451
				1452	onError:
				1453	Py_DECREF(repr);
				1454	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1455	}
				1456
				1457	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1458	int size)
				1459	{
				1460	return unicodeescape_string(s, size, 0);
				1461	}
				1462
				1463	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1464	{
				1465	if (!PyUnicode_Check(unicode)) {
				1466	PyErr_BadArgument();
				1467	return NULL;
				1468	}
				1469	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1470	PyUnicode_GET_SIZE(unicode));
				1471	}
				1472
				1473	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1474
				1475	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1476	int size,
				1477	const char *errors)
				1478	{
				1479	PyUnicodeObject *v;
				1480	Py_UNICODE p, buf;
				1481	const char *end;
				1482	const char *bs;
				1483
				1484	/* Escaped strings will always be longer than the resulting
				1485	Unicode string, so we start with size here and then reduce the
				1486	length after conversion to the true value. */
				1487	v = _PyUnicode_New(size);
				1488	if (v == NULL)
				1489	goto onError;
				1490	if (size == 0)
				1491	return (PyObject *)v;
				1492	p = buf = PyUnicode_AS_UNICODE(v);
				1493	end = s + size;
				1494	while (s < end) {
				1495	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1496	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1497	int i;
				1498
				1499	/* Non-escape characters are interpreted as Unicode ordinals */
				1500	if (*s != '\\') {
				1501	p++ = (unsigned char)s++;
				1502	continue;
				1503	}
				1504
				1505	/* \u-escapes are only interpreted iff the number of leading
				1506	backslashes if odd */
				1507	bs = s;
				1508	for (;s < end;) {
				1509	if (*s != '\\')
				1510	break;
				1511	p++ = (unsigned char)s++;
				1512	}
				1513	if (((s - bs) & 1) == 0 \|\|
				1514	s >= end \|\|
				1515	*s != 'u') {
				1516	continue;
				1517	}
				1518	p--;
				1519	s++;
				1520
				1521	/* \uXXXX with 4 hex digits */
				1522	for (x = 0, i = 0; i < 4; i++) {
				1523	c = (unsigned char)s[i];
				1524	if (!isxdigit(c)) {
				1525	if (unicodeescape_decoding_error(&s, &x, errors,
				1526	"truncated \\uXXXX"))
				1527	goto onError;
				1528	i++;
				1529	break;
				1530	}
				1531	x = (x<<4) & ~0xF;
				1532	if (c >= '0' && c <= '9')
				1533	x += c - '0';
				1534	else if (c >= 'a' && c <= 'f')
				1535	x += 10 + c - 'a';
				1536	else
				1537	x += 10 + c - 'A';
				1538	}
				1539	s += i;
				1540	*p++ = x;
				1541	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1542	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1543	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1544	return (PyObject *)v;
				1545
				1546	onError:
				1547	Py_XDECREF(v);
				1548	return NULL;
				1549	}
				1550
				1551	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1552	int size)
				1553	{
				1554	PyObject *repr;
				1555	char *p;
				1556	char *q;
				1557
				1558	static const char *hexdigit = "0123456789ABCDEF";
				1559
				1560	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1561	if (repr == NULL)
				1562	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1563	if (size == 0)
				1564	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1565
				1566	p = q = PyString_AS_STRING(repr);
				1567	while (size-- > 0) {
				1568	Py_UNICODE ch = *s++;
				1569	/* Map 16-bit characters to '\uxxxx' */
				1570	if (ch >= 256) {
				1571	*p++ = '\\';
				1572	*p++ = 'u';
				1573	*p++ = hexdigit[(ch >> 12) & 0xf];
				1574	*p++ = hexdigit[(ch >> 8) & 0xf];
				1575	*p++ = hexdigit[(ch >> 4) & 0xf];
				1576	*p++ = hexdigit[ch & 15];
				1577	}
				1578	/* Copy everything else as-is */
				1579	else
				1580	*p++ = (char) ch;
				1581	}
				1582	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1583	if (_PyString_Resize(&repr, p - q))
				1584	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1585
				1586	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1587
				1588	onError:
				1589	Py_DECREF(repr);
				1590	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1591	}
				1592
				1593	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1594	{
				1595	if (!PyUnicode_Check(unicode)) {
				1596	PyErr_BadArgument();
				1597	return NULL;
				1598	}
				1599	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1600	PyUnicode_GET_SIZE(unicode));
				1601	}
				1602
				1603	/* --- Latin-1 Codec ------------------------------------------------------ */
				1604
				1605	PyObject PyUnicode_DecodeLatin1(const char s,
				1606	int size,
				1607	const char *errors)
				1608	{
				1609	PyUnicodeObject *v;
				1610	Py_UNICODE *p;
				1611
				1612	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1613	v = _PyUnicode_New(size);
				1614	if (v == NULL)
				1615	goto onError;
				1616	if (size == 0)
				1617	return (PyObject *)v;
				1618	p = PyUnicode_AS_UNICODE(v);
				1619	while (size-- > 0)
				1620	p++ = (unsigned char)s++;
				1621	return (PyObject *)v;
				1622
				1623	onError:
				1624	Py_XDECREF(v);
				1625	return NULL;
				1626	}
				1627
				1628	static
				1629	int latin1_encoding_error(const Py_UNICODE **source,
				1630	char **dest,
				1631	const char *errors,
				1632	const char *details)
				1633	{
				1634	if ((errors == NULL) \|\|
				1635	(strcmp(errors,"strict") == 0)) {
				1636	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1637	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1638	details);
				1639	return -1;
				1640	}
				1641	else if (strcmp(errors,"ignore") == 0) {
				1642	return 0;
				1643	}
				1644	else if (strcmp(errors,"replace") == 0) {
				1645	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1646	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1647	return 0;
				1648	}
				1649	else {
				1650	PyErr_Format(PyExc_ValueError,
				1651	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1652	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1653	errors);
				1654	return -1;
				1655	}
				1656	}
				1657
				1658	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1659	int size,
				1660	const char *errors)
				1661	{
				1662	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1663	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1664
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1665	repr = PyString_FromStringAndSize(NULL, size);
				1666	if (repr == NULL)
				1667	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1668	if (size == 0)
				1669	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1670
				1671	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1672	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1673	while (size-- > 0) {
				1674	Py_UNICODE ch = *p++;
				1675	if (ch >= 256) {
				1676	if (latin1_encoding_error(&p, &s, errors,
				1677	"ordinal not in range(256)"))
				1678	goto onError;
				1679	}
				1680	else
				1681	*s++ = (char)ch;
				1682	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1683	/* Resize if error handling skipped some characters */
				1684	if (s - start < PyString_GET_SIZE(repr))
				1685	if (_PyString_Resize(&repr, s - start))
				1686	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1687	return repr;
				1688
				1689	onError:
				1690	Py_DECREF(repr);
				1691	return NULL;
				1692	}
				1693
				1694	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1695	{
				1696	if (!PyUnicode_Check(unicode)) {
				1697	PyErr_BadArgument();
				1698	return NULL;
				1699	}
				1700	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1701	PyUnicode_GET_SIZE(unicode),
				1702	NULL);
				1703	}
				1704
				1705	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1706
				1707	static
				1708	int ascii_decoding_error(const char **source,
				1709	Py_UNICODE **dest,
				1710	const char *errors,
				1711	const char *details)
				1712	{
				1713	if ((errors == NULL) \|\|
				1714	(strcmp(errors,"strict") == 0)) {
				1715	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1716	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1717	details);
				1718	return -1;
				1719	}
				1720	else if (strcmp(errors,"ignore") == 0) {
				1721	return 0;
				1722	}
				1723	else if (strcmp(errors,"replace") == 0) {
				1724	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1725	(*dest)++;
				1726	return 0;
				1727	}
				1728	else {
				1729	PyErr_Format(PyExc_ValueError,
				1730	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1731	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1732	errors);
				1733	return -1;
				1734	}
				1735	}
				1736
				1737	PyObject PyUnicode_DecodeASCII(const char s,
				1738	int size,
				1739	const char *errors)
				1740	{
				1741	PyUnicodeObject *v;
				1742	Py_UNICODE *p;
				1743
				1744	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1745	v = _PyUnicode_New(size);
				1746	if (v == NULL)
				1747	goto onError;
				1748	if (size == 0)
				1749	return (PyObject *)v;
				1750	p = PyUnicode_AS_UNICODE(v);
				1751	while (size-- > 0) {
				1752	register unsigned char c;
				1753
				1754	c = (unsigned char)*s++;
				1755	if (c < 128)
				1756	*p++ = c;
				1757	else if (ascii_decoding_error(&s, &p, errors,
				1758	"ordinal not in range(128)"))
				1759	goto onError;
				1760	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1761	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1762	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1763	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1764	return (PyObject *)v;
				1765
				1766	onError:
				1767	Py_XDECREF(v);
				1768	return NULL;
				1769	}
				1770
				1771	static
				1772	int ascii_encoding_error(const Py_UNICODE **source,
				1773	char **dest,
				1774	const char *errors,
				1775	const char *details)
				1776	{
				1777	if ((errors == NULL) \|\|
				1778	(strcmp(errors,"strict") == 0)) {
				1779	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1780	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1781	details);
				1782	return -1;
				1783	}
				1784	else if (strcmp(errors,"ignore") == 0) {
				1785	return 0;
				1786	}
				1787	else if (strcmp(errors,"replace") == 0) {
				1788	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1789	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1790	return 0;
				1791	}
				1792	else {
				1793	PyErr_Format(PyExc_ValueError,
				1794	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1795	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1796	errors);
				1797	return -1;
				1798	}
				1799	}
				1800
				1801	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1802	int size,
				1803	const char *errors)
				1804	{
				1805	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1806	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1807
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1808	repr = PyString_FromStringAndSize(NULL, size);
				1809	if (repr == NULL)
				1810	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1811	if (size == 0)
				1812	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1813
				1814	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1815	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1816	while (size-- > 0) {
				1817	Py_UNICODE ch = *p++;
				1818	if (ch >= 128) {
				1819	if (ascii_encoding_error(&p, &s, errors,
				1820	"ordinal not in range(128)"))
				1821	goto onError;
				1822	}
				1823	else
				1824	*s++ = (char)ch;
				1825	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1826	/* Resize if error handling skipped some characters */
				1827	if (s - start < PyString_GET_SIZE(repr))
				1828	if (_PyString_Resize(&repr, s - start))
				1829	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1830	return repr;
				1831
				1832	onError:
				1833	Py_DECREF(repr);
				1834	return NULL;
				1835	}
				1836
				1837	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1838	{
				1839	if (!PyUnicode_Check(unicode)) {
				1840	PyErr_BadArgument();
				1841	return NULL;
				1842	}
				1843	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1844	PyUnicode_GET_SIZE(unicode),
				1845	NULL);
				1846	}
				1847
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1848	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1849
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1850	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1851
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1852	PyObject PyUnicode_DecodeMBCS(const char s,
				1853	int size,
				1854	const char *errors)
				1855	{
				1856	PyUnicodeObject *v;
				1857	Py_UNICODE *p;
				1858
				1859	/* First get the size of the result */
				1860	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1861	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1862	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1863
				1864	v = _PyUnicode_New(usize);
				1865	if (v == NULL)
				1866	return NULL;
				1867	if (usize == 0)
				1868	return (PyObject *)v;
				1869	p = PyUnicode_AS_UNICODE(v);
				1870	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1871	Py_DECREF(v);
				1872	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1873	}
				1874
				1875	return (PyObject *)v;
				1876	}
				1877
				1878	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1879	int size,
				1880	const char *errors)
				1881	{
				1882	PyObject *repr;
				1883	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1884	DWORD mbcssize;
				1885
				1886	/* If there are no characters, bail now! */
				1887	if (size==0)
				1888	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1889
				1890	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1891	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1892	if (mbcssize==0)
				1893	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1894
				1895	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1896	if (repr == NULL)
				1897	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1898	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1899	return repr;
				1900
				1901	/* Do the conversion */
				1902	s = PyString_AS_STRING(repr);
				1903	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1904	Py_DECREF(repr);
				1905	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1906	}
				1907	return repr;
				1908	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1909
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1910	#endif /* MS_WIN32 */
				1911
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1912	/* --- Character Mapping Codec -------------------------------------------- */
				1913
				1914	static
				1915	int charmap_decoding_error(const char **source,
				1916	Py_UNICODE **dest,
				1917	const char *errors,
				1918	const char *details)
				1919	{
				1920	if ((errors == NULL) \|\|
				1921	(strcmp(errors,"strict") == 0)) {
				1922	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1923	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1924	details);
				1925	return -1;
				1926	}
				1927	else if (strcmp(errors,"ignore") == 0) {
				1928	return 0;
				1929	}
				1930	else if (strcmp(errors,"replace") == 0) {
				1931	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1932	(*dest)++;
				1933	return 0;
				1934	}
				1935	else {
				1936	PyErr_Format(PyExc_ValueError,
				1937	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1938	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1939	errors);
				1940	return -1;
				1941	}
				1942	}
				1943
				1944	PyObject PyUnicode_DecodeCharmap(const char s,
				1945	int size,
				1946	PyObject *mapping,
				1947	const char *errors)
				1948	{
				1949	PyUnicodeObject *v;
				1950	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1951	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1952
				1953	/* Default to Latin-1 */
				1954	if (mapping == NULL)
				1955	return PyUnicode_DecodeLatin1(s, size, errors);
				1956
				1957	v = _PyUnicode_New(size);
				1958	if (v == NULL)
				1959	goto onError;
				1960	if (size == 0)
				1961	return (PyObject *)v;
				1962	p = PyUnicode_AS_UNICODE(v);
				1963	while (size-- > 0) {
				1964	unsigned char ch = *s++;
				1965	PyObject w, x;
				1966
				1967	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1968	w = PyInt_FromLong((long)ch);
				1969	if (w == NULL)
				1970	goto onError;
				1971	x = PyObject_GetItem(mapping, w);
				1972	Py_DECREF(w);
				1973	if (x == NULL) {
				1974	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1975	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1976	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1977	x = Py_None;
				1978	Py_INCREF(x);
				1979	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1980	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1981	}
				1982
				1983	/* Apply mapping */
				1984	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1985	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1986	if (value < 0 \|\| value > 65535) {
				1987	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1988	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1989	Py_DECREF(x);
				1990	goto onError;
				1991	}
				1992	*p++ = (Py_UNICODE)value;
				1993	}
				1994	else if (x == Py_None) {
				1995	/* undefined mapping */
				1996	if (charmap_decoding_error(&s, &p, errors,
				1997	"character maps to <undefined>")) {
				1998	Py_DECREF(x);
				1999	goto onError;
				2000	}
				2001	}
				2002	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2003	int targetsize = PyUnicode_GET_SIZE(x);
				2004
				2005	if (targetsize == 1)
				2006	/* 1-1 mapping */
				2007	p++ = PyUnicode_AS_UNICODE(x);
				2008
				2009	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2010	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2011	if (targetsize > extrachars) {
				2012	/* resize first */
				2013	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2014	int needed = (targetsize - extrachars) + \
				2015	(targetsize << 2);
				2016	extrachars += needed;
				2017	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2018	Py_DECREF(x);
				2019	goto onError;
				2020	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2021	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2022	}
				2023	Py_UNICODE_COPY(p,
				2024	PyUnicode_AS_UNICODE(x),
				2025	targetsize);
				2026	p += targetsize;
				2027	extrachars -= targetsize;
				2028	}
				2029	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2030	}
				2031	else {
				2032	/* wrong return value */
				2033	PyErr_SetString(PyExc_TypeError,
				2034	"character mapping must return integer, None or unicode");
				2035	Py_DECREF(x);
				2036	goto onError;
				2037	}
				2038	Py_DECREF(x);
				2039	}
				2040	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2041	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2042	goto onError;
				2043	return (PyObject *)v;
				2044
				2045	onError:
				2046	Py_XDECREF(v);
				2047	return NULL;
				2048	}
				2049
				2050	static
				2051	int charmap_encoding_error(const Py_UNICODE **source,
				2052	char **dest,
				2053	const char *errors,
				2054	const char *details)
				2055	{
				2056	if ((errors == NULL) \|\|
				2057	(strcmp(errors,"strict") == 0)) {
				2058	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2059	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2060	details);
				2061	return -1;
				2062	}
				2063	else if (strcmp(errors,"ignore") == 0) {
				2064	return 0;
				2065	}
				2066	else if (strcmp(errors,"replace") == 0) {
				2067	**dest = '?';
				2068	(*dest)++;
				2069	return 0;
				2070	}
				2071	else {
				2072	PyErr_Format(PyExc_ValueError,
				2073	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2074	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2075	errors);
				2076	return -1;
				2077	}
				2078	}
				2079
				2080	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2081	int size,
				2082	PyObject *mapping,
				2083	const char *errors)
				2084	{
				2085	PyObject *v;
				2086	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2087	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2088
				2089	/* Default to Latin-1 */
				2090	if (mapping == NULL)
				2091	return PyUnicode_EncodeLatin1(p, size, errors);
				2092
				2093	v = PyString_FromStringAndSize(NULL, size);
				2094	if (v == NULL)
				2095	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2096	if (size == 0)
				2097	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2098	s = PyString_AS_STRING(v);
				2099	while (size-- > 0) {
				2100	Py_UNICODE ch = *p++;
				2101	PyObject w, x;
				2102
				2103	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2104	w = PyInt_FromLong((long)ch);
				2105	if (w == NULL)
				2106	goto onError;
				2107	x = PyObject_GetItem(mapping, w);
				2108	Py_DECREF(w);
				2109	if (x == NULL) {
				2110	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2111	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2112	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2113	x = Py_None;
				2114	Py_INCREF(x);
				2115	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2116	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2117	}
				2118
				2119	/* Apply mapping */
				2120	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2121	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2122	if (value < 0 \|\| value > 255) {
				2123	PyErr_SetString(PyExc_TypeError,
				2124	"character mapping must be in range(256)");
				2125	Py_DECREF(x);
				2126	goto onError;
				2127	}
				2128	*s++ = (char)value;
				2129	}
				2130	else if (x == Py_None) {
				2131	/* undefined mapping */
				2132	if (charmap_encoding_error(&p, &s, errors,
				2133	"character maps to <undefined>")) {
				2134	Py_DECREF(x);
				2135	goto onError;
				2136	}
				2137	}
				2138	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2139	int targetsize = PyString_GET_SIZE(x);
				2140
				2141	if (targetsize == 1)
				2142	/* 1-1 mapping */
				2143	s++ = PyString_AS_STRING(x);
				2144
				2145	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2146	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2147	if (targetsize > extrachars) {
				2148	/* resize first */
				2149	int oldpos = (int)(s - PyString_AS_STRING(v));
				2150	int needed = (targetsize - extrachars) + \
				2151	(targetsize << 2);
				2152	extrachars += needed;
				2153	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2154	Py_DECREF(x);
				2155	goto onError;
				2156	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2157	s = PyString_AS_STRING(v) + oldpos;
				2158	}
				2159	memcpy(s,
				2160	PyString_AS_STRING(x),
				2161	targetsize);
				2162	s += targetsize;
				2163	extrachars -= targetsize;
				2164	}
				2165	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2166	}
				2167	else {
				2168	/* wrong return value */
				2169	PyErr_SetString(PyExc_TypeError,
				2170	"character mapping must return integer, None or unicode");
				2171	Py_DECREF(x);
				2172	goto onError;
				2173	}
				2174	Py_DECREF(x);
				2175	}
				2176	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2177	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2178	goto onError;
				2179	return v;
				2180
				2181	onError:
				2182	Py_DECREF(v);
				2183	return NULL;
				2184	}
				2185
				2186	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2187	PyObject *mapping)
				2188	{
				2189	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2190	PyErr_BadArgument();
				2191	return NULL;
				2192	}
				2193	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2194	PyUnicode_GET_SIZE(unicode),
				2195	mapping,
				2196	NULL);
				2197	}
				2198
				2199	static
				2200	int translate_error(const Py_UNICODE **source,
				2201	Py_UNICODE **dest,
				2202	const char *errors,
				2203	const char *details)
				2204	{
				2205	if ((errors == NULL) \|\|
				2206	(strcmp(errors,"strict") == 0)) {
				2207	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2208	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2209	details);
				2210	return -1;
				2211	}
				2212	else if (strcmp(errors,"ignore") == 0) {
				2213	return 0;
				2214	}
				2215	else if (strcmp(errors,"replace") == 0) {
				2216	**dest = '?';
				2217	(*dest)++;
				2218	return 0;
				2219	}
				2220	else {
				2221	PyErr_Format(PyExc_ValueError,
				2222	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2223	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2224	errors);
				2225	return -1;
				2226	}
				2227	}
				2228
				2229	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2230	int size,
				2231	PyObject *mapping,
				2232	const char *errors)
				2233	{
				2234	PyUnicodeObject *v;
				2235	Py_UNICODE *p;
				2236
				2237	if (mapping == NULL) {
				2238	PyErr_BadArgument();
				2239	return NULL;
				2240	}
				2241
				2242	/* Output will never be longer than input */
				2243	v = _PyUnicode_New(size);
				2244	if (v == NULL)
				2245	goto onError;
				2246	if (size == 0)
				2247	goto done;
				2248	p = PyUnicode_AS_UNICODE(v);
				2249	while (size-- > 0) {
				2250	Py_UNICODE ch = *s++;
				2251	PyObject w, x;
				2252
				2253	/* Get mapping */
				2254	w = PyInt_FromLong(ch);
				2255	if (w == NULL)
				2256	goto onError;
				2257	x = PyObject_GetItem(mapping, w);
				2258	Py_DECREF(w);
				2259	if (x == NULL) {
				2260	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2261	/* No mapping found: default to 1-1 mapping */
				2262	PyErr_Clear();
				2263	*p++ = ch;
				2264	continue;
				2265	}
				2266	goto onError;
				2267	}
				2268
				2269	/* Apply mapping */
				2270	if (PyInt_Check(x))
				2271	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2272	else if (x == Py_None) {
				2273	/* undefined mapping */
				2274	if (translate_error(&s, &p, errors,
				2275	"character maps to <undefined>")) {
				2276	Py_DECREF(x);
				2277	goto onError;
				2278	}
				2279	}
				2280	else if (PyUnicode_Check(x)) {
				2281	if (PyUnicode_GET_SIZE(x) != 1) {
				2282	/* 1-n mapping */
				2283	PyErr_SetString(PyExc_NotImplementedError,
				2284	"1-n mappings are currently not implemented");
				2285	Py_DECREF(x);
				2286	goto onError;
				2287	}
				2288	p++ = PyUnicode_AS_UNICODE(x);
				2289	}
				2290	else {
				2291	/* wrong return value */
				2292	PyErr_SetString(PyExc_TypeError,
				2293	"translate mapping must return integer, None or unicode");
				2294	Py_DECREF(x);
				2295	goto onError;
				2296	}
				2297	Py_DECREF(x);
				2298	}
				2299	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2300	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2301	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2302
				2303	done:
				2304	return (PyObject *)v;
				2305
				2306	onError:
				2307	Py_XDECREF(v);
				2308	return NULL;
				2309	}
				2310
				2311	PyObject PyUnicode_Translate(PyObject str,
				2312	PyObject *mapping,
				2313	const char *errors)
				2314	{
				2315	PyObject *result;
				2316
				2317	str = PyUnicode_FromObject(str);
				2318	if (str == NULL)
				2319	goto onError;
				2320	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2321	PyUnicode_GET_SIZE(str),
				2322	mapping,
				2323	errors);
				2324	Py_DECREF(str);
				2325	return result;
				2326
				2327	onError:
				2328	Py_XDECREF(str);
				2329	return NULL;
				2330	}
				2331
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2332	/* --- Decimal Encoder ---------------------------------------------------- */
				2333
				2334	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2335	int length,
				2336	char *output,
				2337	const char *errors)
				2338	{
				2339	Py_UNICODE p, end;
				2340
				2341	if (output == NULL) {
				2342	PyErr_BadArgument();
				2343	return -1;
				2344	}
				2345
				2346	p = s;
				2347	end = s + length;
				2348	while (p < end) {
				2349	register Py_UNICODE ch = *p++;
				2350	int decimal;
				2351
				2352	if (Py_UNICODE_ISSPACE(ch)) {
				2353	*output++ = ' ';
				2354	continue;
				2355	}
				2356	decimal = Py_UNICODE_TODECIMAL(ch);
				2357	if (decimal >= 0) {
				2358	*output++ = '0' + decimal;
				2359	continue;
				2360	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2361	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2362	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2363	continue;
				2364	}
				2365	/* All other characters are considered invalid */
				2366	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2367	PyErr_SetString(PyExc_ValueError,
				2368	"invalid decimal Unicode string");
				2369	goto onError;
				2370	}
				2371	else if (strcmp(errors, "ignore") == 0)
				2372	continue;
				2373	else if (strcmp(errors, "replace") == 0) {
				2374	*output++ = '?';
				2375	continue;
				2376	}
				2377	}
				2378	/* 0-terminate the output string */
				2379	*output++ = '\0';
				2380	return 0;
				2381
				2382	onError:
				2383	return -1;
				2384	}
				2385
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2386	/* --- Helpers ------------------------------------------------------------ */
				2387
				2388	static
				2389	int count(PyUnicodeObject *self,
				2390	int start,
				2391	int end,
				2392	PyUnicodeObject *substring)
				2393	{
				2394	int count = 0;
				2395
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2396	if (start < 0)
				2397	start += self->length;
				2398	if (start < 0)
				2399	start = 0;
				2400	if (end > self->length)
				2401	end = self->length;
				2402	if (end < 0)
				2403	end += self->length;
				2404	if (end < 0)
				2405	end = 0;
				2406
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2407	if (substring->length == 0)
				2408	return (end - start + 1);
				2409
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2410	end -= substring->length;
				2411
				2412	while (start <= end)
				2413	if (Py_UNICODE_MATCH(self, start, substring)) {
				2414	count++;
				2415	start += substring->length;
				2416	} else
				2417	start++;
				2418
				2419	return count;
				2420	}
				2421
				2422	int PyUnicode_Count(PyObject *str,
				2423	PyObject *substr,
				2424	int start,
				2425	int end)
				2426	{
				2427	int result;
				2428
				2429	str = PyUnicode_FromObject(str);
				2430	if (str == NULL)
				2431	return -1;
				2432	substr = PyUnicode_FromObject(substr);
				2433	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2434	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2435	return -1;
				2436	}
				2437
				2438	result = count((PyUnicodeObject *)str,
				2439	start, end,
				2440	(PyUnicodeObject *)substr);
				2441
				2442	Py_DECREF(str);
				2443	Py_DECREF(substr);
				2444	return result;
				2445	}
				2446
				2447	static
				2448	int findstring(PyUnicodeObject *self,
				2449	PyUnicodeObject *substring,
				2450	int start,
				2451	int end,
				2452	int direction)
				2453	{
				2454	if (start < 0)
				2455	start += self->length;
				2456	if (start < 0)
				2457	start = 0;
				2458
				2459	if (substring->length == 0)
				2460	return start;
				2461
				2462	if (end > self->length)
				2463	end = self->length;
				2464	if (end < 0)
				2465	end += self->length;
				2466	if (end < 0)
				2467	end = 0;
				2468
				2469	end -= substring->length;
				2470
				2471	if (direction < 0) {
				2472	for (; end >= start; end--)
				2473	if (Py_UNICODE_MATCH(self, end, substring))
				2474	return end;
				2475	} else {
				2476	for (; start <= end; start++)
				2477	if (Py_UNICODE_MATCH(self, start, substring))
				2478	return start;
				2479	}
				2480
				2481	return -1;
				2482	}
				2483
				2484	int PyUnicode_Find(PyObject *str,
				2485	PyObject *substr,
				2486	int start,
				2487	int end,
				2488	int direction)
				2489	{
				2490	int result;
				2491
				2492	str = PyUnicode_FromObject(str);
				2493	if (str == NULL)
				2494	return -1;
				2495	substr = PyUnicode_FromObject(substr);
				2496	if (substr == NULL) {
				2497	Py_DECREF(substr);
				2498	return -1;
				2499	}
				2500
				2501	result = findstring((PyUnicodeObject *)str,
				2502	(PyUnicodeObject *)substr,
				2503	start, end, direction);
				2504	Py_DECREF(str);
				2505	Py_DECREF(substr);
				2506	return result;
				2507	}
				2508
				2509	static
				2510	int tailmatch(PyUnicodeObject *self,
				2511	PyUnicodeObject *substring,
				2512	int start,
				2513	int end,
				2514	int direction)
				2515	{
				2516	if (start < 0)
				2517	start += self->length;
				2518	if (start < 0)
				2519	start = 0;
				2520
				2521	if (substring->length == 0)
				2522	return 1;
				2523
				2524	if (end > self->length)
				2525	end = self->length;
				2526	if (end < 0)
				2527	end += self->length;
				2528	if (end < 0)
				2529	end = 0;
				2530
				2531	end -= substring->length;
				2532	if (end < start)
				2533	return 0;
				2534
				2535	if (direction > 0) {
				2536	if (Py_UNICODE_MATCH(self, end, substring))
				2537	return 1;
				2538	} else {
				2539	if (Py_UNICODE_MATCH(self, start, substring))
				2540	return 1;
				2541	}
				2542
				2543	return 0;
				2544	}
				2545
				2546	int PyUnicode_Tailmatch(PyObject *str,
				2547	PyObject *substr,
				2548	int start,
				2549	int end,
				2550	int direction)
				2551	{
				2552	int result;
				2553
				2554	str = PyUnicode_FromObject(str);
				2555	if (str == NULL)
				2556	return -1;
				2557	substr = PyUnicode_FromObject(substr);
				2558	if (substr == NULL) {
				2559	Py_DECREF(substr);
				2560	return -1;
				2561	}
				2562
				2563	result = tailmatch((PyUnicodeObject *)str,
				2564	(PyUnicodeObject *)substr,
				2565	start, end, direction);
				2566	Py_DECREF(str);
				2567	Py_DECREF(substr);
				2568	return result;
				2569	}
				2570
				2571	static
				2572	const Py_UNICODE findchar(const Py_UNICODE s,
				2573	int size,
				2574	Py_UNICODE ch)
				2575	{
				2576	/* like wcschr, but doesn't stop at NULL characters */
				2577
				2578	while (size-- > 0) {
				2579	if (*s == ch)
				2580	return s;
				2581	s++;
				2582	}
				2583
				2584	return NULL;
				2585	}
				2586
				2587	/* Apply fixfct filter to the Unicode object self and return a
				2588	reference to the modified object */
				2589
				2590	static
				2591	PyObject fixup(PyUnicodeObject self,
				2592	int (fixfct)(PyUnicodeObject s))
				2593	{
				2594
				2595	PyUnicodeObject *u;
				2596
				2597	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2598	self->length);
				2599	if (u == NULL)
				2600	return NULL;
				2601	if (!fixfct(u)) {
				2602	/* fixfct should return TRUE if it modified the buffer. If
				2603	FALSE, return a reference to the original buffer instead
				2604	(to save space, not time) */
				2605	Py_INCREF(self);
				2606	Py_DECREF(u);
				2607	return (PyObject*) self;
				2608	}
				2609	return (PyObject*) u;
				2610	}
				2611
				2612	static
				2613	int fixupper(PyUnicodeObject *self)
				2614	{
				2615	int len = self->length;
				2616	Py_UNICODE *s = self->str;
				2617	int status = 0;
				2618
				2619	while (len-- > 0) {
				2620	register Py_UNICODE ch;
				2621
				2622	ch = Py_UNICODE_TOUPPER(*s);
				2623	if (ch != *s) {
				2624	status = 1;
				2625	*s = ch;
				2626	}
				2627	s++;
				2628	}
				2629
				2630	return status;
				2631	}
				2632
				2633	static
				2634	int fixlower(PyUnicodeObject *self)
				2635	{
				2636	int len = self->length;
				2637	Py_UNICODE *s = self->str;
				2638	int status = 0;
				2639
				2640	while (len-- > 0) {
				2641	register Py_UNICODE ch;
				2642
				2643	ch = Py_UNICODE_TOLOWER(*s);
				2644	if (ch != *s) {
				2645	status = 1;
				2646	*s = ch;
				2647	}
				2648	s++;
				2649	}
				2650
				2651	return status;
				2652	}
				2653
				2654	static
				2655	int fixswapcase(PyUnicodeObject *self)
				2656	{
				2657	int len = self->length;
				2658	Py_UNICODE *s = self->str;
				2659	int status = 0;
				2660
				2661	while (len-- > 0) {
				2662	if (Py_UNICODE_ISUPPER(*s)) {
				2663	s = Py_UNICODE_TOLOWER(s);
				2664	status = 1;
				2665	} else if (Py_UNICODE_ISLOWER(*s)) {
				2666	s = Py_UNICODE_TOUPPER(s);
				2667	status = 1;
				2668	}
				2669	s++;
				2670	}
				2671
				2672	return status;
				2673	}
				2674
				2675	static
				2676	int fixcapitalize(PyUnicodeObject *self)
				2677	{
				2678	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2679	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2680	return 1;
				2681	}
				2682	return 0;
				2683	}
				2684
				2685	static
				2686	int fixtitle(PyUnicodeObject *self)
				2687	{
				2688	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2689	register Py_UNICODE *e;
				2690	int previous_is_cased;
				2691
				2692	/* Shortcut for single character strings */
				2693	if (PyUnicode_GET_SIZE(self) == 1) {
				2694	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2695	if (*p != ch) {
				2696	*p = ch;
				2697	return 1;
				2698	}
				2699	else
				2700	return 0;
				2701	}
				2702
				2703	e = p + PyUnicode_GET_SIZE(self);
				2704	previous_is_cased = 0;
				2705	for (; p < e; p++) {
				2706	register const Py_UNICODE ch = *p;
				2707
				2708	if (previous_is_cased)
				2709	*p = Py_UNICODE_TOLOWER(ch);
				2710	else
				2711	*p = Py_UNICODE_TOTITLE(ch);
				2712
				2713	if (Py_UNICODE_ISLOWER(ch) \|\|
				2714	Py_UNICODE_ISUPPER(ch) \|\|
				2715	Py_UNICODE_ISTITLE(ch))
				2716	previous_is_cased = 1;
				2717	else
				2718	previous_is_cased = 0;
				2719	}
				2720	return 1;
				2721	}
				2722
				2723	PyObject PyUnicode_Join(PyObject separator,
				2724	PyObject *seq)
				2725	{
				2726	Py_UNICODE *sep;
				2727	int seplen;
				2728	PyUnicodeObject *res = NULL;
				2729	int reslen = 0;
				2730	Py_UNICODE *p;
				2731	int seqlen = 0;
				2732	int sz = 100;
				2733	int i;
				2734
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2735	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2736	if (seqlen < 0 && PyErr_Occurred())
				2737	return NULL;
				2738
				2739	if (separator == NULL) {
				2740	Py_UNICODE blank = ' ';
				2741	sep = &blank;
				2742	seplen = 1;
				2743	}
				2744	else {
				2745	separator = PyUnicode_FromObject(separator);
				2746	if (separator == NULL)
				2747	return NULL;
				2748	sep = PyUnicode_AS_UNICODE(separator);
				2749	seplen = PyUnicode_GET_SIZE(separator);
				2750	}
				2751
				2752	res = _PyUnicode_New(sz);
				2753	if (res == NULL)
				2754	goto onError;
				2755	p = PyUnicode_AS_UNICODE(res);
				2756	reslen = 0;
				2757
				2758	for (i = 0; i < seqlen; i++) {
				2759	int itemlen;
				2760	PyObject *item;
				2761
				2762	item = PySequence_GetItem(seq, i);
				2763	if (item == NULL)
				2764	goto onError;
				2765	if (!PyUnicode_Check(item)) {
				2766	PyObject *v;
				2767	v = PyUnicode_FromObject(item);
				2768	Py_DECREF(item);
				2769	item = v;
				2770	if (item == NULL)
				2771	goto onError;
				2772	}
				2773	itemlen = PyUnicode_GET_SIZE(item);
				2774	while (reslen + itemlen + seplen >= sz) {
				2775	if (_PyUnicode_Resize(res, sz*2))
				2776	goto onError;
				2777	sz *= 2;
				2778	p = PyUnicode_AS_UNICODE(res) + reslen;
				2779	}
				2780	if (i > 0) {
				2781	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2782	p += seplen;
				2783	reslen += seplen;
				2784	}
				2785	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2786	p += itemlen;
				2787	reslen += itemlen;
				2788	Py_DECREF(item);
				2789	}
				2790	if (_PyUnicode_Resize(res, reslen))
				2791	goto onError;
				2792
				2793	Py_XDECREF(separator);
				2794	return (PyObject *)res;
				2795
				2796	onError:
				2797	Py_XDECREF(separator);
				2798	Py_DECREF(res);
				2799	return NULL;
				2800	}
				2801
				2802	static
				2803	PyUnicodeObject pad(PyUnicodeObject self,
				2804	int left,
				2805	int right,
				2806	Py_UNICODE fill)
				2807	{
				2808	PyUnicodeObject *u;
				2809
				2810	if (left < 0)
				2811	left = 0;
				2812	if (right < 0)
				2813	right = 0;
				2814
				2815	if (left == 0 && right == 0) {
				2816	Py_INCREF(self);
				2817	return self;
				2818	}
				2819
				2820	u = _PyUnicode_New(left + self->length + right);
				2821	if (u) {
				2822	if (left)
				2823	Py_UNICODE_FILL(u->str, fill, left);
				2824	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2825	if (right)
				2826	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2827	}
				2828
				2829	return u;
				2830	}
				2831
				2832	#define SPLIT_APPEND(data, left, right) \
				2833	str = PyUnicode_FromUnicode(data + left, right - left); \
				2834	if (!str) \
				2835	goto onError; \
				2836	if (PyList_Append(list, str)) { \
				2837	Py_DECREF(str); \
				2838	goto onError; \
				2839	} \
				2840	else \
				2841	Py_DECREF(str);
				2842
				2843	static
				2844	PyObject split_whitespace(PyUnicodeObject self,
				2845	PyObject *list,
				2846	int maxcount)
				2847	{
				2848	register int i;
				2849	register int j;
				2850	int len = self->length;
				2851	PyObject *str;
				2852
				2853	for (i = j = 0; i < len; ) {
				2854	/* find a token */
				2855	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2856	i++;
				2857	j = i;
				2858	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2859	i++;
				2860	if (j < i) {
				2861	if (maxcount-- <= 0)
				2862	break;
				2863	SPLIT_APPEND(self->str, j, i);
				2864	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2865	i++;
				2866	j = i;
				2867	}
				2868	}
				2869	if (j < len) {
				2870	SPLIT_APPEND(self->str, j, len);
				2871	}
				2872	return list;
				2873
				2874	onError:
				2875	Py_DECREF(list);
				2876	return NULL;
				2877	}
				2878
				2879	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2880	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2881	{
				2882	register int i;
				2883	register int j;
				2884	int len;
				2885	PyObject *list;
				2886	PyObject *str;
				2887	Py_UNICODE *data;
				2888
				2889	string = PyUnicode_FromObject(string);
				2890	if (string == NULL)
				2891	return NULL;
				2892	data = PyUnicode_AS_UNICODE(string);
				2893	len = PyUnicode_GET_SIZE(string);
				2894
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2895	list = PyList_New(0);
				2896	if (!list)
				2897	goto onError;
				2898
				2899	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2900	int eol;
				2901
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2902	/* Find a line and append it */
				2903	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2904	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2905
				2906	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2907	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2908	if (i < len) {
				2909	if (data[i] == '\r' && i + 1 < len &&
				2910	data[i+1] == '\n')
				2911	i += 2;
				2912	else
				2913	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2914	if (keepends)
				2915	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2916	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2917	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2918	j = i;
				2919	}
				2920	if (j < len) {
				2921	SPLIT_APPEND(data, j, len);
				2922	}
				2923
				2924	Py_DECREF(string);
				2925	return list;
				2926
				2927	onError:
				2928	Py_DECREF(list);
				2929	Py_DECREF(string);
				2930	return NULL;
				2931	}
				2932
				2933	static
				2934	PyObject split_char(PyUnicodeObject self,
				2935	PyObject *list,
				2936	Py_UNICODE ch,
				2937	int maxcount)
				2938	{
				2939	register int i;
				2940	register int j;
				2941	int len = self->length;
				2942	PyObject *str;
				2943
				2944	for (i = j = 0; i < len; ) {
				2945	if (self->str[i] == ch) {
				2946	if (maxcount-- <= 0)
				2947	break;
				2948	SPLIT_APPEND(self->str, j, i);
				2949	i = j = i + 1;
				2950	} else
				2951	i++;
				2952	}
				2953	if (j <= len) {
				2954	SPLIT_APPEND(self->str, j, len);
				2955	}
				2956	return list;
				2957
				2958	onError:
				2959	Py_DECREF(list);
				2960	return NULL;
				2961	}
				2962
				2963	static
				2964	PyObject split_substring(PyUnicodeObject self,
				2965	PyObject *list,
				2966	PyUnicodeObject *substring,
				2967	int maxcount)
				2968	{
				2969	register int i;
				2970	register int j;
				2971	int len = self->length;
				2972	int sublen = substring->length;
				2973	PyObject *str;
				2974
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2975	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2976	if (Py_UNICODE_MATCH(self, i, substring)) {
				2977	if (maxcount-- <= 0)
				2978	break;
				2979	SPLIT_APPEND(self->str, j, i);
				2980	i = j = i + sublen;
				2981	} else
				2982	i++;
				2983	}
				2984	if (j <= len) {
				2985	SPLIT_APPEND(self->str, j, len);
				2986	}
				2987	return list;
				2988
				2989	onError:
				2990	Py_DECREF(list);
				2991	return NULL;
				2992	}
				2993
				2994	#undef SPLIT_APPEND
				2995
				2996	static
				2997	PyObject split(PyUnicodeObject self,
				2998	PyUnicodeObject *substring,
				2999	int maxcount)
				3000	{
				3001	PyObject *list;
				3002
				3003	if (maxcount < 0)
				3004	maxcount = INT_MAX;
				3005
				3006	list = PyList_New(0);
				3007	if (!list)
				3008	return NULL;
				3009
				3010	if (substring == NULL)
				3011	return split_whitespace(self,list,maxcount);
				3012
				3013	else if (substring->length == 1)
				3014	return split_char(self,list,substring->str[0],maxcount);
				3015
				3016	else if (substring->length == 0) {
				3017	Py_DECREF(list);
				3018	PyErr_SetString(PyExc_ValueError, "empty separator");
				3019	return NULL;
				3020	}
				3021	else
				3022	return split_substring(self,list,substring,maxcount);
				3023	}
				3024
				3025	static
				3026	PyObject strip(PyUnicodeObject self,
				3027	int left,
				3028	int right)
				3029	{
				3030	Py_UNICODE *p = self->str;
				3031	int start = 0;
				3032	int end = self->length;
				3033
				3034	if (left)
				3035	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3036	start++;
				3037
				3038	if (right)
				3039	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3040	end--;
				3041
				3042	if (start == 0 && end == self->length) {
				3043	/* couldn't strip anything off, return original string */
				3044	Py_INCREF(self);
				3045	return (PyObject*) self;
				3046	}
				3047
				3048	return (PyObject*) PyUnicode_FromUnicode(
				3049	self->str + start,
				3050	end - start
				3051	);
				3052	}
				3053
				3054	static
				3055	PyObject replace(PyUnicodeObject self,
				3056	PyUnicodeObject *str1,
				3057	PyUnicodeObject *str2,
				3058	int maxcount)
				3059	{
				3060	PyUnicodeObject *u;
				3061
				3062	if (maxcount < 0)
				3063	maxcount = INT_MAX;
				3064
				3065	if (str1->length == 1 && str2->length == 1) {
				3066	int i;
				3067
				3068	/* replace characters */
				3069	if (!findchar(self->str, self->length, str1->str[0])) {
				3070	/* nothing to replace, return original string */
				3071	Py_INCREF(self);
				3072	u = self;
				3073	} else {
				3074	Py_UNICODE u1 = str1->str[0];
				3075	Py_UNICODE u2 = str2->str[0];
				3076
				3077	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3078	self->str,
				3079	self->length
				3080	);
				3081	if (u)
				3082	for (i = 0; i < u->length; i++)
				3083	if (u->str[i] == u1) {
				3084	if (--maxcount < 0)
				3085	break;
				3086	u->str[i] = u2;
				3087	}
				3088	}
				3089
				3090	} else {
				3091	int n, i;
				3092	Py_UNICODE *p;
				3093
				3094	/* replace strings */
				3095	n = count(self, 0, self->length, str1);
				3096	if (n > maxcount)
				3097	n = maxcount;
				3098	if (n == 0) {
				3099	/* nothing to replace, return original string */
				3100	Py_INCREF(self);
				3101	u = self;
				3102	} else {
				3103	u = _PyUnicode_New(
				3104	self->length + n * (str2->length - str1->length));
				3105	if (u) {
				3106	i = 0;
				3107	p = u->str;
				3108	while (i <= self->length - str1->length)
				3109	if (Py_UNICODE_MATCH(self, i, str1)) {
				3110	/* replace string segment */
				3111	Py_UNICODE_COPY(p, str2->str, str2->length);
				3112	p += str2->length;
				3113	i += str1->length;
				3114	if (--n <= 0) {
				3115	/* copy remaining part */
				3116	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3117	break;
				3118	}
				3119	} else
				3120	*p++ = self->str[i++];
				3121	}
				3122	}
				3123	}
				3124
				3125	return (PyObject *) u;
				3126	}
				3127
				3128	/* --- Unicode Object Methods --------------------------------------------- */
				3129
				3130	static char title__doc__[] =
				3131	"S.title() -> unicode\n\
				3132	\n\
				3133	Return a titlecased version of S, i.e. words start with title case\n\
				3134	characters, all remaining cased characters have lower case.";
				3135
				3136	static PyObject*
				3137	unicode_title(PyUnicodeObject self, PyObject args)
				3138	{
				3139	if (!PyArg_NoArgs(args))
				3140	return NULL;
				3141	return fixup(self, fixtitle);
				3142	}
				3143
				3144	static char capitalize__doc__[] =
				3145	"S.capitalize() -> unicode\n\
				3146	\n\
				3147	Return a capitalized version of S, i.e. make the first character\n\
				3148	have upper case.";
				3149
				3150	static PyObject*
				3151	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3152	{
				3153	if (!PyArg_NoArgs(args))
				3154	return NULL;
				3155	return fixup(self, fixcapitalize);
				3156	}
				3157
				3158	#if 0
				3159	static char capwords__doc__[] =
				3160	"S.capwords() -> unicode\n\
				3161	\n\
				3162	Apply .capitalize() to all words in S and return the result with\n\
				3163	normalized whitespace (all whitespace strings are replaced by ' ').";
				3164
				3165	static PyObject*
				3166	unicode_capwords(PyUnicodeObject self, PyObject args)
				3167	{
				3168	PyObject *list;
				3169	PyObject *item;
				3170	int i;
				3171
				3172	if (!PyArg_NoArgs(args))
				3173	return NULL;
				3174
				3175	/* Split into words */
				3176	list = split(self, NULL, -1);
				3177	if (!list)
				3178	return NULL;
				3179
				3180	/* Capitalize each word */
				3181	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3182	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3183	fixcapitalize);
				3184	if (item == NULL)
				3185	goto onError;
				3186	Py_DECREF(PyList_GET_ITEM(list, i));
				3187	PyList_SET_ITEM(list, i, item);
				3188	}
				3189
				3190	/* Join the words to form a new string */
				3191	item = PyUnicode_Join(NULL, list);
				3192
				3193	onError:
				3194	Py_DECREF(list);
				3195	return (PyObject *)item;
				3196	}
				3197	#endif
				3198
				3199	static char center__doc__[] =
				3200	"S.center(width) -> unicode\n\
				3201	\n\
				3202	Return S centered in a Unicode string of length width. Padding is done\n\
				3203	using spaces.";
				3204
				3205	static PyObject *
				3206	unicode_center(PyUnicodeObject self, PyObject args)
				3207	{
				3208	int marg, left;
				3209	int width;
				3210
				3211	if (!PyArg_ParseTuple(args, "i:center", &width))
				3212	return NULL;
				3213
				3214	if (self->length >= width) {
				3215	Py_INCREF(self);
				3216	return (PyObject*) self;
				3217	}
				3218
				3219	marg = width - self->length;
				3220	left = marg / 2 + (marg & width & 1);
				3221
				3222	return (PyObject*) pad(self, left, marg - left, ' ');
				3223	}
				3224
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3225	#if 0
				3226
				3227	/* This code should go into some future Unicode collation support
				3228	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3229	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3230
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3231	/* speedy UTF-16 code point order comparison */
				3232	/* gleaned from: */
				3233	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3234
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3235	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3236	{
				3237	0, 0, 0, 0, 0, 0, 0, 0,
				3238	0, 0, 0, 0, 0, 0, 0, 0,
				3239	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3240	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3241	};
				3242
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3243	static int
				3244	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3245	{
				3246	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3247
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3248	Py_UNICODE *s1 = str1->str;
				3249	Py_UNICODE *s2 = str2->str;
				3250
				3251	len1 = str1->length;
				3252	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3253
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3254	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3255	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3256	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3257
				3258	c1 = *s1++;
				3259	c2 = *s2++;
				3260	if (c1 > (1<<11) * 26)
				3261	c1 += utf16Fixup[c1>>11];
				3262	if (c2 > (1<<11) * 26)
				3263	c2 += utf16Fixup[c2>>11];
				3264
				3265	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3266	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3267	if (diff)
				3268	return (diff < 0) ? -1 : (diff != 0);
				3269	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3270	}
				3271
				3272	return (len1 < len2) ? -1 : (len1 != len2);
				3273	}
				3274
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3275	#else
				3276
				3277	static int
				3278	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3279	{
				3280	register int len1, len2;
				3281
				3282	Py_UNICODE *s1 = str1->str;
				3283	Py_UNICODE *s2 = str2->str;
				3284
				3285	len1 = str1->length;
				3286	len2 = str2->length;
				3287
				3288	while (len1 > 0 && len2 > 0) {
				3289	register long diff;
				3290
				3291	diff = (long)s1++ - (long)s2++;
				3292	if (diff)
				3293	return (diff < 0) ? -1 : (diff != 0);
				3294	len1--; len2--;
				3295	}
				3296
				3297	return (len1 < len2) ? -1 : (len1 != len2);
				3298	}
				3299
				3300	#endif
				3301
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3302	int PyUnicode_Compare(PyObject *left,
				3303	PyObject *right)
				3304	{
				3305	PyUnicodeObject u = NULL, v = NULL;
				3306	int result;
				3307
				3308	/* Coerce the two arguments */
				3309	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3310	if (u == NULL)
				3311	goto onError;
				3312	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3313	if (v == NULL)
				3314	goto onError;
				3315
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3316	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3317	if (v == u) {
				3318	Py_DECREF(u);
				3319	Py_DECREF(v);
				3320	return 0;
				3321	}
				3322
				3323	result = unicode_compare(u, v);
				3324
				3325	Py_DECREF(u);
				3326	Py_DECREF(v);
				3327	return result;
				3328
				3329	onError:
				3330	Py_XDECREF(u);
				3331	Py_XDECREF(v);
				3332	return -1;
				3333	}
				3334
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3335	int PyUnicode_Contains(PyObject *container,
				3336	PyObject *element)
				3337	{
				3338	PyUnicodeObject u = NULL, v = NULL;
				3339	int result;
				3340	register const Py_UNICODE p, e;
				3341	register Py_UNICODE ch;
				3342
				3343	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3344	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3345	if (v == NULL) {
				3346	PyErr_SetString(PyExc_TypeError,
				3347	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3348	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3349	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3350	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3351	if (u == NULL) {
				3352	Py_DECREF(v);
				3353	goto onError;
				3354	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3355
				3356	/* Check v in u */
				3357	if (PyUnicode_GET_SIZE(v) != 1) {
				3358	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3359	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3360	goto onError;
				3361	}
				3362	ch = *PyUnicode_AS_UNICODE(v);
				3363	p = PyUnicode_AS_UNICODE(u);
				3364	e = p + PyUnicode_GET_SIZE(u);
				3365	result = 0;
				3366	while (p < e) {
				3367	if (*p++ == ch) {
				3368	result = 1;
				3369	break;
				3370	}
				3371	}
				3372
				3373	Py_DECREF(u);
				3374	Py_DECREF(v);
				3375	return result;
				3376
				3377	onError:
				3378	Py_XDECREF(u);
				3379	Py_XDECREF(v);
				3380	return -1;
				3381	}
				3382
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3383	/* Concat to string or Unicode object giving a new Unicode object. */
				3384
				3385	PyObject PyUnicode_Concat(PyObject left,
				3386	PyObject *right)
				3387	{
				3388	PyUnicodeObject u = NULL, v = NULL, *w;
				3389
				3390	/* Coerce the two arguments */
				3391	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3392	if (u == NULL)
				3393	goto onError;
				3394	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3395	if (v == NULL)
				3396	goto onError;
				3397
				3398	/* Shortcuts */
				3399	if (v == unicode_empty) {
				3400	Py_DECREF(v);
				3401	return (PyObject *)u;
				3402	}
				3403	if (u == unicode_empty) {
				3404	Py_DECREF(u);
				3405	return (PyObject *)v;
				3406	}
				3407
				3408	/* Concat the two Unicode strings */
				3409	w = _PyUnicode_New(u->length + v->length);
				3410	if (w == NULL)
				3411	goto onError;
				3412	Py_UNICODE_COPY(w->str, u->str, u->length);
				3413	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3414
				3415	Py_DECREF(u);
				3416	Py_DECREF(v);
				3417	return (PyObject *)w;
				3418
				3419	onError:
				3420	Py_XDECREF(u);
				3421	Py_XDECREF(v);
				3422	return NULL;
				3423	}
				3424
				3425	static char count__doc__[] =
				3426	"S.count(sub[, start[, end]]) -> int\n\
				3427	\n\
				3428	Return the number of occurrences of substring sub in Unicode string\n\
				3429	S[start:end]. Optional arguments start and end are\n\
				3430	interpreted as in slice notation.";
				3431
				3432	static PyObject *
				3433	unicode_count(PyUnicodeObject self, PyObject args)
				3434	{
				3435	PyUnicodeObject *substring;
				3436	int start = 0;
				3437	int end = INT_MAX;
				3438	PyObject *result;
				3439
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3440	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3441	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3442	return NULL;
				3443
				3444	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3445	(PyObject *)substring);
				3446	if (substring == NULL)
				3447	return NULL;
				3448
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3449	if (start < 0)
				3450	start += self->length;
				3451	if (start < 0)
				3452	start = 0;
				3453	if (end > self->length)
				3454	end = self->length;
				3455	if (end < 0)
				3456	end += self->length;
				3457	if (end < 0)
				3458	end = 0;
				3459
				3460	result = PyInt_FromLong((long) count(self, start, end, substring));
				3461
				3462	Py_DECREF(substring);
				3463	return result;
				3464	}
				3465
				3466	static char encode__doc__[] =
				3467	"S.encode([encoding[,errors]]) -> string\n\
				3468	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3469	Return an encoded string version of S. Default encoding is the current\n\
				3470	default string encoding. errors may be given to set a different error\n\
				3471	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3472	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3473
				3474	static PyObject *
				3475	unicode_encode(PyUnicodeObject self, PyObject args)
				3476	{
				3477	char *encoding = NULL;
				3478	char *errors = NULL;
				3479	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3480	return NULL;
				3481	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3482	}
				3483
				3484	static char expandtabs__doc__[] =
				3485	"S.expandtabs([tabsize]) -> unicode\n\
				3486	\n\
				3487	Return a copy of S where all tab characters are expanded using spaces.\n\
				3488	If tabsize is not given, a tab size of 8 characters is assumed.";
				3489
				3490	static PyObject*
				3491	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3492	{
				3493	Py_UNICODE *e;
				3494	Py_UNICODE *p;
				3495	Py_UNICODE *q;
				3496	int i, j;
				3497	PyUnicodeObject *u;
				3498	int tabsize = 8;
				3499
				3500	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3501	return NULL;
				3502
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3503	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3504	i = j = 0;
				3505	e = self->str + self->length;
				3506	for (p = self->str; p < e; p++)
				3507	if (*p == '\t') {
				3508	if (tabsize > 0)
				3509	j += tabsize - (j % tabsize);
				3510	}
				3511	else {
				3512	j++;
				3513	if (p == '\n' \|\| p == '\r') {
				3514	i += j;
				3515	j = 0;
				3516	}
				3517	}
				3518
				3519	/* Second pass: create output string and fill it */
				3520	u = _PyUnicode_New(i + j);
				3521	if (!u)
				3522	return NULL;
				3523
				3524	j = 0;
				3525	q = u->str;
				3526
				3527	for (p = self->str; p < e; p++)
				3528	if (*p == '\t') {
				3529	if (tabsize > 0) {
				3530	i = tabsize - (j % tabsize);
				3531	j += i;
				3532	while (i--)
				3533	*q++ = ' ';
				3534	}
				3535	}
				3536	else {
				3537	j++;
				3538	q++ = p;
				3539	if (p == '\n' \|\| p == '\r')
				3540	j = 0;
				3541	}
				3542
				3543	return (PyObject*) u;
				3544	}
				3545
				3546	static char find__doc__[] =
				3547	"S.find(sub [,start [,end]]) -> int\n\
				3548	\n\
				3549	Return the lowest index in S where substring sub is found,\n\
				3550	such that sub is contained within s[start,end]. Optional\n\
				3551	arguments start and end are interpreted as in slice notation.\n\
				3552	\n\
				3553	Return -1 on failure.";
				3554
				3555	static PyObject *
				3556	unicode_find(PyUnicodeObject self, PyObject args)
				3557	{
				3558	PyUnicodeObject *substring;
				3559	int start = 0;
				3560	int end = INT_MAX;
				3561	PyObject *result;
				3562
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3563	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3564	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3565	return NULL;
				3566	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3567	(PyObject *)substring);
				3568	if (substring == NULL)
				3569	return NULL;
				3570
				3571	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3572
				3573	Py_DECREF(substring);
				3574	return result;
				3575	}
				3576
				3577	static PyObject *
				3578	unicode_getitem(PyUnicodeObject *self, int index)
				3579	{
				3580	if (index < 0 \|\| index >= self->length) {
				3581	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3582	return NULL;
				3583	}
				3584
				3585	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3586	}
				3587
				3588	static long
				3589	unicode_hash(PyUnicodeObject *self)
				3590	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3591	/* Since Unicode objects compare equal to their ASCII string
				3592	counterparts, they should use the individual character values
				3593	as basis for their hash value. This is needed to assure that
				3594	strings and Unicode objects behave in the same way as
				3595	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3596
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3597	register int len;
				3598	register Py_UNICODE *p;
				3599	register long x;
				3600
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3601	if (self->hash != -1)
				3602	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3603	len = PyUnicode_GET_SIZE(self);
				3604	p = PyUnicode_AS_UNICODE(self);
				3605	x = *p << 7;
				3606	while (--len >= 0)
				3607	x = (1000003x) ^ p++;
				3608	x ^= PyUnicode_GET_SIZE(self);
				3609	if (x == -1)
				3610	x = -2;
				3611	self->hash = x;
				3612	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3613	}
				3614
				3615	static char index__doc__[] =
				3616	"S.index(sub [,start [,end]]) -> int\n\
				3617	\n\
				3618	Like S.find() but raise ValueError when the substring is not found.";
				3619
				3620	static PyObject *
				3621	unicode_index(PyUnicodeObject self, PyObject args)
				3622	{
				3623	int result;
				3624	PyUnicodeObject *substring;
				3625	int start = 0;
				3626	int end = INT_MAX;
				3627
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3628	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3629	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3630	return NULL;
				3631
				3632	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3633	(PyObject *)substring);
				3634	if (substring == NULL)
				3635	return NULL;
				3636
				3637	result = findstring(self, substring, start, end, 1);
				3638
				3639	Py_DECREF(substring);
				3640	if (result < 0) {
				3641	PyErr_SetString(PyExc_ValueError, "substring not found");
				3642	return NULL;
				3643	}
				3644	return PyInt_FromLong(result);
				3645	}
				3646
				3647	static char islower__doc__[] =
				3648	"S.islower() -> int\n\
				3649	\n\
				3650	Return 1 if all cased characters in S are lowercase and there is\n\
				3651	at least one cased character in S, 0 otherwise.";
				3652
				3653	static PyObject*
				3654	unicode_islower(PyUnicodeObject self, PyObject args)
				3655	{
				3656	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3657	register const Py_UNICODE *e;
				3658	int cased;
				3659
				3660	if (!PyArg_NoArgs(args))
				3661	return NULL;
				3662
				3663	/* Shortcut for single character strings */
				3664	if (PyUnicode_GET_SIZE(self) == 1)
				3665	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3666
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3667	/* Special case for empty strings */
				3668	if (PyString_GET_SIZE(self) == 0)
				3669	return PyInt_FromLong(0);
				3670
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3671	e = p + PyUnicode_GET_SIZE(self);
				3672	cased = 0;
				3673	for (; p < e; p++) {
				3674	register const Py_UNICODE ch = *p;
				3675
				3676	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3677	return PyInt_FromLong(0);
				3678	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3679	cased = 1;
				3680	}
				3681	return PyInt_FromLong(cased);
				3682	}
				3683
				3684	static char isupper__doc__[] =
				3685	"S.isupper() -> int\n\
				3686	\n\
				3687	Return 1 if all cased characters in S are uppercase and there is\n\
				3688	at least one cased character in S, 0 otherwise.";
				3689
				3690	static PyObject*
				3691	unicode_isupper(PyUnicodeObject self, PyObject args)
				3692	{
				3693	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3694	register const Py_UNICODE *e;
				3695	int cased;
				3696
				3697	if (!PyArg_NoArgs(args))
				3698	return NULL;
				3699
				3700	/* Shortcut for single character strings */
				3701	if (PyUnicode_GET_SIZE(self) == 1)
				3702	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3703
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3704	/* Special case for empty strings */
				3705	if (PyString_GET_SIZE(self) == 0)
				3706	return PyInt_FromLong(0);
				3707
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3708	e = p + PyUnicode_GET_SIZE(self);
				3709	cased = 0;
				3710	for (; p < e; p++) {
				3711	register const Py_UNICODE ch = *p;
				3712
				3713	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3714	return PyInt_FromLong(0);
				3715	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3716	cased = 1;
				3717	}
				3718	return PyInt_FromLong(cased);
				3719	}
				3720
				3721	static char istitle__doc__[] =
				3722	"S.istitle() -> int\n\
				3723	\n\
				3724	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3725	may only follow uncased characters and lowercase characters only cased\n\
				3726	ones. Return 0 otherwise.";
				3727
				3728	static PyObject*
				3729	unicode_istitle(PyUnicodeObject self, PyObject args)
				3730	{
				3731	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3732	register const Py_UNICODE *e;
				3733	int cased, previous_is_cased;
				3734
				3735	if (!PyArg_NoArgs(args))
				3736	return NULL;
				3737
				3738	/* Shortcut for single character strings */
				3739	if (PyUnicode_GET_SIZE(self) == 1)
				3740	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3741	(Py_UNICODE_ISUPPER(*p) != 0));
				3742
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3743	/* Special case for empty strings */
				3744	if (PyString_GET_SIZE(self) == 0)
				3745	return PyInt_FromLong(0);
				3746
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3747	e = p + PyUnicode_GET_SIZE(self);
				3748	cased = 0;
				3749	previous_is_cased = 0;
				3750	for (; p < e; p++) {
				3751	register const Py_UNICODE ch = *p;
				3752
				3753	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3754	if (previous_is_cased)
				3755	return PyInt_FromLong(0);
				3756	previous_is_cased = 1;
				3757	cased = 1;
				3758	}
				3759	else if (Py_UNICODE_ISLOWER(ch)) {
				3760	if (!previous_is_cased)
				3761	return PyInt_FromLong(0);
				3762	previous_is_cased = 1;
				3763	cased = 1;
				3764	}
				3765	else
				3766	previous_is_cased = 0;
				3767	}
				3768	return PyInt_FromLong(cased);
				3769	}
				3770
				3771	static char isspace__doc__[] =
				3772	"S.isspace() -> int\n\
				3773	\n\
				3774	Return 1 if there are only whitespace characters in S,\n\
				3775	0 otherwise.";
				3776
				3777	static PyObject*
				3778	unicode_isspace(PyUnicodeObject self, PyObject args)
				3779	{
				3780	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3781	register const Py_UNICODE *e;
				3782
				3783	if (!PyArg_NoArgs(args))
				3784	return NULL;
				3785
				3786	/* Shortcut for single character strings */
				3787	if (PyUnicode_GET_SIZE(self) == 1 &&
				3788	Py_UNICODE_ISSPACE(*p))
				3789	return PyInt_FromLong(1);
				3790
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3791	/* Special case for empty strings */
				3792	if (PyString_GET_SIZE(self) == 0)
				3793	return PyInt_FromLong(0);
				3794
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3795	e = p + PyUnicode_GET_SIZE(self);
				3796	for (; p < e; p++) {
				3797	if (!Py_UNICODE_ISSPACE(*p))
				3798	return PyInt_FromLong(0);
				3799	}
				3800	return PyInt_FromLong(1);
				3801	}
				3802
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3803	static char isalpha__doc__[] =
				3804	"S.isalpha() -> int\n\
				3805	\n\
				3806	Return 1 if all characters in S are alphabetic\n\
				3807	and there is at least one character in S, 0 otherwise.";
				3808
				3809	static PyObject*
				3810	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3811	{
				3812	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3813	register const Py_UNICODE *e;
				3814
				3815	if (!PyArg_NoArgs(args))
				3816	return NULL;
				3817
				3818	/* Shortcut for single character strings */
				3819	if (PyUnicode_GET_SIZE(self) == 1 &&
				3820	Py_UNICODE_ISALPHA(*p))
				3821	return PyInt_FromLong(1);
				3822
				3823	/* Special case for empty strings */
				3824	if (PyString_GET_SIZE(self) == 0)
				3825	return PyInt_FromLong(0);
				3826
				3827	e = p + PyUnicode_GET_SIZE(self);
				3828	for (; p < e; p++) {
				3829	if (!Py_UNICODE_ISALPHA(*p))
				3830	return PyInt_FromLong(0);
				3831	}
				3832	return PyInt_FromLong(1);
				3833	}
				3834
				3835	static char isalnum__doc__[] =
				3836	"S.isalnum() -> int\n\
				3837	\n\
				3838	Return 1 if all characters in S are alphanumeric\n\
				3839	and there is at least one character in S, 0 otherwise.";
				3840
				3841	static PyObject*
				3842	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3843	{
				3844	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3845	register const Py_UNICODE *e;
				3846
				3847	if (!PyArg_NoArgs(args))
				3848	return NULL;
				3849
				3850	/* Shortcut for single character strings */
				3851	if (PyUnicode_GET_SIZE(self) == 1 &&
				3852	Py_UNICODE_ISALNUM(*p))
				3853	return PyInt_FromLong(1);
				3854
				3855	/* Special case for empty strings */
				3856	if (PyString_GET_SIZE(self) == 0)
				3857	return PyInt_FromLong(0);
				3858
				3859	e = p + PyUnicode_GET_SIZE(self);
				3860	for (; p < e; p++) {
				3861	if (!Py_UNICODE_ISALNUM(*p))
				3862	return PyInt_FromLong(0);
				3863	}
				3864	return PyInt_FromLong(1);
				3865	}
				3866
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3867	static char isdecimal__doc__[] =
				3868	"S.isdecimal() -> int\n\
				3869	\n\
				3870	Return 1 if there are only decimal characters in S,\n\
				3871	0 otherwise.";
				3872
				3873	static PyObject*
				3874	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3875	{
				3876	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3877	register const Py_UNICODE *e;
				3878
				3879	if (!PyArg_NoArgs(args))
				3880	return NULL;
				3881
				3882	/* Shortcut for single character strings */
				3883	if (PyUnicode_GET_SIZE(self) == 1 &&
				3884	Py_UNICODE_ISDECIMAL(*p))
				3885	return PyInt_FromLong(1);
				3886
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3887	/* Special case for empty strings */
				3888	if (PyString_GET_SIZE(self) == 0)
				3889	return PyInt_FromLong(0);
				3890
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3891	e = p + PyUnicode_GET_SIZE(self);
				3892	for (; p < e; p++) {
				3893	if (!Py_UNICODE_ISDECIMAL(*p))
				3894	return PyInt_FromLong(0);
				3895	}
				3896	return PyInt_FromLong(1);
				3897	}
				3898
				3899	static char isdigit__doc__[] =
				3900	"S.isdigit() -> int\n\
				3901	\n\
				3902	Return 1 if there are only digit characters in S,\n\
				3903	0 otherwise.";
				3904
				3905	static PyObject*
				3906	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3907	{
				3908	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3909	register const Py_UNICODE *e;
				3910
				3911	if (!PyArg_NoArgs(args))
				3912	return NULL;
				3913
				3914	/* Shortcut for single character strings */
				3915	if (PyUnicode_GET_SIZE(self) == 1 &&
				3916	Py_UNICODE_ISDIGIT(*p))
				3917	return PyInt_FromLong(1);
				3918
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3919	/* Special case for empty strings */
				3920	if (PyString_GET_SIZE(self) == 0)
				3921	return PyInt_FromLong(0);
				3922
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3923	e = p + PyUnicode_GET_SIZE(self);
				3924	for (; p < e; p++) {
				3925	if (!Py_UNICODE_ISDIGIT(*p))
				3926	return PyInt_FromLong(0);
				3927	}
				3928	return PyInt_FromLong(1);
				3929	}
				3930
				3931	static char isnumeric__doc__[] =
				3932	"S.isnumeric() -> int\n\
				3933	\n\
				3934	Return 1 if there are only numeric characters in S,\n\
				3935	0 otherwise.";
				3936
				3937	static PyObject*
				3938	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3939	{
				3940	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3941	register const Py_UNICODE *e;
				3942
				3943	if (!PyArg_NoArgs(args))
				3944	return NULL;
				3945
				3946	/* Shortcut for single character strings */
				3947	if (PyUnicode_GET_SIZE(self) == 1 &&
				3948	Py_UNICODE_ISNUMERIC(*p))
				3949	return PyInt_FromLong(1);
				3950
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3951	/* Special case for empty strings */
				3952	if (PyString_GET_SIZE(self) == 0)
				3953	return PyInt_FromLong(0);
				3954
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3955	e = p + PyUnicode_GET_SIZE(self);
				3956	for (; p < e; p++) {
				3957	if (!Py_UNICODE_ISNUMERIC(*p))
				3958	return PyInt_FromLong(0);
				3959	}
				3960	return PyInt_FromLong(1);
				3961	}
				3962
				3963	static char join__doc__[] =
				3964	"S.join(sequence) -> unicode\n\
				3965	\n\
				3966	Return a string which is the concatenation of the strings in the\n\
				3967	sequence. The separator between elements is S.";
				3968
				3969	static PyObject*
				3970	unicode_join(PyUnicodeObject self, PyObject args)
				3971	{
				3972	PyObject *data;
				3973	if (!PyArg_ParseTuple(args, "O:join", &data))
				3974	return NULL;
				3975
				3976	return PyUnicode_Join((PyObject *)self, data);
				3977	}
				3978
				3979	static int
				3980	unicode_length(PyUnicodeObject *self)
				3981	{
				3982	return self->length;
				3983	}
				3984
				3985	static char ljust__doc__[] =
				3986	"S.ljust(width) -> unicode\n\
				3987	\n\
				3988	Return S left justified in a Unicode string of length width. Padding is\n\
				3989	done using spaces.";
				3990
				3991	static PyObject *
				3992	unicode_ljust(PyUnicodeObject self, PyObject args)
				3993	{
				3994	int width;
				3995	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3996	return NULL;
				3997
				3998	if (self->length >= width) {
				3999	Py_INCREF(self);
				4000	return (PyObject*) self;
				4001	}
				4002
				4003	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4004	}
				4005
				4006	static char lower__doc__[] =
				4007	"S.lower() -> unicode\n\
				4008	\n\
				4009	Return a copy of the string S converted to lowercase.";
				4010
				4011	static PyObject*
				4012	unicode_lower(PyUnicodeObject self, PyObject args)
				4013	{
				4014	if (!PyArg_NoArgs(args))
				4015	return NULL;
				4016	return fixup(self, fixlower);
				4017	}
				4018
				4019	static char lstrip__doc__[] =
				4020	"S.lstrip() -> unicode\n\
				4021	\n\
				4022	Return a copy of the string S with leading whitespace removed.";
				4023
				4024	static PyObject *
				4025	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4026	{
				4027	if (!PyArg_NoArgs(args))
				4028	return NULL;
				4029	return strip(self, 1, 0);
				4030	}
				4031
				4032	static PyObject*
				4033	unicode_repeat(PyUnicodeObject *str, int len)
				4034	{
				4035	PyUnicodeObject *u;
				4036	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4037	int nchars;
				4038	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4039
				4040	if (len < 0)
				4041	len = 0;
				4042
				4043	if (len == 1) {
				4044	/* no repeat, return original string */
				4045	Py_INCREF(str);
				4046	return (PyObject*) str;
				4047	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4048
				4049	/* ensure # of chars needed doesn't overflow int and # of bytes
				4050	* needed doesn't overflow size_t
				4051	*/
				4052	nchars = len * str->length;
				4053	if (len && nchars / len != str->length) {
				4054	PyErr_SetString(PyExc_OverflowError,
				4055	"repeated string is too long");
				4056	return NULL;
				4057	}
				4058	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4059	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4060	PyErr_SetString(PyExc_OverflowError,
				4061	"repeated string is too long");
				4062	return NULL;
				4063	}
				4064	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4065	if (!u)
				4066	return NULL;
				4067
				4068	p = u->str;
				4069
				4070	while (len-- > 0) {
				4071	Py_UNICODE_COPY(p, str->str, str->length);
				4072	p += str->length;
				4073	}
				4074
				4075	return (PyObject*) u;
				4076	}
				4077
				4078	PyObject PyUnicode_Replace(PyObject obj,
				4079	PyObject *subobj,
				4080	PyObject *replobj,
				4081	int maxcount)
				4082	{
				4083	PyObject *self;
				4084	PyObject *str1;
				4085	PyObject *str2;
				4086	PyObject *result;
				4087
				4088	self = PyUnicode_FromObject(obj);
				4089	if (self == NULL)
				4090	return NULL;
				4091	str1 = PyUnicode_FromObject(subobj);
				4092	if (str1 == NULL) {
				4093	Py_DECREF(self);
				4094	return NULL;
				4095	}
				4096	str2 = PyUnicode_FromObject(replobj);
				4097	if (str2 == NULL) {
				4098	Py_DECREF(self);
				4099	Py_DECREF(str1);
				4100	return NULL;
				4101	}
				4102	result = replace((PyUnicodeObject *)self,
				4103	(PyUnicodeObject *)str1,
				4104	(PyUnicodeObject *)str2,
				4105	maxcount);
				4106	Py_DECREF(self);
				4107	Py_DECREF(str1);
				4108	Py_DECREF(str2);
				4109	return result;
				4110	}
				4111
				4112	static char replace__doc__[] =
				4113	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4114	\n\
				4115	Return a copy of S with all occurrences of substring\n\
				4116	old replaced by new. If the optional argument maxsplit is\n\
				4117	given, only the first maxsplit occurrences are replaced.";
				4118
				4119	static PyObject*
				4120	unicode_replace(PyUnicodeObject self, PyObject args)
				4121	{
				4122	PyUnicodeObject *str1;
				4123	PyUnicodeObject *str2;
				4124	int maxcount = -1;
				4125	PyObject *result;
				4126
				4127	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4128	return NULL;
				4129	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4130	if (str1 == NULL)
				4131	return NULL;
				4132	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4133	if (str2 == NULL)
				4134	return NULL;
				4135
				4136	result = replace(self, str1, str2, maxcount);
				4137
				4138	Py_DECREF(str1);
				4139	Py_DECREF(str2);
				4140	return result;
				4141	}
				4142
				4143	static
				4144	PyObject unicode_repr(PyObject unicode)
				4145	{
				4146	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4147	PyUnicode_GET_SIZE(unicode),
				4148	1);
				4149	}
				4150
				4151	static char rfind__doc__[] =
				4152	"S.rfind(sub [,start [,end]]) -> int\n\
				4153	\n\
				4154	Return the highest index in S where substring sub is found,\n\
				4155	such that sub is contained within s[start,end]. Optional\n\
				4156	arguments start and end are interpreted as in slice notation.\n\
				4157	\n\
				4158	Return -1 on failure.";
				4159
				4160	static PyObject *
				4161	unicode_rfind(PyUnicodeObject self, PyObject args)
				4162	{
				4163	PyUnicodeObject *substring;
				4164	int start = 0;
				4165	int end = INT_MAX;
				4166	PyObject *result;
				4167
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4168	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4169	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4170	return NULL;
				4171	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4172	(PyObject *)substring);
				4173	if (substring == NULL)
				4174	return NULL;
				4175
				4176	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4177
				4178	Py_DECREF(substring);
				4179	return result;
				4180	}
				4181
				4182	static char rindex__doc__[] =
				4183	"S.rindex(sub [,start [,end]]) -> int\n\
				4184	\n\
				4185	Like S.rfind() but raise ValueError when the substring is not found.";
				4186
				4187	static PyObject *
				4188	unicode_rindex(PyUnicodeObject self, PyObject args)
				4189	{
				4190	int result;
				4191	PyUnicodeObject *substring;
				4192	int start = 0;
				4193	int end = INT_MAX;
				4194
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4195	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4196	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4197	return NULL;
				4198	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4199	(PyObject *)substring);
				4200	if (substring == NULL)
				4201	return NULL;
				4202
				4203	result = findstring(self, substring, start, end, -1);
				4204
				4205	Py_DECREF(substring);
				4206	if (result < 0) {
				4207	PyErr_SetString(PyExc_ValueError, "substring not found");
				4208	return NULL;
				4209	}
				4210	return PyInt_FromLong(result);
				4211	}
				4212
				4213	static char rjust__doc__[] =
				4214	"S.rjust(width) -> unicode\n\
				4215	\n\
				4216	Return S right justified in a Unicode string of length width. Padding is\n\
				4217	done using spaces.";
				4218
				4219	static PyObject *
				4220	unicode_rjust(PyUnicodeObject self, PyObject args)
				4221	{
				4222	int width;
				4223	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4224	return NULL;
				4225
				4226	if (self->length >= width) {
				4227	Py_INCREF(self);
				4228	return (PyObject*) self;
				4229	}
				4230
				4231	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4232	}
				4233
				4234	static char rstrip__doc__[] =
				4235	"S.rstrip() -> unicode\n\
				4236	\n\
				4237	Return a copy of the string S with trailing whitespace removed.";
				4238
				4239	static PyObject *
				4240	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4241	{
				4242	if (!PyArg_NoArgs(args))
				4243	return NULL;
				4244	return strip(self, 0, 1);
				4245	}
				4246
				4247	static PyObject*
				4248	unicode_slice(PyUnicodeObject *self, int start, int end)
				4249	{
				4250	/* standard clamping */
				4251	if (start < 0)
				4252	start = 0;
				4253	if (end < 0)
				4254	end = 0;
				4255	if (end > self->length)
				4256	end = self->length;
				4257	if (start == 0 && end == self->length) {
				4258	/* full slice, return original string */
				4259	Py_INCREF(self);
				4260	return (PyObject*) self;
				4261	}
				4262	if (start > end)
				4263	start = end;
				4264	/* copy slice */
				4265	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4266	end - start);
				4267	}
				4268
				4269	PyObject PyUnicode_Split(PyObject s,
				4270	PyObject *sep,
				4271	int maxsplit)
				4272	{
				4273	PyObject *result;
				4274
				4275	s = PyUnicode_FromObject(s);
				4276	if (s == NULL)
				4277	return NULL;
				4278	if (sep != NULL) {
				4279	sep = PyUnicode_FromObject(sep);
				4280	if (sep == NULL) {
				4281	Py_DECREF(s);
				4282	return NULL;
				4283	}
				4284	}
				4285
				4286	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4287
				4288	Py_DECREF(s);
				4289	Py_XDECREF(sep);
				4290	return result;
				4291	}
				4292
				4293	static char split__doc__[] =
				4294	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4295	\n\
				4296	Return a list of the words in S, using sep as the\n\
				4297	delimiter string. If maxsplit is given, at most maxsplit\n\
				4298	splits are done. If sep is not specified, any whitespace string\n\
				4299	is a separator.";
				4300
				4301	static PyObject*
				4302	unicode_split(PyUnicodeObject self, PyObject args)
				4303	{
				4304	PyObject *substring = Py_None;
				4305	int maxcount = -1;
				4306
				4307	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4308	return NULL;
				4309
				4310	if (substring == Py_None)
				4311	return split(self, NULL, maxcount);
				4312	else if (PyUnicode_Check(substring))
				4313	return split(self, (PyUnicodeObject *)substring, maxcount);
				4314	else
				4315	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4316	}
				4317
				4318	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4319	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4320	\n\
				4321	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4322	Line breaks are not included in the resulting list unless keepends\n\
				4323	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4324
				4325	static PyObject*
				4326	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4327	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4328	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4329
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4330	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4331	return NULL;
				4332
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4333	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4334	}
				4335
				4336	static
				4337	PyObject unicode_str(PyUnicodeObject self)
				4338	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4339	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4340	}
				4341
				4342	static char strip__doc__[] =
				4343	"S.strip() -> unicode\n\
				4344	\n\
				4345	Return a copy of S with leading and trailing whitespace removed.";
				4346
				4347	static PyObject *
				4348	unicode_strip(PyUnicodeObject self, PyObject args)
				4349	{
				4350	if (!PyArg_NoArgs(args))
				4351	return NULL;
				4352	return strip(self, 1, 1);
				4353	}
				4354
				4355	static char swapcase__doc__[] =
				4356	"S.swapcase() -> unicode\n\
				4357	\n\
				4358	Return a copy of S with uppercase characters converted to lowercase\n\
				4359	and vice versa.";
				4360
				4361	static PyObject*
				4362	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4363	{
				4364	if (!PyArg_NoArgs(args))
				4365	return NULL;
				4366	return fixup(self, fixswapcase);
				4367	}
				4368
				4369	static char translate__doc__[] =
				4370	"S.translate(table) -> unicode\n\
				4371	\n\
				4372	Return a copy of the string S, where all characters have been mapped\n\
				4373	through the given translation table, which must be a mapping of\n\
				4374	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4375	are left untouched. Characters mapped to None are deleted.";
				4376
				4377	static PyObject*
				4378	unicode_translate(PyUnicodeObject self, PyObject args)
				4379	{
				4380	PyObject *table;
				4381
				4382	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4383	return NULL;
				4384	return PyUnicode_TranslateCharmap(self->str,
				4385	self->length,
				4386	table,
				4387	"ignore");
				4388	}
				4389
				4390	static char upper__doc__[] =
				4391	"S.upper() -> unicode\n\
				4392	\n\
				4393	Return a copy of S converted to uppercase.";
				4394
				4395	static PyObject*
				4396	unicode_upper(PyUnicodeObject self, PyObject args)
				4397	{
				4398	if (!PyArg_NoArgs(args))
				4399	return NULL;
				4400	return fixup(self, fixupper);
				4401	}
				4402
				4403	#if 0
				4404	static char zfill__doc__[] =
				4405	"S.zfill(width) -> unicode\n\
				4406	\n\
				4407	Pad a numeric string x with zeros on the left, to fill a field\n\
				4408	of the specified width. The string x is never truncated.";
				4409
				4410	static PyObject *
				4411	unicode_zfill(PyUnicodeObject self, PyObject args)
				4412	{
				4413	int fill;
				4414	PyUnicodeObject *u;
				4415
				4416	int width;
				4417	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4418	return NULL;
				4419
				4420	if (self->length >= width) {
				4421	Py_INCREF(self);
				4422	return (PyObject*) self;
				4423	}
				4424
				4425	fill = width - self->length;
				4426
				4427	u = pad(self, fill, 0, '0');
				4428
				4429	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4430	/* move sign to beginning of string */
				4431	u->str[0] = u->str[fill];
				4432	u->str[fill] = '0';
				4433	}
				4434
				4435	return (PyObject*) u;
				4436	}
				4437	#endif
				4438
				4439	#if 0
				4440	static PyObject*
				4441	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4442	{
				4443	if (!PyArg_NoArgs(args))
				4444	return NULL;
				4445	return PyInt_FromLong(unicode_freelist_size);
				4446	}
				4447	#endif
				4448
				4449	static char startswith__doc__[] =
				4450	"S.startswith(prefix[, start[, end]]) -> int\n\
				4451	\n\
				4452	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4453	optional start, test S beginning at that position. With optional end, stop\n\
				4454	comparing S at that position.";
				4455
				4456	static PyObject *
				4457	unicode_startswith(PyUnicodeObject *self,
				4458	PyObject *args)
				4459	{
				4460	PyUnicodeObject *substring;
				4461	int start = 0;
				4462	int end = INT_MAX;
				4463	PyObject *result;
				4464
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4465	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4466	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4467	return NULL;
				4468	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4469	(PyObject *)substring);
				4470	if (substring == NULL)
				4471	return NULL;
				4472
				4473	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4474
				4475	Py_DECREF(substring);
				4476	return result;
				4477	}
				4478
				4479
				4480	static char endswith__doc__[] =
				4481	"S.endswith(suffix[, start[, end]]) -> int\n\
				4482	\n\
				4483	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4484	optional start, test S beginning at that position. With optional end, stop\n\
				4485	comparing S at that position.";
				4486
				4487	static PyObject *
				4488	unicode_endswith(PyUnicodeObject *self,
				4489	PyObject *args)
				4490	{
				4491	PyUnicodeObject *substring;
				4492	int start = 0;
				4493	int end = INT_MAX;
				4494	PyObject *result;
				4495
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4496	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4497	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4498	return NULL;
				4499	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4500	(PyObject *)substring);
				4501	if (substring == NULL)
				4502	return NULL;
				4503
				4504	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4505
				4506	Py_DECREF(substring);
				4507	return result;
				4508	}
				4509
				4510
				4511	static PyMethodDef unicode_methods[] = {
				4512
				4513	/* Order is according to common usage: often used methods should
				4514	appear first, since lookup is done sequentially. */
				4515
				4516	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4517	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4518	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4519	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4520	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4521	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4522	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4523	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4524	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4525	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4526	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4527	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4528	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4529	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4530	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4531	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4532	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4533	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4534	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4535	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4536	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4537	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4538	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4539	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4540	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4541	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4542	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4543	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4544	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4545	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4546	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4547	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4548	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4549	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4550	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4551	#if 0
				4552	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4553	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4554	#endif
				4555
				4556	#if 0
				4557	/* This one is just used for debugging the implementation. */
				4558	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4559	#endif
				4560
				4561	{NULL, NULL}
				4562	};
				4563
				4564	static PyObject *
				4565	unicode_getattr(PyUnicodeObject self, char name)
				4566	{
				4567	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4568	}
				4569
				4570	static PySequenceMethods unicode_as_sequence = {
				4571	(inquiry) unicode_length, /* sq_length */
				4572	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4573	(intargfunc) unicode_repeat, /* sq_repeat */
				4574	(intargfunc) unicode_getitem, /* sq_item */
				4575	(intintargfunc) unicode_slice, /* sq_slice */
				4576	0, /* sq_ass_item */
				4577	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4578	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4579	};
				4580
				4581	static int
				4582	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4583	int index,
				4584	const void **ptr)
				4585	{
				4586	if (index != 0) {
				4587	PyErr_SetString(PyExc_SystemError,
				4588	"accessing non-existent unicode segment");
				4589	return -1;
				4590	}
				4591	ptr = (void ) self->str;
				4592	return PyUnicode_GET_DATA_SIZE(self);
				4593	}
				4594
				4595	static int
				4596	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4597	const void **ptr)
				4598	{
				4599	PyErr_SetString(PyExc_TypeError,
				4600	"cannot use unicode as modifyable buffer");
				4601	return -1;
				4602	}
				4603
				4604	static int
				4605	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4606	int *lenp)
				4607	{
				4608	if (lenp)
				4609	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4610	return 1;
				4611	}
				4612
				4613	static int
				4614	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4615	int index,
				4616	const void **ptr)
				4617	{
				4618	PyObject *str;
				4619
				4620	if (index != 0) {
				4621	PyErr_SetString(PyExc_SystemError,
				4622	"accessing non-existent unicode segment");
				4623	return -1;
				4624	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4625	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4626	if (str == NULL)
				4627	return -1;
				4628	ptr = (void ) PyString_AS_STRING(str);
				4629	return PyString_GET_SIZE(str);
				4630	}
				4631
				4632	/* Helpers for PyUnicode_Format() */
				4633
				4634	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4635	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4636	{
				4637	int argidx = *p_argidx;
				4638	if (argidx < arglen) {
				4639	(*p_argidx)++;
				4640	if (arglen < 0)
				4641	return args;
				4642	else
				4643	return PyTuple_GetItem(args, argidx);
				4644	}
				4645	PyErr_SetString(PyExc_TypeError,
				4646	"not enough arguments for format string");
				4647	return NULL;
				4648	}
				4649
				4650	#define F_LJUST (1<<0)
				4651	#define F_SIGN (1<<1)
				4652	#define F_BLANK (1<<2)
				4653	#define F_ALT (1<<3)
				4654	#define F_ZERO (1<<4)
				4655
				4656	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4657	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4658	{
				4659	register int i;
				4660	int len;
				4661	va_list va;
				4662	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4663	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4664
				4665	/* First, format the string as char array, then expand to Py_UNICODE
				4666	array. */
				4667	charbuffer = (char *)buffer;
				4668	len = vsprintf(charbuffer, format, va);
				4669	for (i = len - 1; i >= 0; i--)
				4670	buffer[i] = (Py_UNICODE) charbuffer[i];
				4671
				4672	va_end(va);
				4673	return len;
				4674	}
				4675
				4676	static int
				4677	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4678	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4679	int flags,
				4680	int prec,
				4681	int type,
				4682	PyObject *v)
				4683	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4684	/* fmt = '%#.' + `prec` + `type`
				4685	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4686	char fmt[20];
				4687	double x;
				4688
				4689	x = PyFloat_AsDouble(v);
				4690	if (x == -1.0 && PyErr_Occurred())
				4691	return -1;
				4692	if (prec < 0)
				4693	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4694	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4695	type = 'g';
				4696	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4697	/* worst case length calc to ensure no buffer overrun:
				4698	fmt = %#.<prec>g
				4699	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4700	for any double rep.)
				4701	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4702	If prec=0 the effective precision is 1 (the leading digit is
				4703	always given), therefore increase by one to 10+prec. */
				4704	if (buflen <= (size_t)10 + (size_t)prec) {
				4705	PyErr_SetString(PyExc_OverflowError,
				4706	"formatted float is too long (precision too long?)");
				4707	return -1;
				4708	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4709	return usprintf(buf, fmt, x);
				4710	}
				4711
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4712	static PyObject*
				4713	formatlong(PyObject *val, int flags, int prec, int type)
				4714	{
				4715	char *buf;
				4716	int i, len;
				4717	PyObject str; / temporary string object. */
				4718	PyUnicodeObject *result;
				4719
				4720	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4721	if (!str)
				4722	return NULL;
				4723	result = _PyUnicode_New(len);
				4724	for (i = 0; i < len; i++)
				4725	result->str[i] = buf[i];
				4726	result->str[len] = 0;
				4727	Py_DECREF(str);
				4728	return (PyObject*)result;
				4729	}
				4730
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4731	static int
				4732	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4733	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4734	int flags,
				4735	int prec,
				4736	int type,
				4737	PyObject *v)
				4738	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4739	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4740	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4741	+ 1 + 1 = 24*/
				4742	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4743	long x;
				4744
				4745	x = PyInt_AsLong(v);
				4746	if (x == -1 && PyErr_Occurred())
				4747	return -1;
				4748	if (prec < 0)
				4749	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4750	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4751	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4752	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4753	PyErr_SetString(PyExc_OverflowError,
				4754	"formatted integer is too long (precision too long?)");
				4755	return -1;
				4756	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4757	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4758	return usprintf(buf, fmt, x);
				4759	}
				4760
				4761	static int
				4762	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4763	size_t buflen,
				4764	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4765	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4766	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4767	if (PyUnicode_Check(v)) {
				4768	if (PyUnicode_GET_SIZE(v) != 1)
				4769	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4770	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4771	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4772
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4773	else if (PyString_Check(v)) {
				4774	if (PyString_GET_SIZE(v) != 1)
				4775	goto onError;
				4776	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4777	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4778
				4779	else {
				4780	/* Integer input truncated to a character */
				4781	long x;
				4782	x = PyInt_AsLong(v);
				4783	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4784	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4785	buf[0] = (char) x;
				4786	}
				4787	buf[1] = '\0';
				4788	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4789
				4790	onError:
				4791	PyErr_SetString(PyExc_TypeError,
				4792	"%c requires int or char");
				4793	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4794	}
				4795
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer that
   holds formatted floats, ints and chars.  XXX This is a magic number.
   Each formatting routine checks bounds to avoid overflow; allocating a
   buffer of the right size per conversion might be a better solution.
   For now, the fixed size is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
				4805
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4806	PyObject PyUnicode_Format(PyObject format,
				4807	PyObject *args)
				4808	{
				4809	Py_UNICODE fmt, res;
				4810	int fmtcnt, rescnt, reslen, arglen, argidx;
				4811	int args_owned = 0;
				4812	PyUnicodeObject *result = NULL;
				4813	PyObject *dict = NULL;
				4814	PyObject *uformat;
				4815
				4816	if (format == NULL \|\| args == NULL) {
				4817	PyErr_BadInternalCall();
				4818	return NULL;
				4819	}
				4820	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4821	if (uformat == NULL)
				4822	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4823	fmt = PyUnicode_AS_UNICODE(uformat);
				4824	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4825
				4826	reslen = rescnt = fmtcnt + 100;
				4827	result = _PyUnicode_New(reslen);
				4828	if (result == NULL)
				4829	goto onError;
				4830	res = PyUnicode_AS_UNICODE(result);
				4831
				4832	if (PyTuple_Check(args)) {
				4833	arglen = PyTuple_Size(args);
				4834	argidx = 0;
				4835	}
				4836	else {
				4837	arglen = -1;
				4838	argidx = -2;
				4839	}
				4840	if (args->ob_type->tp_as_mapping)
				4841	dict = args;
				4842
				4843	while (--fmtcnt >= 0) {
				4844	if (*fmt != '%') {
				4845	if (--rescnt < 0) {
				4846	rescnt = fmtcnt + 100;
				4847	reslen += rescnt;
				4848	if (_PyUnicode_Resize(result, reslen) < 0)
				4849	return NULL;
				4850	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4851	--rescnt;
				4852	}
				4853	res++ = fmt++;
				4854	}
				4855	else {
				4856	/* Got a format specifier */
				4857	int flags = 0;
				4858	int width = -1;
				4859	int prec = -1;
				4860	int size = 0;
				4861	Py_UNICODE c = '\0';
				4862	Py_UNICODE fill;
				4863	PyObject *v = NULL;
				4864	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4865	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4866	Py_UNICODE sign;
				4867	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4868	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4869
				4870	fmt++;
				4871	if (*fmt == '(') {
				4872	Py_UNICODE *keystart;
				4873	int keylen;
				4874	PyObject *key;
				4875	int pcount = 1;
				4876
				4877	if (dict == NULL) {
				4878	PyErr_SetString(PyExc_TypeError,
				4879	"format requires a mapping");
				4880	goto onError;
				4881	}
				4882	++fmt;
				4883	--fmtcnt;
				4884	keystart = fmt;
				4885	/* Skip over balanced parentheses */
				4886	while (pcount > 0 && --fmtcnt >= 0) {
				4887	if (*fmt == ')')
				4888	--pcount;
				4889	else if (*fmt == '(')
				4890	++pcount;
				4891	fmt++;
				4892	}
				4893	keylen = fmt - keystart - 1;
				4894	if (fmtcnt < 0 \|\| pcount > 0) {
				4895	PyErr_SetString(PyExc_ValueError,
				4896	"incomplete format key");
				4897	goto onError;
				4898	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4899	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4900	then looked up since Python uses strings to hold
				4901	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4902	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4903	key = PyUnicode_EncodeUTF8(keystart,
				4904	keylen,
				4905	NULL);
				4906	if (key == NULL)
				4907	goto onError;
				4908	if (args_owned) {
				4909	Py_DECREF(args);
				4910	args_owned = 0;
				4911	}
				4912	args = PyObject_GetItem(dict, key);
				4913	Py_DECREF(key);
				4914	if (args == NULL) {
				4915	goto onError;
				4916	}
				4917	args_owned = 1;
				4918	arglen = -1;
				4919	argidx = -2;
				4920	}
				4921	while (--fmtcnt >= 0) {
				4922	switch (c = *fmt++) {
				4923	case '-': flags \|= F_LJUST; continue;
				4924	case '+': flags \|= F_SIGN; continue;
				4925	case ' ': flags \|= F_BLANK; continue;
				4926	case '#': flags \|= F_ALT; continue;
				4927	case '0': flags \|= F_ZERO; continue;
				4928	}
				4929	break;
				4930	}
				4931	if (c == '*') {
				4932	v = getnextarg(args, arglen, &argidx);
				4933	if (v == NULL)
				4934	goto onError;
				4935	if (!PyInt_Check(v)) {
				4936	PyErr_SetString(PyExc_TypeError,
				4937	"* wants int");
				4938	goto onError;
				4939	}
				4940	width = PyInt_AsLong(v);
				4941	if (width < 0) {
				4942	flags \|= F_LJUST;
				4943	width = -width;
				4944	}
				4945	if (--fmtcnt >= 0)
				4946	c = *fmt++;
				4947	}
				4948	else if (c >= '0' && c <= '9') {
				4949	width = c - '0';
				4950	while (--fmtcnt >= 0) {
				4951	c = *fmt++;
				4952	if (c < '0' \|\| c > '9')
				4953	break;
				4954	if ((width*10) / 10 != width) {
				4955	PyErr_SetString(PyExc_ValueError,
				4956	"width too big");
				4957	goto onError;
				4958	}
				4959	width = width*10 + (c - '0');
				4960	}
				4961	}
				4962	if (c == '.') {
				4963	prec = 0;
				4964	if (--fmtcnt >= 0)
				4965	c = *fmt++;
				4966	if (c == '*') {
				4967	v = getnextarg(args, arglen, &argidx);
				4968	if (v == NULL)
				4969	goto onError;
				4970	if (!PyInt_Check(v)) {
				4971	PyErr_SetString(PyExc_TypeError,
				4972	"* wants int");
				4973	goto onError;
				4974	}
				4975	prec = PyInt_AsLong(v);
				4976	if (prec < 0)
				4977	prec = 0;
				4978	if (--fmtcnt >= 0)
				4979	c = *fmt++;
				4980	}
				4981	else if (c >= '0' && c <= '9') {
				4982	prec = c - '0';
				4983	while (--fmtcnt >= 0) {
				4984	c = Py_CHARMASK(*fmt++);
				4985	if (c < '0' \|\| c > '9')
				4986	break;
				4987	if ((prec*10) / 10 != prec) {
				4988	PyErr_SetString(PyExc_ValueError,
				4989	"prec too big");
				4990	goto onError;
				4991	}
				4992	prec = prec*10 + (c - '0');
				4993	}
				4994	}
				4995	} /* prec */
				4996	if (fmtcnt >= 0) {
				4997	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4998	size = c;
				4999	if (--fmtcnt >= 0)
				5000	c = *fmt++;
				5001	}
				5002	}
				5003	if (fmtcnt < 0) {
				5004	PyErr_SetString(PyExc_ValueError,
				5005	"incomplete format");
				5006	goto onError;
				5007	}
				5008	if (c != '%') {
				5009	v = getnextarg(args, arglen, &argidx);
				5010	if (v == NULL)
				5011	goto onError;
				5012	}
				5013	sign = 0;
				5014	fill = ' ';
				5015	switch (c) {
				5016
				5017	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5018	pbuf = formatbuf;
				5019	/* presume that buffer length is at least 1 */
				5020	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5021	len = 1;
				5022	break;
				5023
				5024	case 's':
				5025	case 'r':
				5026	if (PyUnicode_Check(v) && c == 's') {
				5027	temp = v;
				5028	Py_INCREF(temp);
				5029	}
				5030	else {
				5031	PyObject *unicode;
				5032	if (c == 's')
				5033	temp = PyObject_Str(v);
				5034	else
				5035	temp = PyObject_Repr(v);
				5036	if (temp == NULL)
				5037	goto onError;
				5038	if (!PyString_Check(temp)) {
				5039	/* XXX Note: this should never happen, since
				5040	PyObject_Repr() and PyObject_Str() assure
				5041	this */
				5042	Py_DECREF(temp);
				5043	PyErr_SetString(PyExc_TypeError,
				5044	"%s argument has non-string str()");
				5045	goto onError;
				5046	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5047	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5048	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5049	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5050	"strict");
				5051	Py_DECREF(temp);
				5052	temp = unicode;
				5053	if (temp == NULL)
				5054	goto onError;
				5055	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5056	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5057	len = PyUnicode_GET_SIZE(temp);
				5058	if (prec >= 0 && len > prec)
				5059	len = prec;
				5060	break;
				5061
				5062	case 'i':
				5063	case 'd':
				5064	case 'u':
				5065	case 'o':
				5066	case 'x':
				5067	case 'X':
				5068	if (c == 'i')
				5069	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5070	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5071	temp = formatlong(v, flags, prec, c);
				5072	if (!temp)
				5073	goto onError;
				5074	pbuf = PyUnicode_AS_UNICODE(temp);
				5075	len = PyUnicode_GET_SIZE(temp);
				5076	/* unbounded ints can always produce
				5077	a sign character! */
				5078	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5079	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5080	else {
				5081	pbuf = formatbuf;
				5082	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5083	flags, prec, c, v);
				5084	if (len < 0)
				5085	goto onError;
				5086	/* only d conversion is signed */
				5087	sign = c == 'd';
				5088	}
				5089	if (flags & F_ZERO)
				5090	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5091	break;
				5092
				5093	case 'e':
				5094	case 'E':
				5095	case 'f':
				5096	case 'g':
				5097	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5098	pbuf = formatbuf;
				5099	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5100	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5101	if (len < 0)
				5102	goto onError;
				5103	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5104	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5105	fill = '0';
				5106	break;
				5107
				5108	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5109	pbuf = formatbuf;
				5110	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5111	if (len < 0)
				5112	goto onError;
				5113	break;
				5114
				5115	default:
				5116	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5117	"unsupported format character '%c' (0x%x) "
				5118	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5119	(31<=c && c<=126) ? c : '?',
				5120	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5121	goto onError;
				5122	}
				5123	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5124	if (pbuf == '-' \|\| pbuf == '+') {
				5125	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5126	len--;
				5127	}
				5128	else if (flags & F_SIGN)
				5129	sign = '+';
				5130	else if (flags & F_BLANK)
				5131	sign = ' ';
				5132	else
				5133	sign = 0;
				5134	}
				5135	if (width < len)
				5136	width = len;
				5137	if (rescnt < width + (sign != 0)) {
				5138	reslen -= rescnt;
				5139	rescnt = width + fmtcnt + 100;
				5140	reslen += rescnt;
				5141	if (_PyUnicode_Resize(result, reslen) < 0)
				5142	return NULL;
				5143	res = PyUnicode_AS_UNICODE(result)
				5144	+ reslen - rescnt;
				5145	}
				5146	if (sign) {
				5147	if (fill != ' ')
				5148	*res++ = sign;
				5149	rescnt--;
				5150	if (width > len)
				5151	width--;
				5152	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5153	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5154	assert(pbuf[0] == '0');
				5155	assert(pbuf[1] == c);
				5156	if (fill != ' ') {
				5157	res++ = pbuf++;
				5158	res++ = pbuf++;
				5159	}
				5160	rescnt -= 2;
				5161	width -= 2;
				5162	if (width < 0)
				5163	width = 0;
				5164	len -= 2;
				5165	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5166	if (width > len && !(flags & F_LJUST)) {
				5167	do {
				5168	--rescnt;
				5169	*res++ = fill;
				5170	} while (--width > len);
				5171	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5172	if (fill == ' ') {
				5173	if (sign)
				5174	*res++ = sign;
				5175	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5176	assert(pbuf[0] == '0');
				5177	assert(pbuf[1] == c);
				5178	res++ = pbuf++;
				5179	res++ = pbuf++;
				5180	}
				5181	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5182	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5183	res += len;
				5184	rescnt -= len;
				5185	while (--width >= len) {
				5186	--rescnt;
				5187	*res++ = ' ';
				5188	}
				5189	if (dict && (argidx < arglen) && c != '%') {
				5190	PyErr_SetString(PyExc_TypeError,
				5191	"not all arguments converted");
				5192	goto onError;
				5193	}
				5194	Py_XDECREF(temp);
				5195	} /* '%' */
				5196	} /* until end */
				5197	if (argidx < arglen && !dict) {
				5198	PyErr_SetString(PyExc_TypeError,
				5199	"not all arguments converted");
				5200	goto onError;
				5201	}
				5202
				5203	if (args_owned) {
				5204	Py_DECREF(args);
				5205	}
				5206	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5207	if (_PyUnicode_Resize(result, reslen - rescnt))
				5208	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5209	return (PyObject *)result;
				5210
				5211	onError:
				5212	Py_XDECREF(result);
				5213	Py_DECREF(uformat);
				5214	if (args_owned) {
				5215	Py_DECREF(args);
				5216	}
				5217	return NULL;
				5218	}
				5219
				5220	static PyBufferProcs unicode_as_buffer = {
				5221	(getreadbufferproc) unicode_buffer_getreadbuf,
				5222	(getwritebufferproc) unicode_buffer_getwritebuf,
				5223	(getsegcountproc) unicode_buffer_getsegcount,
				5224	(getcharbufferproc) unicode_buffer_getcharbuf,
				5225	};
				5226
				5227	PyTypeObject PyUnicode_Type = {
				5228	PyObject_HEAD_INIT(&PyType_Type)
				5229	0, /* ob_size */
				5230	"unicode", /* tp_name */
				5231	sizeof(PyUnicodeObject), /* tp_size */
				5232	0, /* tp_itemsize */
				5233	/* Slots */
				5234	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5235	0, /* tp_print */
				5236	(getattrfunc)unicode_getattr, /* tp_getattr */
				5237	0, /* tp_setattr */
				5238	(cmpfunc) unicode_compare, /* tp_compare */
				5239	(reprfunc) unicode_repr, /* tp_repr */
				5240	0, /* tp_as_number */
				5241	&unicode_as_sequence, /* tp_as_sequence */
				5242	0, /* tp_as_mapping */
				5243	(hashfunc) unicode_hash, /* tp_hash*/
				5244	0, /* tp_call*/
				5245	(reprfunc) unicode_str, /* tp_str */
				5246	(getattrofunc) NULL, /* tp_getattro */
				5247	(setattrofunc) NULL, /* tp_setattro */
				5248	&unicode_as_buffer, /* tp_as_buffer */
				5249	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5250	};
				5251
				5252	/* Initialize the Unicode implementation */
				5253
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5254	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5255	{
				5256	/* Doublecheck the configuration... */
				5257	if (sizeof(Py_UNICODE) != 2)
				5258	Py_FatalError("Unicode configuration error: "
				5259	"sizeof(Py_UNICODE) != 2 bytes");
				5260
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5261	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5262	unicode_freelist = NULL;
				5263	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5264	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5265	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5266	}
				5267
				5268	/* Finalize the Unicode implementation */
				5269
				5270	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5271	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5272	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5273	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5274
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5275	Py_XDECREF(unicode_empty);
				5276	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5277
				5278	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5279	PyUnicodeObject *v = u;
				5280	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5281	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5282	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5283	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5284	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5285	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5286	unicode_freelist = NULL;
				5287	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5288	}