Blame - Objects/unicodeobject.c - platform/external/python/cpython2

blob: 8f7b354c243bc8b6226da05f1bd9f435976729d1 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	70	#ifdef MS_WIN32
				71	#include <windows.h>
				72	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	73
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	74	/* Limit for the Unicode object free list */
				75
				76	#define MAX_UNICODE_FREELIST_SIZE 1024
				77
				78	/* Limit for the Unicode object free list stay alive optimization.
				79
				80	The implementation will keep allocated Unicode memory intact for
				81	all objects on the free list having a size less than this
				82	limit. This reduces malloc() overhead for small Unicode objects.
				83
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	84	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	85	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	malloc()-overhead) bytes of unused garbage.
				87
				88	Setting the limit to 0 effectively turns the feature off.
				89
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	90	Note: This is an experimental feature ! If you get core dumps when
				91	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
				93	*/
				94
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	95	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	96
				97	/* Endianness switches; defaults to little endian */
				98
				99	#ifdef WORDS_BIGENDIAN
				100	# define BYTEORDER_IS_BIG_ENDIAN
				101	#else
				102	# define BYTEORDER_IS_LITTLE_ENDIAN
				103	#endif
				104
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	105	/* --- Globals ------------------------------------------------------------
				106
				107	The globals are initialized by the _PyUnicode_Init() API and should
				108	not be used before calling that API.
				109
				110	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	111
				112	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	113	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	116	static PyUnicodeObject *unicode_freelist;
				117	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
/* Name of the encoding that is used and assumed whenever NULL is passed
   as the encoding parameter.  It is initialized by _PyUnicode_Init().

   Do not touch this buffer directly: go through
   PyUnicode_SetDefaultEncoding() and PyUnicode_GetDefaultEncoding(). */

static char unicode_default_encoding[100] = "";
				128
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	/* --- Unicode Object ----------------------------------------------------- */
				130
				131	static
				132	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				133	int length)
				134	{
				135	void *oldstr;
				136
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	137	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	138	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	139	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	140
				141	/* Resizing unicode_empty is not allowed. */
				142	if (unicode == unicode_empty) {
				143	PyErr_SetString(PyExc_SystemError,
				144	"can't resize empty unicode object");
				145	return -1;
				146	}
				147
				148	/* We allocate one more byte to make sure the string is
				149	Ux0000 terminated -- XXX is this needed ? */
				150	oldstr = unicode->str;
				151	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				152	if (!unicode->str) {
				153	unicode->str = oldstr;
				154	PyErr_NoMemory();
				155	return -1;
				156	}
				157	unicode->str[length] = 0;
				158	unicode->length = length;
				159
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	160	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	161	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	162	if (unicode->defenc) {
				163	Py_DECREF(unicode->defenc);
				164	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	165	}
				166	unicode->hash = -1;
				167
				168	return 0;
				169	}
				170
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	171	int PyUnicode_Resize(PyObject **unicode,
				172	int length)
				173	{
				174	PyUnicodeObject *v;
				175
				176	if (unicode == NULL) {
				177	PyErr_BadInternalCall();
				178	return -1;
				179	}
				180	v = (PyUnicodeObject )unicode;
				181	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				182	PyErr_BadInternalCall();
				183	return -1;
				184	}
				185	return _PyUnicode_Resize(v, length);
				186	}
				187
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	/* We allocate one more byte to make sure the string is
				189	Ux0000 terminated -- XXX is this needed ?
				190
				191	XXX This allocator could further be enhanced by assuring that the
				192	free list never reduces its size below 1.
				193
				194	*/
				195
				196	static
				197	PyUnicodeObject *_PyUnicode_New(int length)
				198	{
				199	register PyUnicodeObject *unicode;
				200
				201	/* Optimization for empty strings */
				202	if (length == 0 && unicode_empty != NULL) {
				203	Py_INCREF(unicode_empty);
				204	return unicode_empty;
				205	}
				206
				207	/* Unicode freelist & memory allocation */
				208	if (unicode_freelist) {
				209	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	210	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	212	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	213	/* Keep-Alive optimization: we only upsize the buffer,
				214	never downsize it. */
				215	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	216	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	217	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	218	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	}
				220	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	221	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	223	}
				224	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	else {
				227	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				228	if (unicode == NULL)
				229	return NULL;
				230	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				231	}
				232
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	233	if (!unicode->str) {
				234	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	235	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	236	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str[length] = 0;
				238	unicode->length = length;
				239	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	240	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242
				243	onError:
				244	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	246	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	}
				248
				249	static
				250	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				251	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	253	/* Keep-Alive optimization */
				254	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	255	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	256	unicode->str = NULL;
				257	unicode->length = 0;
				258	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	259	if (unicode->defenc) {
				260	Py_DECREF(unicode->defenc);
				261	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	262	}
				263	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	264	(PyUnicodeObject *)unicode = unicode_freelist;
				265	unicode_freelist = unicode;
				266	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	267	}
				268	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	269	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	270	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	271	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	}
				274
				275	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				276	int size)
				277	{
				278	PyUnicodeObject *unicode;
				279
				280	unicode = _PyUnicode_New(size);
				281	if (!unicode)
				282	return NULL;
				283
				284	/* Copy the Unicode data into the new object */
				285	if (u != NULL)
				286	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				287
				288	return (PyObject *)unicode;
				289	}
				290
				291	#ifdef HAVE_WCHAR_H
				292
				293	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				294	int size)
				295	{
				296	PyUnicodeObject *unicode;
				297
				298	if (w == NULL) {
				299	PyErr_BadInternalCall();
				300	return NULL;
				301	}
				302
				303	unicode = _PyUnicode_New(size);
				304	if (!unicode)
				305	return NULL;
				306
				307	/* Copy the wchar_t data into the new object */
				308	#ifdef HAVE_USABLE_WCHAR_T
				309	memcpy(unicode->str, w, size * sizeof(wchar_t));
				310	#else
				311	{
				312	register Py_UNICODE *u;
				313	register int i;
				314	u = PyUnicode_AS_UNICODE(unicode);
				315	for (i = size; i >= 0; i--)
				316	u++ = w++;
				317	}
				318	#endif
				319
				320	return (PyObject *)unicode;
				321	}
				322
				323	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				324	register wchar_t *w,
				325	int size)
				326	{
				327	if (unicode == NULL) {
				328	PyErr_BadInternalCall();
				329	return -1;
				330	}
				331	if (size > PyUnicode_GET_SIZE(unicode))
				332	size = PyUnicode_GET_SIZE(unicode);
				333	#ifdef HAVE_USABLE_WCHAR_T
				334	memcpy(w, unicode->str, size * sizeof(wchar_t));
				335	#else
				336	{
				337	register Py_UNICODE *u;
				338	register int i;
				339	u = PyUnicode_AS_UNICODE(unicode);
				340	for (i = size; i >= 0; i--)
				341	w++ = u++;
				342	}
				343	#endif
				344
				345	return size;
				346	}
				347
				348	#endif
				349
				350	PyObject PyUnicode_FromObject(register PyObject obj)
				351	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	352	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				353	}
				354
				355	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				356	const char *encoding,
				357	const char *errors)
				358	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	const char *s;
				360	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	361	int owned = 0;
				362	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	363
				364	if (obj == NULL) {
				365	PyErr_BadInternalCall();
				366	return NULL;
				367	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368
				369	/* Coerce object */
				370	if (PyInstance_Check(obj)) {
				371	PyObject *func;
				372	func = PyObject_GetAttrString(obj, "__str__");
				373	if (func == NULL) {
				374	PyErr_SetString(PyExc_TypeError,
				375	"coercing to Unicode: instance doesn't define __str__");
				376	return NULL;
				377	}
				378	obj = PyEval_CallObject(func, NULL);
				379	Py_DECREF(func);
				380	if (obj == NULL)
				381	return NULL;
				382	owned = 1;
				383	}
				384	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	385	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = obj;
				387	if (encoding) {
				388	PyErr_SetString(PyExc_TypeError,
				389	"decoding Unicode is not supported");
				390	return NULL;
				391	}
				392	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	393	}
				394	else if (PyString_Check(obj)) {
				395	s = PyString_AS_STRING(obj);
				396	len = PyString_GET_SIZE(obj);
				397	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	398	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				399	/* Overwrite the error message with something more useful in
				400	case of a TypeError. */
				401	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	402	PyErr_Format(PyExc_TypeError,
				403	"coercing to Unicode: need string or buffer, "
				404	"%.80s found",
				405	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	406	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	407	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	408
				409	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	410	if (len == 0) {
				411	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414	else
				415	v = PyUnicode_Decode(s, len, encoding, errors);
				416	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	417	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	return v;
				421
				422	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	}
				428
				429	PyObject PyUnicode_Decode(const char s,
				430	int size,
				431	const char *encoding,
				432	const char *errors)
				433	{
				434	PyObject buffer = NULL, unicode;
				435
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	436	if (encoding == NULL)
				437	encoding = PyUnicode_GetDefaultEncoding();
				438
				439	/* Shortcuts for common default encodings */
				440	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	441	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	else if (strcmp(encoding, "latin-1") == 0)
				443	return PyUnicode_DecodeLatin1(s, size, errors);
				444	else if (strcmp(encoding, "ascii") == 0)
				445	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	446
				447	/* Decode via the codec registry */
				448	buffer = PyBuffer_FromMemory((void *)s, size);
				449	if (buffer == NULL)
				450	goto onError;
				451	unicode = PyCodec_Decode(buffer, encoding, errors);
				452	if (unicode == NULL)
				453	goto onError;
				454	if (!PyUnicode_Check(unicode)) {
				455	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	456	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	457	unicode->ob_type->tp_name);
				458	Py_DECREF(unicode);
				459	goto onError;
				460	}
				461	Py_DECREF(buffer);
				462	return unicode;
				463
				464	onError:
				465	Py_XDECREF(buffer);
				466	return NULL;
				467	}
				468
				469	PyObject PyUnicode_Encode(const Py_UNICODE s,
				470	int size,
				471	const char *encoding,
				472	const char *errors)
				473	{
				474	PyObject v, unicode;
				475
				476	unicode = PyUnicode_FromUnicode(s, size);
				477	if (unicode == NULL)
				478	return NULL;
				479	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				480	Py_DECREF(unicode);
				481	return v;
				482	}
				483
				484	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				485	const char *encoding,
				486	const char *errors)
				487	{
				488	PyObject *v;
				489
				490	if (!PyUnicode_Check(unicode)) {
				491	PyErr_BadArgument();
				492	goto onError;
				493	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	494
				495	if (encoding == NULL)
				496	encoding = PyUnicode_GetDefaultEncoding();
				497
				498	/* Shortcuts for common default encodings */
				499	if (errors == NULL) {
				500	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	501	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	502	else if (strcmp(encoding, "latin-1") == 0)
				503	return PyUnicode_AsLatin1String(unicode);
				504	else if (strcmp(encoding, "ascii") == 0)
				505	return PyUnicode_AsASCIIString(unicode);
				506	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507
				508	/* Encode via the codec registry */
				509	v = PyCodec_Encode(unicode, encoding, errors);
				510	if (v == NULL)
				511	goto onError;
				512	/* XXX Should we really enforce this ? */
				513	if (!PyString_Check(v)) {
				514	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	515	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	516	v->ob_type->tp_name);
				517	Py_DECREF(v);
				518	goto onError;
				519	}
				520	return v;
				521
				522	onError:
				523	return NULL;
				524	}
				525
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	526	/* Return a Python string holding the default encoded value of the
				527	Unicode object.
				528
				529	The resulting string is cached in the Unicode object for subsequent
				530	usage by this function. The cached version is needed to implement
				531	the character buffer interface and will live (at least) as long as
				532	the Unicode object itself.
				533
				534	The refcount of the string is not incremented.
				535
				536	* Exported for internal use by the interpreter only !!! *
				537
				538	*/
				539
				540	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				541	const char *errors)
				542	{
				543	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				544
				545	if (v)
				546	return v;
				547	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				548	if (v && errors == NULL)
				549	((PyUnicodeObject *)unicode)->defenc = v;
				550	return v;
				551	}
				552
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	553	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				554	{
				555	if (!PyUnicode_Check(unicode)) {
				556	PyErr_BadArgument();
				557	goto onError;
				558	}
				559	return PyUnicode_AS_UNICODE(unicode);
				560
				561	onError:
				562	return NULL;
				563	}
				564
				565	int PyUnicode_GetSize(PyObject *unicode)
				566	{
				567	if (!PyUnicode_Check(unicode)) {
				568	PyErr_BadArgument();
				569	goto onError;
				570	}
				571	return PyUnicode_GET_SIZE(unicode);
				572
				573	onError:
				574	return -1;
				575	}
				576
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	577	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	578	{
				579	return unicode_default_encoding;
				580	}
				581
				582	int PyUnicode_SetDefaultEncoding(const char *encoding)
				583	{
				584	PyObject *v;
				585
				586	/* Make sure the encoding is valid. As side effect, this also
				587	loads the encoding into the codec registry cache. */
				588	v = _PyCodec_Lookup(encoding);
				589	if (v == NULL)
				590	goto onError;
				591	Py_DECREF(v);
				592	strncpy(unicode_default_encoding,
				593	encoding,
				594	sizeof(unicode_default_encoding));
				595	return 0;
				596
				597	onError:
				598	return -1;
				599	}
				600
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	601	/* --- UTF-8 Codec -------------------------------------------------------- */
				602
				603	static
				604	char utf8_code_length[256] = {
				605	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				606	illegal prefix. see RFC 2279 for details */
				607	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				608	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				609	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				610	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				611	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				612	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				616	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				617	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				618	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				619	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				620	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				621	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				622	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				623	};
				624
				625	static
				626	int utf8_decoding_error(const char **source,
				627	Py_UNICODE **dest,
				628	const char *errors,
				629	const char *details)
				630	{
				631	if ((errors == NULL) \|\|
				632	(strcmp(errors,"strict") == 0)) {
				633	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	634	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	635	details);
				636	return -1;
				637	}
				638	else if (strcmp(errors,"ignore") == 0) {
				639	(*source)++;
				640	return 0;
				641	}
				642	else if (strcmp(errors,"replace") == 0) {
				643	(*source)++;
				644	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				645	(*dest)++;
				646	return 0;
				647	}
				648	else {
				649	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	650	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	651	errors);
				652	return -1;
				653	}
				654	}
				655
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	656	PyObject PyUnicode_DecodeUTF8(const char s,
				657	int size,
				658	const char *errors)
				659	{
				660	int n;
				661	const char *e;
				662	PyUnicodeObject *unicode;
				663	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	665
				666	/* Note: size will always be longer than the resulting Unicode
				667	character count */
				668	unicode = _PyUnicode_New(size);
				669	if (!unicode)
				670	return NULL;
				671	if (size == 0)
				672	return (PyObject *)unicode;
				673
				674	/* Unpack UTF-8 encoded data */
				675	p = unicode->str;
				676	e = s + size;
				677
				678	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	679	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	680
				681	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	682	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	683	s++;
				684	continue;
				685	}
				686
				687	n = utf8_code_length[ch];
				688
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	689	if (s + n > e) {
				690	errmsg = "unexpected end of data";
				691	goto utf8Error;
				692	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693
				694	switch (n) {
				695
				696	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	errmsg = "unexpected code byte";
				698	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	break;
				700
				701	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	errmsg = "internal error";
				703	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	704	break;
				705
				706	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	if ((s[1] & 0xc0) != 0x80) {
				708	errmsg = "invalid data";
				709	goto utf8Error;
				710	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	711	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	712	if (ch < 0x80) {
				713	errmsg = "illegal encoding";
				714	goto utf8Error;
				715	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	716	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	717	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	718	break;
				719
				720	case 3:
				721	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	722	(s[2] & 0xc0) != 0x80) {
				723	errmsg = "invalid data";
				724	goto utf8Error;
				725	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	727	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				728	errmsg = "illegal encoding";
				729	goto utf8Error;
				730	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	731	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	732	*p++ = (Py_UNICODE)ch;
				733	break;
				734
				735	case 4:
				736	if ((s[1] & 0xc0) != 0x80 \|\|
				737	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	738	(s[3] & 0xc0) != 0x80) {
				739	errmsg = "invalid data";
				740	goto utf8Error;
				741	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	742	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				743	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				744	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	745	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				746	byte encoding */
				747	(ch > 0x10ffff)) { /* maximum value allowed for
				748	UTF-16 */
				749	errmsg = "illegal encoding";
				750	goto utf8Error;
				751	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	752	/* compute and append the two surrogates: */
				753
				754	/* translate from 10000..10FFFF to 0..FFFF */
				755	ch -= 0x10000;
				756
				757	/* high surrogate = top 10 bits added to D800 */
				758	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				759
				760	/* low surrogate = bottom 10 bits added to DC00 */
				761	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	762	break;
				763
				764	default:
				765	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	766	errmsg = "unsupported Unicode code range";
				767	goto utf8Error;
				768	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	769	}
				770	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	771	continue;
				772
				773	utf8Error:
				774	if (utf8_decoding_error(&s, &p, errors, errmsg))
				775	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	776	}
				777
				778	/* Adjust length */
				779	if (_PyUnicode_Resize(unicode, p - unicode->str))
				780	goto onError;
				781
				782	return (PyObject *)unicode;
				783
				784	onError:
				785	Py_DECREF(unicode);
				786	return NULL;
				787	}
				788
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	789	/* Not used anymore, now that the encoder supports UTF-16
				790	surrogates. */
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	791	#if 0
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	792	static
				793	int utf8_encoding_error(const Py_UNICODE **source,
				794	char **dest,
				795	const char *errors,
				796	const char *details)
				797	{
				798	if ((errors == NULL) \|\|
				799	(strcmp(errors,"strict") == 0)) {
				800	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	801	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	802	details);
				803	return -1;
				804	}
				805	else if (strcmp(errors,"ignore") == 0) {
				806	return 0;
				807	}
				808	else if (strcmp(errors,"replace") == 0) {
				809	**dest = '?';
				810	(*dest)++;
				811	return 0;
				812	}
				813	else {
				814	PyErr_Format(PyExc_ValueError,
				815	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	816	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	817	errors);
				818	return -1;
				819	}
				820	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	821	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	822
				823	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				824	int size,
				825	const char *errors)
				826	{
				827	PyObject *v;
				828	char *p;
				829	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	830	Py_UCS4 ch2;
				831	unsigned int cbAllocated = 3 * size;
				832	unsigned int cbWritten = 0;
				833	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	834
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	835	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	836	if (v == NULL)
				837	return NULL;
				838	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	839	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
				841	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	842	while (i < size) {
				843	Py_UCS4 ch = s[i++];
				844	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	845	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	846	cbWritten++;
				847	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	848	else if (ch < 0x0800) {
				849	*p++ = 0xc0 \| (ch >> 6);
				850	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	851	cbWritten += 2;
				852	}
				853	else {
				854	/* Check for high surrogate */
				855	if (0xD800 <= ch && ch <= 0xDBFF) {
				856	if (i != size) {
				857	ch2 = s[i];
				858	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				859
				860	if (cbWritten >= (cbAllocated - 4)) {
				861	/* Provide enough room for some more
				862	surrogates */
				863	cbAllocated += 4*10;
				864	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	865	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	866	}
				867
				868	/* combine the two values */
				869	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				870
				871	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	872	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	873	i++;
				874	cbWritten += 4;
				875	}
				876	}
				877	}
				878	else {
				879	*p++ = (char)(0xe0 \| (ch >> 12));
				880	cbWritten += 3;
				881	}
				882	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				883	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	884	}
				885	}
				886	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	887	if (_PyString_Resize(&v, p - q))
				888	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	889	return v;
				890
				891	onError:
				892	Py_DECREF(v);
				893	return NULL;
				894	}
				895
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	896	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				897	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	898	if (!PyUnicode_Check(unicode)) {
				899	PyErr_BadArgument();
				900	return NULL;
				901	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	902	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				903	PyUnicode_GET_SIZE(unicode),
				904	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	905	}
				906
				907	/* --- UTF-16 Codec ------------------------------------------------------- */
				908
				909	static
				910	int utf16_decoding_error(const Py_UNICODE **source,
				911	Py_UNICODE **dest,
				912	const char *errors,
				913	const char *details)
				914	{
				915	if ((errors == NULL) \|\|
				916	(strcmp(errors,"strict") == 0)) {
				917	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	918	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	919	details);
				920	return -1;
				921	}
				922	else if (strcmp(errors,"ignore") == 0) {
				923	return 0;
				924	}
				925	else if (strcmp(errors,"replace") == 0) {
				926	if (dest) {
				927	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				928	(*dest)++;
				929	}
				930	return 0;
				931	}
				932	else {
				933	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	934	"UTF-16 decoding error; "
				935	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	errors);
				937	return -1;
				938	}
				939	}
				940
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	941	PyObject PyUnicode_DecodeUTF16(const char s,
				942	int size,
				943	const char *errors,
				944	int *byteorder)
				945	{
				946	PyUnicodeObject *unicode;
				947	Py_UNICODE *p;
				948	const Py_UNICODE q, e;
				949	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	950	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951
				952	/* size should be an even number */
				953	if (size % sizeof(Py_UNICODE) != 0) {
				954	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				955	return NULL;
				956	/* The remaining input chars are ignored if we fall through
				957	here... */
				958	}
				959
				960	/* Note: size will always be longer than the resulting Unicode
				961	character count */
				962	unicode = _PyUnicode_New(size);
				963	if (!unicode)
				964	return NULL;
				965	if (size == 0)
				966	return (PyObject *)unicode;
				967
				968	/* Unpack UTF-16 encoded data */
				969	p = unicode->str;
				970	q = (Py_UNICODE *)s;
				971	e = q + (size / sizeof(Py_UNICODE));
				972
				973	if (byteorder)
				974	bo = *byteorder;
				975
				976	while (q < e) {
				977	register Py_UNICODE ch = *q++;
				978
				979	/* Check for BOM marks (U+FEFF) in the input and adjust
				980	current byte order setting accordingly. Swap input
				981	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				982	!) */
				983	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				984	if (ch == 0xFEFF) {
				985	bo = -1;
				986	continue;
				987	} else if (ch == 0xFFFE) {
				988	bo = 1;
				989	continue;
				990	}
				991	if (bo == 1)
				992	ch = (ch >> 8) \| (ch << 8);
				993	#else
				994	if (ch == 0xFEFF) {
				995	bo = 1;
				996	continue;
				997	} else if (ch == 0xFFFE) {
				998	bo = -1;
				999	continue;
				1000	}
				1001	if (bo == -1)
				1002	ch = (ch >> 8) \| (ch << 8);
				1003	#endif
				1004	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1005	*p++ = ch;
				1006	continue;
				1007	}
				1008
				1009	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1010	if (q >= e) {
				1011	errmsg = "unexpected end of data";
				1012	goto utf16Error;
				1013	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1014	if (0xDC00 <= q && q <= 0xDFFF) {
				1015	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1016	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1017	/* This is valid data (a UTF-16 surrogate pair), but
				1018	we are not able to store this information since our
				1019	Py_UNICODE type only has 16 bits... this might
				1020	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1021	errmsg = "code pairs are not supported";
				1022	goto utf16Error;
				1023	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1024	else
				1025	continue;
				1026	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1027	errmsg = "illegal encoding";
				1028	/* Fall through to report the error */
				1029
				1030	utf16Error:
				1031	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1032	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1033	}
				1034
				1035	if (byteorder)
				1036	*byteorder = bo;
				1037
				1038	/* Adjust length */
				1039	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1040	goto onError;
				1041
				1042	return (PyObject *)unicode;
				1043
				1044	onError:
				1045	Py_DECREF(unicode);
				1046	return NULL;
				1047	}
				1048
				1049	#undef UTF16_ERROR
				1050
				1051	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1052	int size,
				1053	const char *errors,
				1054	int byteorder)
				1055	{
				1056	PyObject *v;
				1057	Py_UNICODE *p;
				1058	char *q;
				1059
				1060	/* We don't create UTF-16 pairs... */
				1061	v = PyString_FromStringAndSize(NULL,
				1062	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1063	if (v == NULL)
				1064	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1065
				1066	q = PyString_AS_STRING(v);
				1067	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1068	if (byteorder == 0)
				1069	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1070	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1071	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1072	if (byteorder == 0 \|\|
				1073	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1074	byteorder == -1
				1075	#else
				1076	byteorder == 1
				1077	#endif
				1078	)
				1079	memcpy(p, s, size * sizeof(Py_UNICODE));
				1080	else
				1081	while (size-- > 0) {
				1082	Py_UNICODE ch = *s++;
				1083	*p++ = (ch >> 8) \| (ch << 8);
				1084	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1085	return v;
				1086	}
				1087
				1088	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1089	{
				1090	if (!PyUnicode_Check(unicode)) {
				1091	PyErr_BadArgument();
				1092	return NULL;
				1093	}
				1094	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1095	PyUnicode_GET_SIZE(unicode),
				1096	NULL,
				1097	0);
				1098	}
				1099
				1100	/* --- Unicode Escape Codec ----------------------------------------------- */
				1101
				1102	static
				1103	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1104	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1105	const char *errors,
				1106	const char *details)
				1107	{
				1108	if ((errors == NULL) \|\|
				1109	(strcmp(errors,"strict") == 0)) {
				1110	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1111	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1112	details);
				1113	return -1;
				1114	}
				1115	else if (strcmp(errors,"ignore") == 0) {
				1116	return 0;
				1117	}
				1118	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1119	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1120	return 0;
				1121	}
				1122	else {
				1123	PyErr_Format(PyExc_ValueError,
				1124	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1125	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1126	errors);
				1127	return -1;
				1128	}
				1129	}
				1130
/* Unicode character name hash API used for \N{...} escapes.  Starts out
   NULL; PyUnicode_DecodeUnicodeEscape() fills it in from the "ucnhashAPI"
   attribute of the "ucnhash" module the first time it meets a \N escape. */
static _Py_UCNHashAPI *pucnHash = NULL;
				1132
				1133	static
				1134	int mystrnicmp(const char s1, const char s2, size_t count)
				1135	{
				1136	char c1, c2;
				1137
				1138	if (count)
				1139	{
				1140	do
				1141	{
				1142	c1 = tolower(*(s1++));
				1143	c2 = tolower(*(s2++));
				1144	}
				1145	while(--count && c1 == c2);
				1146
				1147	return c1 - c2;
				1148	}
				1149
				1150	return 0;
				1151	}
				1152
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1153	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1154	int size,
				1155	const char *errors)
				1156	{
				1157	PyUnicodeObject *v;
				1158	Py_UNICODE p = NULL, buf = NULL;
				1159	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1160	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161
				1162	/* Escaped strings will always be longer than the resulting
				1163	Unicode string, so we start with size here and then reduce the
				1164	length after conversion to the true value. */
				1165	v = _PyUnicode_New(size);
				1166	if (v == NULL)
				1167	goto onError;
				1168	if (size == 0)
				1169	return (PyObject *)v;
				1170	p = buf = PyUnicode_AS_UNICODE(v);
				1171	end = s + size;
				1172	while (s < end) {
				1173	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1174	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	int i;
				1176
				1177	/* Non-escape characters are interpreted as Unicode ordinals */
				1178	if (*s != '\\') {
				1179	p++ = (unsigned char)s++;
				1180	continue;
				1181	}
				1182
				1183	/* \ - Escapes */
				1184	s++;
				1185	switch (*s++) {
				1186
				1187	/* \x escapes */
				1188	case '\n': break;
				1189	case '\\': *p++ = '\\'; break;
				1190	case '\'': *p++ = '\''; break;
				1191	case '\"': *p++ = '\"'; break;
				1192	case 'b': *p++ = '\b'; break;
				1193	case 'f': p++ = '\014'; break; / FF */
				1194	case 't': *p++ = '\t'; break;
				1195	case 'n': *p++ = '\n'; break;
				1196	case 'r': *p++ = '\r'; break;
				1197	case 'v': p++ = '\013'; break; / VT */
				1198	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1199
				1200	/* \OOO (octal) escapes */
				1201	case '0': case '1': case '2': case '3':
				1202	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1203	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1204	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1205	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1206	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1207	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1208	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1209	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	break;
				1211
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1212	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1214	for (x = 0, i = 0; i < 2; i++) {
				1215	c = (unsigned char)s[i];
				1216	if (!isxdigit(c)) {
				1217	if (unicodeescape_decoding_error(&s, &x, errors,
				1218	"truncated \\xXX"))
				1219	goto onError;
				1220	i++;
				1221	break;
				1222	}
				1223	x = (x<<4) & ~0xF;
				1224	if (c >= '0' && c <= '9')
				1225	x += c - '0';
				1226	else if (c >= 'a' && c <= 'f')
				1227	x += 10 + c - 'a';
				1228	else
				1229	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1230	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1231	s += i;
				1232	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1233	break;
				1234
				1235	/* \uXXXX with 4 hex digits */
				1236	case 'u':
				1237	for (x = 0, i = 0; i < 4; i++) {
				1238	c = (unsigned char)s[i];
				1239	if (!isxdigit(c)) {
				1240	if (unicodeescape_decoding_error(&s, &x, errors,
				1241	"truncated \\uXXXX"))
				1242	goto onError;
				1243	i++;
				1244	break;
				1245	}
				1246	x = (x<<4) & ~0xF;
				1247	if (c >= '0' && c <= '9')
				1248	x += c - '0';
				1249	else if (c >= 'a' && c <= 'f')
				1250	x += 10 + c - 'a';
				1251	else
				1252	x += 10 + c - 'A';
				1253	}
				1254	s += i;
				1255	*p++ = x;
				1256	break;
				1257
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1258	/* \UXXXXXXXX with 8 hex digits */
				1259	case 'U':
				1260	for (chr = 0, i = 0; i < 8; i++) {
				1261	c = (unsigned char)s[i];
				1262	if (!isxdigit(c)) {
				1263	if (unicodeescape_decoding_error(&s, &x, errors,
				1264	"truncated \\uXXXX"))
				1265	goto onError;
				1266	i++;
				1267	break;
				1268	}
				1269	chr = (chr<<4) & ~0xF;
				1270	if (c >= '0' && c <= '9')
				1271	chr += c - '0';
				1272	else if (c >= 'a' && c <= 'f')
				1273	chr += 10 + c - 'a';
				1274	else
				1275	chr += 10 + c - 'A';
				1276	}
				1277	s += i;
				1278	goto store;
				1279
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1280	case 'N':
				1281	/* Ok, we need to deal with Unicode Character Names now,
				1282	* make sure we've imported the hash table data...
				1283	*/
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1284	if (pucnHash == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1285	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1286	mod = PyImport_ImportModule("ucnhash");
				1287	if (mod == NULL)
				1288	goto onError;
				1289	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1290	Py_DECREF(mod);
				1291	if (v == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1292	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1293	pucnHash = PyCObject_AsVoidPtr(v);
				1294	Py_DECREF(v);
				1295	if (pucnHash == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1296	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1297	}
				1298
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1299	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1300	const char *start = s + 1;
				1301	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1302	unsigned long j;
				1303
				1304	/* look for either the closing brace, or we
				1305	* exceed the maximum length of the unicode character names
				1306	*/
				1307	while (*endBrace != '}' &&
				1308	(unsigned int)(endBrace - start) <=
				1309	pucnHash->cchMax &&
				1310	endBrace < end)
				1311	{
				1312	endBrace++;
				1313	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1314	if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1315	j = pucnHash->hash(start, endBrace - start);
				1316	if (j > pucnHash->cKeys \|\|
				1317	mystrnicmp(
				1318	start,
				1319	((_Py_UnicodeCharacterName *)
				1320	(pucnHash->getValue(j)))->pszUCN,
				1321	(int)(endBrace - start)) != 0)
				1322	{
				1323	if (unicodeescape_decoding_error(
				1324	&s, &x, errors,
				1325	"Invalid Unicode Character Name"))
				1326	{
				1327	goto onError;
				1328	}
				1329	goto ucnFallthrough;
				1330	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1331	chr = ((_Py_UnicodeCharacterName *)
				1332	(pucnHash->getValue(j)))->value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1333	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1334	goto store;
				1335	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1336	if (unicodeescape_decoding_error(
				1337	&s, &x, errors,
				1338	"Unicode name missing closing brace"))
				1339	goto onError;
				1340	goto ucnFallthrough;
				1341	}
				1342	break;
				1343	}
				1344	if (unicodeescape_decoding_error(
				1345	&s, &x, errors,
				1346	"Missing opening brace for Unicode Character Name escape"))
				1347	goto onError;
				1348	ucnFallthrough:
				1349	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1350	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1351	*p++ = '\\';
				1352	*p++ = (unsigned char)s[-1];
				1353	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1354	store:
				1355	/* when we get here, chr is a 32-bit unicode character */
				1356	if (chr <= 0xffff)
				1357	/* UCS-2 character */
				1358	*p++ = (Py_UNICODE) chr;
				1359	else if (chr <= 0x10ffff) {
				1360	/* UCS-4 character. store as two surrogate characters */
				1361	chr -= 0x10000L;
				1362	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1363	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1364	} else {
				1365	if (unicodeescape_decoding_error(
				1366	&s, &x, errors,
				1367	"Illegal Unicode character")
				1368	)
				1369	goto onError;
				1370	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1371	}
				1372	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1373	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1374	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1375	return (PyObject *)v;
				1376
				1377	onError:
				1378	Py_XDECREF(v);
				1379	return NULL;
				1380	}
				1381
				1382	/* Return a Unicode-Escape string version of the Unicode object.
				1383
				1384	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1385	appropriate.
				1386
				1387	*/
				1388
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1389	static const Py_UNICODE findchar(const Py_UNICODE s,
				1390	int size,
				1391	Py_UNICODE ch);
				1392
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1393	static
				1394	PyObject unicodeescape_string(const Py_UNICODE s,
				1395	int size,
				1396	int quotes)
				1397	{
				1398	PyObject *repr;
				1399	char *p;
				1400	char *q;
				1401
				1402	static const char *hexdigit = "0123456789ABCDEF";
				1403
				1404	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1405	if (repr == NULL)
				1406	return NULL;
				1407
				1408	p = q = PyString_AS_STRING(repr);
				1409
				1410	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1411	*p++ = 'u';
				1412	*p++ = (findchar(s, size, '\'') &&
				1413	!findchar(s, size, '"')) ? '"' : '\'';
				1414	}
				1415	while (size-- > 0) {
				1416	Py_UNICODE ch = *s++;
				1417	/* Escape quotes */
				1418	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1419	*p++ = '\\';
				1420	*p++ = (char) ch;
				1421	}
				1422	/* Map 16-bit characters to '\uxxxx' */
				1423	else if (ch >= 256) {
				1424	*p++ = '\\';
				1425	*p++ = 'u';
				1426	*p++ = hexdigit[(ch >> 12) & 0xf];
				1427	*p++ = hexdigit[(ch >> 8) & 0xf];
				1428	*p++ = hexdigit[(ch >> 4) & 0xf];
				1429	*p++ = hexdigit[ch & 15];
				1430	}
				1431	/* Map non-printable US ASCII to '\ooo' */
				1432	else if (ch < ' ' \|\| ch >= 128) {
				1433	*p++ = '\\';
				1434	*p++ = hexdigit[(ch >> 6) & 7];
				1435	*p++ = hexdigit[(ch >> 3) & 7];
				1436	*p++ = hexdigit[ch & 7];
				1437	}
				1438	/* Copy everything else as-is */
				1439	else
				1440	*p++ = (char) ch;
				1441	}
				1442	if (quotes)
				1443	*p++ = q[1];
				1444
				1445	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1446	if (_PyString_Resize(&repr, p - q))
				1447	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1448
				1449	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1450
				1451	onError:
				1452	Py_DECREF(repr);
				1453	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1454	}
				1455
				1456	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1457	int size)
				1458	{
				1459	return unicodeescape_string(s, size, 0);
				1460	}
				1461
				1462	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1463	{
				1464	if (!PyUnicode_Check(unicode)) {
				1465	PyErr_BadArgument();
				1466	return NULL;
				1467	}
				1468	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1469	PyUnicode_GET_SIZE(unicode));
				1470	}
				1471
				1472	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1473
				1474	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1475	int size,
				1476	const char *errors)
				1477	{
				1478	PyUnicodeObject *v;
				1479	Py_UNICODE p, buf;
				1480	const char *end;
				1481	const char *bs;
				1482
				1483	/* Escaped strings will always be longer than the resulting
				1484	Unicode string, so we start with size here and then reduce the
				1485	length after conversion to the true value. */
				1486	v = _PyUnicode_New(size);
				1487	if (v == NULL)
				1488	goto onError;
				1489	if (size == 0)
				1490	return (PyObject *)v;
				1491	p = buf = PyUnicode_AS_UNICODE(v);
				1492	end = s + size;
				1493	while (s < end) {
				1494	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1495	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1496	int i;
				1497
				1498	/* Non-escape characters are interpreted as Unicode ordinals */
				1499	if (*s != '\\') {
				1500	p++ = (unsigned char)s++;
				1501	continue;
				1502	}
				1503
				1504	/* \u-escapes are only interpreted iff the number of leading
				1505	backslashes if odd */
				1506	bs = s;
				1507	for (;s < end;) {
				1508	if (*s != '\\')
				1509	break;
				1510	p++ = (unsigned char)s++;
				1511	}
				1512	if (((s - bs) & 1) == 0 \|\|
				1513	s >= end \|\|
				1514	*s != 'u') {
				1515	continue;
				1516	}
				1517	p--;
				1518	s++;
				1519
				1520	/* \uXXXX with 4 hex digits */
				1521	for (x = 0, i = 0; i < 4; i++) {
				1522	c = (unsigned char)s[i];
				1523	if (!isxdigit(c)) {
				1524	if (unicodeescape_decoding_error(&s, &x, errors,
				1525	"truncated \\uXXXX"))
				1526	goto onError;
				1527	i++;
				1528	break;
				1529	}
				1530	x = (x<<4) & ~0xF;
				1531	if (c >= '0' && c <= '9')
				1532	x += c - '0';
				1533	else if (c >= 'a' && c <= 'f')
				1534	x += 10 + c - 'a';
				1535	else
				1536	x += 10 + c - 'A';
				1537	}
				1538	s += i;
				1539	*p++ = x;
				1540	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1541	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1542	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1543	return (PyObject *)v;
				1544
				1545	onError:
				1546	Py_XDECREF(v);
				1547	return NULL;
				1548	}
				1549
				1550	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1551	int size)
				1552	{
				1553	PyObject *repr;
				1554	char *p;
				1555	char *q;
				1556
				1557	static const char *hexdigit = "0123456789ABCDEF";
				1558
				1559	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1560	if (repr == NULL)
				1561	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1562	if (size == 0)
				1563	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1564
				1565	p = q = PyString_AS_STRING(repr);
				1566	while (size-- > 0) {
				1567	Py_UNICODE ch = *s++;
				1568	/* Map 16-bit characters to '\uxxxx' */
				1569	if (ch >= 256) {
				1570	*p++ = '\\';
				1571	*p++ = 'u';
				1572	*p++ = hexdigit[(ch >> 12) & 0xf];
				1573	*p++ = hexdigit[(ch >> 8) & 0xf];
				1574	*p++ = hexdigit[(ch >> 4) & 0xf];
				1575	*p++ = hexdigit[ch & 15];
				1576	}
				1577	/* Copy everything else as-is */
				1578	else
				1579	*p++ = (char) ch;
				1580	}
				1581	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1582	if (_PyString_Resize(&repr, p - q))
				1583	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1584
				1585	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1586
				1587	onError:
				1588	Py_DECREF(repr);
				1589	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	}
				1591
				1592	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1593	{
				1594	if (!PyUnicode_Check(unicode)) {
				1595	PyErr_BadArgument();
				1596	return NULL;
				1597	}
				1598	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1599	PyUnicode_GET_SIZE(unicode));
				1600	}
				1601
				1602	/* --- Latin-1 Codec ------------------------------------------------------ */
				1603
				1604	PyObject PyUnicode_DecodeLatin1(const char s,
				1605	int size,
				1606	const char *errors)
				1607	{
				1608	PyUnicodeObject *v;
				1609	Py_UNICODE *p;
				1610
				1611	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1612	v = _PyUnicode_New(size);
				1613	if (v == NULL)
				1614	goto onError;
				1615	if (size == 0)
				1616	return (PyObject *)v;
				1617	p = PyUnicode_AS_UNICODE(v);
				1618	while (size-- > 0)
				1619	p++ = (unsigned char)s++;
				1620	return (PyObject *)v;
				1621
				1622	onError:
				1623	Py_XDECREF(v);
				1624	return NULL;
				1625	}
				1626
				1627	static
				1628	int latin1_encoding_error(const Py_UNICODE **source,
				1629	char **dest,
				1630	const char *errors,
				1631	const char *details)
				1632	{
				1633	if ((errors == NULL) \|\|
				1634	(strcmp(errors,"strict") == 0)) {
				1635	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1636	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1637	details);
				1638	return -1;
				1639	}
				1640	else if (strcmp(errors,"ignore") == 0) {
				1641	return 0;
				1642	}
				1643	else if (strcmp(errors,"replace") == 0) {
				1644	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1645	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1646	return 0;
				1647	}
				1648	else {
				1649	PyErr_Format(PyExc_ValueError,
				1650	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1651	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1652	errors);
				1653	return -1;
				1654	}
				1655	}
				1656
				1657	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1658	int size,
				1659	const char *errors)
				1660	{
				1661	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1662	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1663
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1664	repr = PyString_FromStringAndSize(NULL, size);
				1665	if (repr == NULL)
				1666	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1667	if (size == 0)
				1668	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1669
				1670	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1671	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1672	while (size-- > 0) {
				1673	Py_UNICODE ch = *p++;
				1674	if (ch >= 256) {
				1675	if (latin1_encoding_error(&p, &s, errors,
				1676	"ordinal not in range(256)"))
				1677	goto onError;
				1678	}
				1679	else
				1680	*s++ = (char)ch;
				1681	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1682	/* Resize if error handling skipped some characters */
				1683	if (s - start < PyString_GET_SIZE(repr))
				1684	if (_PyString_Resize(&repr, s - start))
				1685	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1686	return repr;
				1687
				1688	onError:
				1689	Py_DECREF(repr);
				1690	return NULL;
				1691	}
				1692
				1693	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1694	{
				1695	if (!PyUnicode_Check(unicode)) {
				1696	PyErr_BadArgument();
				1697	return NULL;
				1698	}
				1699	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1700	PyUnicode_GET_SIZE(unicode),
				1701	NULL);
				1702	}
				1703
				1704	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1705
				1706	static
				1707	int ascii_decoding_error(const char **source,
				1708	Py_UNICODE **dest,
				1709	const char *errors,
				1710	const char *details)
				1711	{
				1712	if ((errors == NULL) \|\|
				1713	(strcmp(errors,"strict") == 0)) {
				1714	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1715	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1716	details);
				1717	return -1;
				1718	}
				1719	else if (strcmp(errors,"ignore") == 0) {
				1720	return 0;
				1721	}
				1722	else if (strcmp(errors,"replace") == 0) {
				1723	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1724	(*dest)++;
				1725	return 0;
				1726	}
				1727	else {
				1728	PyErr_Format(PyExc_ValueError,
				1729	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1730	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	errors);
				1732	return -1;
				1733	}
				1734	}
				1735
				1736	PyObject PyUnicode_DecodeASCII(const char s,
				1737	int size,
				1738	const char *errors)
				1739	{
				1740	PyUnicodeObject *v;
				1741	Py_UNICODE *p;
				1742
				1743	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1744	v = _PyUnicode_New(size);
				1745	if (v == NULL)
				1746	goto onError;
				1747	if (size == 0)
				1748	return (PyObject *)v;
				1749	p = PyUnicode_AS_UNICODE(v);
				1750	while (size-- > 0) {
				1751	register unsigned char c;
				1752
				1753	c = (unsigned char)*s++;
				1754	if (c < 128)
				1755	*p++ = c;
				1756	else if (ascii_decoding_error(&s, &p, errors,
				1757	"ordinal not in range(128)"))
				1758	goto onError;
				1759	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1760	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1761	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1762	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	return (PyObject *)v;
				1764
				1765	onError:
				1766	Py_XDECREF(v);
				1767	return NULL;
				1768	}
				1769
				1770	static
				1771	int ascii_encoding_error(const Py_UNICODE **source,
				1772	char **dest,
				1773	const char *errors,
				1774	const char *details)
				1775	{
				1776	if ((errors == NULL) \|\|
				1777	(strcmp(errors,"strict") == 0)) {
				1778	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1779	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1780	details);
				1781	return -1;
				1782	}
				1783	else if (strcmp(errors,"ignore") == 0) {
				1784	return 0;
				1785	}
				1786	else if (strcmp(errors,"replace") == 0) {
				1787	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1788	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1789	return 0;
				1790	}
				1791	else {
				1792	PyErr_Format(PyExc_ValueError,
				1793	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1794	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	errors);
				1796	return -1;
				1797	}
				1798	}
				1799
				1800	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1801	int size,
				1802	const char *errors)
				1803	{
				1804	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1805	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1806
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1807	repr = PyString_FromStringAndSize(NULL, size);
				1808	if (repr == NULL)
				1809	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1810	if (size == 0)
				1811	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1812
				1813	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1814	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1815	while (size-- > 0) {
				1816	Py_UNICODE ch = *p++;
				1817	if (ch >= 128) {
				1818	if (ascii_encoding_error(&p, &s, errors,
				1819	"ordinal not in range(128)"))
				1820	goto onError;
				1821	}
				1822	else
				1823	*s++ = (char)ch;
				1824	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1825	/* Resize if error handling skipped some characters */
				1826	if (s - start < PyString_GET_SIZE(repr))
				1827	if (_PyString_Resize(&repr, s - start))
				1828	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1829	return repr;
				1830
				1831	onError:
				1832	Py_DECREF(repr);
				1833	return NULL;
				1834	}
				1835
				1836	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1837	{
				1838	if (!PyUnicode_Check(unicode)) {
				1839	PyErr_BadArgument();
				1840	return NULL;
				1841	}
				1842	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1843	PyUnicode_GET_SIZE(unicode),
				1844	NULL);
				1845	}
				1846
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1847	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1848
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1849	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1850
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1851	PyObject PyUnicode_DecodeMBCS(const char s,
				1852	int size,
				1853	const char *errors)
				1854	{
				1855	PyUnicodeObject *v;
				1856	Py_UNICODE *p;
				1857
				1858	/* First get the size of the result */
				1859	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1860	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1861	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1862
				1863	v = _PyUnicode_New(usize);
				1864	if (v == NULL)
				1865	return NULL;
				1866	if (usize == 0)
				1867	return (PyObject *)v;
				1868	p = PyUnicode_AS_UNICODE(v);
				1869	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1870	Py_DECREF(v);
				1871	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1872	}
				1873
				1874	return (PyObject *)v;
				1875	}
				1876
				1877	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1878	int size,
				1879	const char *errors)
				1880	{
				1881	PyObject *repr;
				1882	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1883	DWORD mbcssize;
				1884
				1885	/* If there are no characters, bail now! */
				1886	if (size==0)
				1887	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1888
				1889	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1890	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1891	if (mbcssize==0)
				1892	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1893
				1894	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1895	if (repr == NULL)
				1896	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1897	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1898	return repr;
				1899
				1900	/* Do the conversion */
				1901	s = PyString_AS_STRING(repr);
				1902	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1903	Py_DECREF(repr);
				1904	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1905	}
				1906	return repr;
				1907	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1908
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1909	#endif /* MS_WIN32 */
				1910
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1911	/* --- Character Mapping Codec -------------------------------------------- */
				1912
				1913	static
				1914	int charmap_decoding_error(const char **source,
				1915	Py_UNICODE **dest,
				1916	const char *errors,
				1917	const char *details)
				1918	{
				1919	if ((errors == NULL) \|\|
				1920	(strcmp(errors,"strict") == 0)) {
				1921	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1922	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1923	details);
				1924	return -1;
				1925	}
				1926	else if (strcmp(errors,"ignore") == 0) {
				1927	return 0;
				1928	}
				1929	else if (strcmp(errors,"replace") == 0) {
				1930	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1931	(*dest)++;
				1932	return 0;
				1933	}
				1934	else {
				1935	PyErr_Format(PyExc_ValueError,
				1936	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1937	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1938	errors);
				1939	return -1;
				1940	}
				1941	}
				1942
				1943	PyObject PyUnicode_DecodeCharmap(const char s,
				1944	int size,
				1945	PyObject *mapping,
				1946	const char *errors)
				1947	{
				1948	PyUnicodeObject *v;
				1949	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	1950	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1951
				1952	/* Default to Latin-1 */
				1953	if (mapping == NULL)
				1954	return PyUnicode_DecodeLatin1(s, size, errors);
				1955
				1956	v = _PyUnicode_New(size);
				1957	if (v == NULL)
				1958	goto onError;
				1959	if (size == 0)
				1960	return (PyObject *)v;
				1961	p = PyUnicode_AS_UNICODE(v);
				1962	while (size-- > 0) {
				1963	unsigned char ch = *s++;
				1964	PyObject w, x;
				1965
				1966	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1967	w = PyInt_FromLong((long)ch);
				1968	if (w == NULL)
				1969	goto onError;
				1970	x = PyObject_GetItem(mapping, w);
				1971	Py_DECREF(w);
				1972	if (x == NULL) {
				1973	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1974	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1975	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1976	x = Py_None;
				1977	Py_INCREF(x);
				1978	} else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1979	goto onError;
				1980	}
				1981
				1982	/* Apply mapping */
				1983	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1984	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1985	if (value < 0 \|\| value > 65535) {
				1986	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1987	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1988	Py_DECREF(x);
				1989	goto onError;
				1990	}
				1991	*p++ = (Py_UNICODE)value;
				1992	}
				1993	else if (x == Py_None) {
				1994	/* undefined mapping */
				1995	if (charmap_decoding_error(&s, &p, errors,
				1996	"character maps to <undefined>")) {
				1997	Py_DECREF(x);
				1998	goto onError;
				1999	}
				2000	}
				2001	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	2002	int targetsize = PyUnicode_GET_SIZE(x);
				2003
				2004	if (targetsize == 1)
				2005	/* 1-1 mapping */
				2006	p++ = PyUnicode_AS_UNICODE(x);
				2007
				2008	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2009	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	2010	if (targetsize > extrachars) {
				2011	/* resize first */
				2012	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2013	int needed = (targetsize - extrachars) + \
				2014	(targetsize << 2);
				2015	extrachars += needed;
				2016	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2017	Py_DECREF(x);
				2018	goto onError;
				2019	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	2020	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2021	}
				2022	Py_UNICODE_COPY(p,
				2023	PyUnicode_AS_UNICODE(x),
				2024	targetsize);
				2025	p += targetsize;
				2026	extrachars -= targetsize;
				2027	}
				2028	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2029	}
				2030	else {
				2031	/* wrong return value */
				2032	PyErr_SetString(PyExc_TypeError,
				2033	"character mapping must return integer, None or unicode");
				2034	Py_DECREF(x);
				2035	goto onError;
				2036	}
				2037	Py_DECREF(x);
				2038	}
				2039	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2040	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2041	goto onError;
				2042	return (PyObject *)v;
				2043
				2044	onError:
				2045	Py_XDECREF(v);
				2046	return NULL;
				2047	}
				2048
				2049	static
				2050	int charmap_encoding_error(const Py_UNICODE **source,
				2051	char **dest,
				2052	const char *errors,
				2053	const char *details)
				2054	{
				2055	if ((errors == NULL) \|\|
				2056	(strcmp(errors,"strict") == 0)) {
				2057	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2058	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2059	details);
				2060	return -1;
				2061	}
				2062	else if (strcmp(errors,"ignore") == 0) {
				2063	return 0;
				2064	}
				2065	else if (strcmp(errors,"replace") == 0) {
				2066	**dest = '?';
				2067	(*dest)++;
				2068	return 0;
				2069	}
				2070	else {
				2071	PyErr_Format(PyExc_ValueError,
				2072	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2073	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2074	errors);
				2075	return -1;
				2076	}
				2077	}
				2078
				2079	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2080	int size,
				2081	PyObject *mapping,
				2082	const char *errors)
				2083	{
				2084	PyObject *v;
				2085	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	2086	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2087
				2088	/* Default to Latin-1 */
				2089	if (mapping == NULL)
				2090	return PyUnicode_EncodeLatin1(p, size, errors);
				2091
				2092	v = PyString_FromStringAndSize(NULL, size);
				2093	if (v == NULL)
				2094	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2095	if (size == 0)
				2096	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2097	s = PyString_AS_STRING(v);
				2098	while (size-- > 0) {
				2099	Py_UNICODE ch = *p++;
				2100	PyObject w, x;
				2101
				2102	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2103	w = PyInt_FromLong((long)ch);
				2104	if (w == NULL)
				2105	goto onError;
				2106	x = PyObject_GetItem(mapping, w);
				2107	Py_DECREF(w);
				2108	if (x == NULL) {
				2109	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2110	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2111	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2112	x = Py_None;
				2113	Py_INCREF(x);
				2114	} else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2115	goto onError;
				2116	}
				2117
				2118	/* Apply mapping */
				2119	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2120	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2121	if (value < 0 \|\| value > 255) {
				2122	PyErr_SetString(PyExc_TypeError,
				2123	"character mapping must be in range(256)");
				2124	Py_DECREF(x);
				2125	goto onError;
				2126	}
				2127	*s++ = (char)value;
				2128	}
				2129	else if (x == Py_None) {
				2130	/* undefined mapping */
				2131	if (charmap_encoding_error(&p, &s, errors,
				2132	"character maps to <undefined>")) {
				2133	Py_DECREF(x);
				2134	goto onError;
				2135	}
				2136	}
				2137	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	2138	int targetsize = PyString_GET_SIZE(x);
				2139
				2140	if (targetsize == 1)
				2141	/* 1-1 mapping */
				2142	s++ = PyString_AS_STRING(x);
				2143
				2144	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2145	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	2146	if (targetsize > extrachars) {
				2147	/* resize first */
				2148	int oldpos = (int)(s - PyString_AS_STRING(v));
				2149	int needed = (targetsize - extrachars) + \
				2150	(targetsize << 2);
				2151	extrachars += needed;
				2152	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2153	Py_DECREF(x);
				2154	goto onError;
				2155	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame^]	2156	s = PyString_AS_STRING(v) + oldpos;
				2157	}
				2158	memcpy(s,
				2159	PyString_AS_STRING(x),
				2160	targetsize);
				2161	s += targetsize;
				2162	extrachars -= targetsize;
				2163	}
				2164	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2165	}
				2166	else {
				2167	/* wrong return value */
				2168	PyErr_SetString(PyExc_TypeError,
				2169	"character mapping must return integer, None or unicode");
				2170	Py_DECREF(x);
				2171	goto onError;
				2172	}
				2173	Py_DECREF(x);
				2174	}
				2175	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2176	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2177	goto onError;
				2178	return v;
				2179
				2180	onError:
				2181	Py_DECREF(v);
				2182	return NULL;
				2183	}
				2184
				2185	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2186	PyObject *mapping)
				2187	{
				2188	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2189	PyErr_BadArgument();
				2190	return NULL;
				2191	}
				2192	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2193	PyUnicode_GET_SIZE(unicode),
				2194	mapping,
				2195	NULL);
				2196	}
				2197
				2198	static
				2199	int translate_error(const Py_UNICODE **source,
				2200	Py_UNICODE **dest,
				2201	const char *errors,
				2202	const char *details)
				2203	{
				2204	if ((errors == NULL) \|\|
				2205	(strcmp(errors,"strict") == 0)) {
				2206	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2207	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2208	details);
				2209	return -1;
				2210	}
				2211	else if (strcmp(errors,"ignore") == 0) {
				2212	return 0;
				2213	}
				2214	else if (strcmp(errors,"replace") == 0) {
				2215	**dest = '?';
				2216	(*dest)++;
				2217	return 0;
				2218	}
				2219	else {
				2220	PyErr_Format(PyExc_ValueError,
				2221	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2222	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2223	errors);
				2224	return -1;
				2225	}
				2226	}
				2227
				2228	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2229	int size,
				2230	PyObject *mapping,
				2231	const char *errors)
				2232	{
				2233	PyUnicodeObject *v;
				2234	Py_UNICODE *p;
				2235
				2236	if (mapping == NULL) {
				2237	PyErr_BadArgument();
				2238	return NULL;
				2239	}
				2240
				2241	/* Output will never be longer than input */
				2242	v = _PyUnicode_New(size);
				2243	if (v == NULL)
				2244	goto onError;
				2245	if (size == 0)
				2246	goto done;
				2247	p = PyUnicode_AS_UNICODE(v);
				2248	while (size-- > 0) {
				2249	Py_UNICODE ch = *s++;
				2250	PyObject w, x;
				2251
				2252	/* Get mapping */
				2253	w = PyInt_FromLong(ch);
				2254	if (w == NULL)
				2255	goto onError;
				2256	x = PyObject_GetItem(mapping, w);
				2257	Py_DECREF(w);
				2258	if (x == NULL) {
				2259	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2260	/* No mapping found: default to 1-1 mapping */
				2261	PyErr_Clear();
				2262	*p++ = ch;
				2263	continue;
				2264	}
				2265	goto onError;
				2266	}
				2267
				2268	/* Apply mapping */
				2269	if (PyInt_Check(x))
				2270	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2271	else if (x == Py_None) {
				2272	/* undefined mapping */
				2273	if (translate_error(&s, &p, errors,
				2274	"character maps to <undefined>")) {
				2275	Py_DECREF(x);
				2276	goto onError;
				2277	}
				2278	}
				2279	else if (PyUnicode_Check(x)) {
				2280	if (PyUnicode_GET_SIZE(x) != 1) {
				2281	/* 1-n mapping */
				2282	PyErr_SetString(PyExc_NotImplementedError,
				2283	"1-n mappings are currently not implemented");
				2284	Py_DECREF(x);
				2285	goto onError;
				2286	}
				2287	p++ = PyUnicode_AS_UNICODE(x);
				2288	}
				2289	else {
				2290	/* wrong return value */
				2291	PyErr_SetString(PyExc_TypeError,
				2292	"translate mapping must return integer, None or unicode");
				2293	Py_DECREF(x);
				2294	goto onError;
				2295	}
				2296	Py_DECREF(x);
				2297	}
				2298	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2299	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2300	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2301
				2302	done:
				2303	return (PyObject *)v;
				2304
				2305	onError:
				2306	Py_XDECREF(v);
				2307	return NULL;
				2308	}
				2309
				2310	PyObject PyUnicode_Translate(PyObject str,
				2311	PyObject *mapping,
				2312	const char *errors)
				2313	{
				2314	PyObject *result;
				2315
				2316	str = PyUnicode_FromObject(str);
				2317	if (str == NULL)
				2318	goto onError;
				2319	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2320	PyUnicode_GET_SIZE(str),
				2321	mapping,
				2322	errors);
				2323	Py_DECREF(str);
				2324	return result;
				2325
				2326	onError:
				2327	Py_XDECREF(str);
				2328	return NULL;
				2329	}
				2330
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2331	/* --- Decimal Encoder ---------------------------------------------------- */
				2332
				2333	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2334	int length,
				2335	char *output,
				2336	const char *errors)
				2337	{
				2338	Py_UNICODE p, end;
				2339
				2340	if (output == NULL) {
				2341	PyErr_BadArgument();
				2342	return -1;
				2343	}
				2344
				2345	p = s;
				2346	end = s + length;
				2347	while (p < end) {
				2348	register Py_UNICODE ch = *p++;
				2349	int decimal;
				2350
				2351	if (Py_UNICODE_ISSPACE(ch)) {
				2352	*output++ = ' ';
				2353	continue;
				2354	}
				2355	decimal = Py_UNICODE_TODECIMAL(ch);
				2356	if (decimal >= 0) {
				2357	*output++ = '0' + decimal;
				2358	continue;
				2359	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2360	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2361	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2362	continue;
				2363	}
				2364	/* All other characters are considered invalid */
				2365	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2366	PyErr_SetString(PyExc_ValueError,
				2367	"invalid decimal Unicode string");
				2368	goto onError;
				2369	}
				2370	else if (strcmp(errors, "ignore") == 0)
				2371	continue;
				2372	else if (strcmp(errors, "replace") == 0) {
				2373	*output++ = '?';
				2374	continue;
				2375	}
				2376	}
				2377	/* 0-terminate the output string */
				2378	*output++ = '\0';
				2379	return 0;
				2380
				2381	onError:
				2382	return -1;
				2383	}
				2384
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2385	/* --- Helpers ------------------------------------------------------------ */
				2386
				2387	static
				2388	int count(PyUnicodeObject *self,
				2389	int start,
				2390	int end,
				2391	PyUnicodeObject *substring)
				2392	{
				2393	int count = 0;
				2394
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2395	if (substring->length == 0)
				2396	return (end - start + 1);
				2397
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2398	end -= substring->length;
				2399
				2400	while (start <= end)
				2401	if (Py_UNICODE_MATCH(self, start, substring)) {
				2402	count++;
				2403	start += substring->length;
				2404	} else
				2405	start++;
				2406
				2407	return count;
				2408	}
				2409
				2410	int PyUnicode_Count(PyObject *str,
				2411	PyObject *substr,
				2412	int start,
				2413	int end)
				2414	{
				2415	int result;
				2416
				2417	str = PyUnicode_FromObject(str);
				2418	if (str == NULL)
				2419	return -1;
				2420	substr = PyUnicode_FromObject(substr);
				2421	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2422	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2423	return -1;
				2424	}
				2425
				2426	result = count((PyUnicodeObject *)str,
				2427	start, end,
				2428	(PyUnicodeObject *)substr);
				2429
				2430	Py_DECREF(str);
				2431	Py_DECREF(substr);
				2432	return result;
				2433	}
				2434
				2435	static
				2436	int findstring(PyUnicodeObject *self,
				2437	PyUnicodeObject *substring,
				2438	int start,
				2439	int end,
				2440	int direction)
				2441	{
				2442	if (start < 0)
				2443	start += self->length;
				2444	if (start < 0)
				2445	start = 0;
				2446
				2447	if (substring->length == 0)
				2448	return start;
				2449
				2450	if (end > self->length)
				2451	end = self->length;
				2452	if (end < 0)
				2453	end += self->length;
				2454	if (end < 0)
				2455	end = 0;
				2456
				2457	end -= substring->length;
				2458
				2459	if (direction < 0) {
				2460	for (; end >= start; end--)
				2461	if (Py_UNICODE_MATCH(self, end, substring))
				2462	return end;
				2463	} else {
				2464	for (; start <= end; start++)
				2465	if (Py_UNICODE_MATCH(self, start, substring))
				2466	return start;
				2467	}
				2468
				2469	return -1;
				2470	}
				2471
				2472	int PyUnicode_Find(PyObject *str,
				2473	PyObject *substr,
				2474	int start,
				2475	int end,
				2476	int direction)
				2477	{
				2478	int result;
				2479
				2480	str = PyUnicode_FromObject(str);
				2481	if (str == NULL)
				2482	return -1;
				2483	substr = PyUnicode_FromObject(substr);
				2484	if (substr == NULL) {
				2485	Py_DECREF(substr);
				2486	return -1;
				2487	}
				2488
				2489	result = findstring((PyUnicodeObject *)str,
				2490	(PyUnicodeObject *)substr,
				2491	start, end, direction);
				2492	Py_DECREF(str);
				2493	Py_DECREF(substr);
				2494	return result;
				2495	}
				2496
				2497	static
				2498	int tailmatch(PyUnicodeObject *self,
				2499	PyUnicodeObject *substring,
				2500	int start,
				2501	int end,
				2502	int direction)
				2503	{
				2504	if (start < 0)
				2505	start += self->length;
				2506	if (start < 0)
				2507	start = 0;
				2508
				2509	if (substring->length == 0)
				2510	return 1;
				2511
				2512	if (end > self->length)
				2513	end = self->length;
				2514	if (end < 0)
				2515	end += self->length;
				2516	if (end < 0)
				2517	end = 0;
				2518
				2519	end -= substring->length;
				2520	if (end < start)
				2521	return 0;
				2522
				2523	if (direction > 0) {
				2524	if (Py_UNICODE_MATCH(self, end, substring))
				2525	return 1;
				2526	} else {
				2527	if (Py_UNICODE_MATCH(self, start, substring))
				2528	return 1;
				2529	}
				2530
				2531	return 0;
				2532	}
				2533
				2534	int PyUnicode_Tailmatch(PyObject *str,
				2535	PyObject *substr,
				2536	int start,
				2537	int end,
				2538	int direction)
				2539	{
				2540	int result;
				2541
				2542	str = PyUnicode_FromObject(str);
				2543	if (str == NULL)
				2544	return -1;
				2545	substr = PyUnicode_FromObject(substr);
				2546	if (substr == NULL) {
				2547	Py_DECREF(substr);
				2548	return -1;
				2549	}
				2550
				2551	result = tailmatch((PyUnicodeObject *)str,
				2552	(PyUnicodeObject *)substr,
				2553	start, end, direction);
				2554	Py_DECREF(str);
				2555	Py_DECREF(substr);
				2556	return result;
				2557	}
				2558
				2559	static
				2560	const Py_UNICODE findchar(const Py_UNICODE s,
				2561	int size,
				2562	Py_UNICODE ch)
				2563	{
				2564	/* like wcschr, but doesn't stop at NULL characters */
				2565
				2566	while (size-- > 0) {
				2567	if (*s == ch)
				2568	return s;
				2569	s++;
				2570	}
				2571
				2572	return NULL;
				2573	}
				2574
				2575	/* Apply fixfct filter to the Unicode object self and return a
				2576	reference to the modified object */
				2577
				2578	static
				2579	PyObject fixup(PyUnicodeObject self,
				2580	int (fixfct)(PyUnicodeObject s))
				2581	{
				2582
				2583	PyUnicodeObject *u;
				2584
				2585	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2586	self->length);
				2587	if (u == NULL)
				2588	return NULL;
				2589	if (!fixfct(u)) {
				2590	/* fixfct should return TRUE if it modified the buffer. If
				2591	FALSE, return a reference to the original buffer instead
				2592	(to save space, not time) */
				2593	Py_INCREF(self);
				2594	Py_DECREF(u);
				2595	return (PyObject*) self;
				2596	}
				2597	return (PyObject*) u;
				2598	}
				2599
				2600	static
				2601	int fixupper(PyUnicodeObject *self)
				2602	{
				2603	int len = self->length;
				2604	Py_UNICODE *s = self->str;
				2605	int status = 0;
				2606
				2607	while (len-- > 0) {
				2608	register Py_UNICODE ch;
				2609
				2610	ch = Py_UNICODE_TOUPPER(*s);
				2611	if (ch != *s) {
				2612	status = 1;
				2613	*s = ch;
				2614	}
				2615	s++;
				2616	}
				2617
				2618	return status;
				2619	}
				2620
				2621	static
				2622	int fixlower(PyUnicodeObject *self)
				2623	{
				2624	int len = self->length;
				2625	Py_UNICODE *s = self->str;
				2626	int status = 0;
				2627
				2628	while (len-- > 0) {
				2629	register Py_UNICODE ch;
				2630
				2631	ch = Py_UNICODE_TOLOWER(*s);
				2632	if (ch != *s) {
				2633	status = 1;
				2634	*s = ch;
				2635	}
				2636	s++;
				2637	}
				2638
				2639	return status;
				2640	}
				2641
				2642	static
				2643	int fixswapcase(PyUnicodeObject *self)
				2644	{
				2645	int len = self->length;
				2646	Py_UNICODE *s = self->str;
				2647	int status = 0;
				2648
				2649	while (len-- > 0) {
				2650	if (Py_UNICODE_ISUPPER(*s)) {
				2651	s = Py_UNICODE_TOLOWER(s);
				2652	status = 1;
				2653	} else if (Py_UNICODE_ISLOWER(*s)) {
				2654	s = Py_UNICODE_TOUPPER(s);
				2655	status = 1;
				2656	}
				2657	s++;
				2658	}
				2659
				2660	return status;
				2661	}
				2662
				2663	static
				2664	int fixcapitalize(PyUnicodeObject *self)
				2665	{
				2666	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2667	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2668	return 1;
				2669	}
				2670	return 0;
				2671	}
				2672
				2673	static
				2674	int fixtitle(PyUnicodeObject *self)
				2675	{
				2676	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2677	register Py_UNICODE *e;
				2678	int previous_is_cased;
				2679
				2680	/* Shortcut for single character strings */
				2681	if (PyUnicode_GET_SIZE(self) == 1) {
				2682	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2683	if (*p != ch) {
				2684	*p = ch;
				2685	return 1;
				2686	}
				2687	else
				2688	return 0;
				2689	}
				2690
				2691	e = p + PyUnicode_GET_SIZE(self);
				2692	previous_is_cased = 0;
				2693	for (; p < e; p++) {
				2694	register const Py_UNICODE ch = *p;
				2695
				2696	if (previous_is_cased)
				2697	*p = Py_UNICODE_TOLOWER(ch);
				2698	else
				2699	*p = Py_UNICODE_TOTITLE(ch);
				2700
				2701	if (Py_UNICODE_ISLOWER(ch) \|\|
				2702	Py_UNICODE_ISUPPER(ch) \|\|
				2703	Py_UNICODE_ISTITLE(ch))
				2704	previous_is_cased = 1;
				2705	else
				2706	previous_is_cased = 0;
				2707	}
				2708	return 1;
				2709	}
				2710
				2711	PyObject PyUnicode_Join(PyObject separator,
				2712	PyObject *seq)
				2713	{
				2714	Py_UNICODE *sep;
				2715	int seplen;
				2716	PyUnicodeObject *res = NULL;
				2717	int reslen = 0;
				2718	Py_UNICODE *p;
				2719	int seqlen = 0;
				2720	int sz = 100;
				2721	int i;
				2722
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2723	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2724	if (seqlen < 0 && PyErr_Occurred())
				2725	return NULL;
				2726
				2727	if (separator == NULL) {
				2728	Py_UNICODE blank = ' ';
				2729	sep = &blank;
				2730	seplen = 1;
				2731	}
				2732	else {
				2733	separator = PyUnicode_FromObject(separator);
				2734	if (separator == NULL)
				2735	return NULL;
				2736	sep = PyUnicode_AS_UNICODE(separator);
				2737	seplen = PyUnicode_GET_SIZE(separator);
				2738	}
				2739
				2740	res = _PyUnicode_New(sz);
				2741	if (res == NULL)
				2742	goto onError;
				2743	p = PyUnicode_AS_UNICODE(res);
				2744	reslen = 0;
				2745
				2746	for (i = 0; i < seqlen; i++) {
				2747	int itemlen;
				2748	PyObject *item;
				2749
				2750	item = PySequence_GetItem(seq, i);
				2751	if (item == NULL)
				2752	goto onError;
				2753	if (!PyUnicode_Check(item)) {
				2754	PyObject *v;
				2755	v = PyUnicode_FromObject(item);
				2756	Py_DECREF(item);
				2757	item = v;
				2758	if (item == NULL)
				2759	goto onError;
				2760	}
				2761	itemlen = PyUnicode_GET_SIZE(item);
				2762	while (reslen + itemlen + seplen >= sz) {
				2763	if (_PyUnicode_Resize(res, sz*2))
				2764	goto onError;
				2765	sz *= 2;
				2766	p = PyUnicode_AS_UNICODE(res) + reslen;
				2767	}
				2768	if (i > 0) {
				2769	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2770	p += seplen;
				2771	reslen += seplen;
				2772	}
				2773	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2774	p += itemlen;
				2775	reslen += itemlen;
				2776	Py_DECREF(item);
				2777	}
				2778	if (_PyUnicode_Resize(res, reslen))
				2779	goto onError;
				2780
				2781	Py_XDECREF(separator);
				2782	return (PyObject *)res;
				2783
				2784	onError:
				2785	Py_XDECREF(separator);
				2786	Py_DECREF(res);
				2787	return NULL;
				2788	}
				2789
				2790	static
				2791	PyUnicodeObject pad(PyUnicodeObject self,
				2792	int left,
				2793	int right,
				2794	Py_UNICODE fill)
				2795	{
				2796	PyUnicodeObject *u;
				2797
				2798	if (left < 0)
				2799	left = 0;
				2800	if (right < 0)
				2801	right = 0;
				2802
				2803	if (left == 0 && right == 0) {
				2804	Py_INCREF(self);
				2805	return self;
				2806	}
				2807
				2808	u = _PyUnicode_New(left + self->length + right);
				2809	if (u) {
				2810	if (left)
				2811	Py_UNICODE_FILL(u->str, fill, left);
				2812	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2813	if (right)
				2814	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2815	}
				2816
				2817	return u;
				2818	}
				2819
				2820	#define SPLIT_APPEND(data, left, right) \
				2821	str = PyUnicode_FromUnicode(data + left, right - left); \
				2822	if (!str) \
				2823	goto onError; \
				2824	if (PyList_Append(list, str)) { \
				2825	Py_DECREF(str); \
				2826	goto onError; \
				2827	} \
				2828	else \
				2829	Py_DECREF(str);
				2830
				2831	static
				2832	PyObject split_whitespace(PyUnicodeObject self,
				2833	PyObject *list,
				2834	int maxcount)
				2835	{
				2836	register int i;
				2837	register int j;
				2838	int len = self->length;
				2839	PyObject *str;
				2840
				2841	for (i = j = 0; i < len; ) {
				2842	/* find a token */
				2843	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2844	i++;
				2845	j = i;
				2846	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2847	i++;
				2848	if (j < i) {
				2849	if (maxcount-- <= 0)
				2850	break;
				2851	SPLIT_APPEND(self->str, j, i);
				2852	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2853	i++;
				2854	j = i;
				2855	}
				2856	}
				2857	if (j < len) {
				2858	SPLIT_APPEND(self->str, j, len);
				2859	}
				2860	return list;
				2861
				2862	onError:
				2863	Py_DECREF(list);
				2864	return NULL;
				2865	}
				2866
				2867	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2868	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2869	{
				2870	register int i;
				2871	register int j;
				2872	int len;
				2873	PyObject *list;
				2874	PyObject *str;
				2875	Py_UNICODE *data;
				2876
				2877	string = PyUnicode_FromObject(string);
				2878	if (string == NULL)
				2879	return NULL;
				2880	data = PyUnicode_AS_UNICODE(string);
				2881	len = PyUnicode_GET_SIZE(string);
				2882
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2883	list = PyList_New(0);
				2884	if (!list)
				2885	goto onError;
				2886
				2887	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2888	int eol;
				2889
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2890	/* Find a line and append it */
				2891	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2892	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2893
				2894	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2895	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2896	if (i < len) {
				2897	if (data[i] == '\r' && i + 1 < len &&
				2898	data[i+1] == '\n')
				2899	i += 2;
				2900	else
				2901	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2902	if (keepends)
				2903	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2904	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2905	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2906	j = i;
				2907	}
				2908	if (j < len) {
				2909	SPLIT_APPEND(data, j, len);
				2910	}
				2911
				2912	Py_DECREF(string);
				2913	return list;
				2914
				2915	onError:
				2916	Py_DECREF(list);
				2917	Py_DECREF(string);
				2918	return NULL;
				2919	}
				2920
				2921	static
				2922	PyObject split_char(PyUnicodeObject self,
				2923	PyObject *list,
				2924	Py_UNICODE ch,
				2925	int maxcount)
				2926	{
				2927	register int i;
				2928	register int j;
				2929	int len = self->length;
				2930	PyObject *str;
				2931
				2932	for (i = j = 0; i < len; ) {
				2933	if (self->str[i] == ch) {
				2934	if (maxcount-- <= 0)
				2935	break;
				2936	SPLIT_APPEND(self->str, j, i);
				2937	i = j = i + 1;
				2938	} else
				2939	i++;
				2940	}
				2941	if (j <= len) {
				2942	SPLIT_APPEND(self->str, j, len);
				2943	}
				2944	return list;
				2945
				2946	onError:
				2947	Py_DECREF(list);
				2948	return NULL;
				2949	}
				2950
				2951	static
				2952	PyObject split_substring(PyUnicodeObject self,
				2953	PyObject *list,
				2954	PyUnicodeObject *substring,
				2955	int maxcount)
				2956	{
				2957	register int i;
				2958	register int j;
				2959	int len = self->length;
				2960	int sublen = substring->length;
				2961	PyObject *str;
				2962
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2963	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2964	if (Py_UNICODE_MATCH(self, i, substring)) {
				2965	if (maxcount-- <= 0)
				2966	break;
				2967	SPLIT_APPEND(self->str, j, i);
				2968	i = j = i + sublen;
				2969	} else
				2970	i++;
				2971	}
				2972	if (j <= len) {
				2973	SPLIT_APPEND(self->str, j, len);
				2974	}
				2975	return list;
				2976
				2977	onError:
				2978	Py_DECREF(list);
				2979	return NULL;
				2980	}
				2981
				2982	#undef SPLIT_APPEND
				2983
				2984	static
				2985	PyObject split(PyUnicodeObject self,
				2986	PyUnicodeObject *substring,
				2987	int maxcount)
				2988	{
				2989	PyObject *list;
				2990
				2991	if (maxcount < 0)
				2992	maxcount = INT_MAX;
				2993
				2994	list = PyList_New(0);
				2995	if (!list)
				2996	return NULL;
				2997
				2998	if (substring == NULL)
				2999	return split_whitespace(self,list,maxcount);
				3000
				3001	else if (substring->length == 1)
				3002	return split_char(self,list,substring->str[0],maxcount);
				3003
				3004	else if (substring->length == 0) {
				3005	Py_DECREF(list);
				3006	PyErr_SetString(PyExc_ValueError, "empty separator");
				3007	return NULL;
				3008	}
				3009	else
				3010	return split_substring(self,list,substring,maxcount);
				3011	}
				3012
				3013	static
				3014	PyObject strip(PyUnicodeObject self,
				3015	int left,
				3016	int right)
				3017	{
				3018	Py_UNICODE *p = self->str;
				3019	int start = 0;
				3020	int end = self->length;
				3021
				3022	if (left)
				3023	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3024	start++;
				3025
				3026	if (right)
				3027	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3028	end--;
				3029
				3030	if (start == 0 && end == self->length) {
				3031	/* couldn't strip anything off, return original string */
				3032	Py_INCREF(self);
				3033	return (PyObject*) self;
				3034	}
				3035
				3036	return (PyObject*) PyUnicode_FromUnicode(
				3037	self->str + start,
				3038	end - start
				3039	);
				3040	}
				3041
				3042	static
				3043	PyObject replace(PyUnicodeObject self,
				3044	PyUnicodeObject *str1,
				3045	PyUnicodeObject *str2,
				3046	int maxcount)
				3047	{
				3048	PyUnicodeObject *u;
				3049
				3050	if (maxcount < 0)
				3051	maxcount = INT_MAX;
				3052
				3053	if (str1->length == 1 && str2->length == 1) {
				3054	int i;
				3055
				3056	/* replace characters */
				3057	if (!findchar(self->str, self->length, str1->str[0])) {
				3058	/* nothing to replace, return original string */
				3059	Py_INCREF(self);
				3060	u = self;
				3061	} else {
				3062	Py_UNICODE u1 = str1->str[0];
				3063	Py_UNICODE u2 = str2->str[0];
				3064
				3065	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3066	self->str,
				3067	self->length
				3068	);
				3069	if (u)
				3070	for (i = 0; i < u->length; i++)
				3071	if (u->str[i] == u1) {
				3072	if (--maxcount < 0)
				3073	break;
				3074	u->str[i] = u2;
				3075	}
				3076	}
				3077
				3078	} else {
				3079	int n, i;
				3080	Py_UNICODE *p;
				3081
				3082	/* replace strings */
				3083	n = count(self, 0, self->length, str1);
				3084	if (n > maxcount)
				3085	n = maxcount;
				3086	if (n == 0) {
				3087	/* nothing to replace, return original string */
				3088	Py_INCREF(self);
				3089	u = self;
				3090	} else {
				3091	u = _PyUnicode_New(
				3092	self->length + n * (str2->length - str1->length));
				3093	if (u) {
				3094	i = 0;
				3095	p = u->str;
				3096	while (i <= self->length - str1->length)
				3097	if (Py_UNICODE_MATCH(self, i, str1)) {
				3098	/* replace string segment */
				3099	Py_UNICODE_COPY(p, str2->str, str2->length);
				3100	p += str2->length;
				3101	i += str1->length;
				3102	if (--n <= 0) {
				3103	/* copy remaining part */
				3104	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3105	break;
				3106	}
				3107	} else
				3108	*p++ = self->str[i++];
				3109	}
				3110	}
				3111	}
				3112
				3113	return (PyObject *) u;
				3114	}
				3115
				3116	/* --- Unicode Object Methods --------------------------------------------- */
				3117
				3118	static char title__doc__[] =
				3119	"S.title() -> unicode\n\
				3120	\n\
				3121	Return a titlecased version of S, i.e. words start with title case\n\
				3122	characters, all remaining cased characters have lower case.";
				3123
				3124	static PyObject*
				3125	unicode_title(PyUnicodeObject self, PyObject args)
				3126	{
				3127	if (!PyArg_NoArgs(args))
				3128	return NULL;
				3129	return fixup(self, fixtitle);
				3130	}
				3131
				3132	static char capitalize__doc__[] =
				3133	"S.capitalize() -> unicode\n\
				3134	\n\
				3135	Return a capitalized version of S, i.e. make the first character\n\
				3136	have upper case.";
				3137
				3138	static PyObject*
				3139	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3140	{
				3141	if (!PyArg_NoArgs(args))
				3142	return NULL;
				3143	return fixup(self, fixcapitalize);
				3144	}
				3145
				3146	#if 0
				3147	static char capwords__doc__[] =
				3148	"S.capwords() -> unicode\n\
				3149	\n\
				3150	Apply .capitalize() to all words in S and return the result with\n\
				3151	normalized whitespace (all whitespace strings are replaced by ' ').";
				3152
				3153	static PyObject*
				3154	unicode_capwords(PyUnicodeObject self, PyObject args)
				3155	{
				3156	PyObject *list;
				3157	PyObject *item;
				3158	int i;
				3159
				3160	if (!PyArg_NoArgs(args))
				3161	return NULL;
				3162
				3163	/* Split into words */
				3164	list = split(self, NULL, -1);
				3165	if (!list)
				3166	return NULL;
				3167
				3168	/* Capitalize each word */
				3169	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3170	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3171	fixcapitalize);
				3172	if (item == NULL)
				3173	goto onError;
				3174	Py_DECREF(PyList_GET_ITEM(list, i));
				3175	PyList_SET_ITEM(list, i, item);
				3176	}
				3177
				3178	/* Join the words to form a new string */
				3179	item = PyUnicode_Join(NULL, list);
				3180
				3181	onError:
				3182	Py_DECREF(list);
				3183	return (PyObject *)item;
				3184	}
				3185	#endif
				3186
				3187	static char center__doc__[] =
				3188	"S.center(width) -> unicode\n\
				3189	\n\
				3190	Return S centered in a Unicode string of length width. Padding is done\n\
				3191	using spaces.";
				3192
				3193	static PyObject *
				3194	unicode_center(PyUnicodeObject self, PyObject args)
				3195	{
				3196	int marg, left;
				3197	int width;
				3198
				3199	if (!PyArg_ParseTuple(args, "i:center", &width))
				3200	return NULL;
				3201
				3202	if (self->length >= width) {
				3203	Py_INCREF(self);
				3204	return (PyObject*) self;
				3205	}
				3206
				3207	marg = width - self->length;
				3208	left = marg / 2 + (marg & width & 1);
				3209
				3210	return (PyObject*) pad(self, left, marg - left, ' ');
				3211	}
				3212
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3213	#if 0
				3214
				3215	/* This code should go into some future Unicode collation support
				3216	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3217	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3218
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3219	/* speedy UTF-16 code point order comparison */
				3220	/* gleaned from: */
				3221	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3222
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3223	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3224	{
				3225	0, 0, 0, 0, 0, 0, 0, 0,
				3226	0, 0, 0, 0, 0, 0, 0, 0,
				3227	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3228	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3229	};
				3230
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3231	static int
				3232	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3233	{
				3234	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3235
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3236	Py_UNICODE *s1 = str1->str;
				3237	Py_UNICODE *s2 = str2->str;
				3238
				3239	len1 = str1->length;
				3240	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3241
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3242	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3243	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3244	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3245
				3246	c1 = *s1++;
				3247	c2 = *s2++;
				3248	if (c1 > (1<<11) * 26)
				3249	c1 += utf16Fixup[c1>>11];
				3250	if (c2 > (1<<11) * 26)
				3251	c2 += utf16Fixup[c2>>11];
				3252
				3253	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3254	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3255	if (diff)
				3256	return (diff < 0) ? -1 : (diff != 0);
				3257	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3258	}
				3259
				3260	return (len1 < len2) ? -1 : (len1 != len2);
				3261	}
				3262
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3263	#else
				3264
				3265	static int
				3266	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3267	{
				3268	register int len1, len2;
				3269
				3270	Py_UNICODE *s1 = str1->str;
				3271	Py_UNICODE *s2 = str2->str;
				3272
				3273	len1 = str1->length;
				3274	len2 = str2->length;
				3275
				3276	while (len1 > 0 && len2 > 0) {
				3277	register long diff;
				3278
				3279	diff = (long)s1++ - (long)s2++;
				3280	if (diff)
				3281	return (diff < 0) ? -1 : (diff != 0);
				3282	len1--; len2--;
				3283	}
				3284
				3285	return (len1 < len2) ? -1 : (len1 != len2);
				3286	}
				3287
				3288	#endif
				3289
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3290	int PyUnicode_Compare(PyObject *left,
				3291	PyObject *right)
				3292	{
				3293	PyUnicodeObject u = NULL, v = NULL;
				3294	int result;
				3295
				3296	/* Coerce the two arguments */
				3297	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3298	if (u == NULL)
				3299	goto onError;
				3300	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3301	if (v == NULL)
				3302	goto onError;
				3303
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3304	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3305	if (v == u) {
				3306	Py_DECREF(u);
				3307	Py_DECREF(v);
				3308	return 0;
				3309	}
				3310
				3311	result = unicode_compare(u, v);
				3312
				3313	Py_DECREF(u);
				3314	Py_DECREF(v);
				3315	return result;
				3316
				3317	onError:
				3318	Py_XDECREF(u);
				3319	Py_XDECREF(v);
				3320	return -1;
				3321	}
				3322
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3323	int PyUnicode_Contains(PyObject *container,
				3324	PyObject *element)
				3325	{
				3326	PyUnicodeObject u = NULL, v = NULL;
				3327	int result;
				3328	register const Py_UNICODE p, e;
				3329	register Py_UNICODE ch;
				3330
				3331	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3332	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3333	if (v == NULL) {
				3334	PyErr_SetString(PyExc_TypeError,
				3335	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3336	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3337	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3338	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3339	if (u == NULL) {
				3340	Py_DECREF(v);
				3341	goto onError;
				3342	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3343
				3344	/* Check v in u */
				3345	if (PyUnicode_GET_SIZE(v) != 1) {
				3346	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3347	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3348	goto onError;
				3349	}
				3350	ch = *PyUnicode_AS_UNICODE(v);
				3351	p = PyUnicode_AS_UNICODE(u);
				3352	e = p + PyUnicode_GET_SIZE(u);
				3353	result = 0;
				3354	while (p < e) {
				3355	if (*p++ == ch) {
				3356	result = 1;
				3357	break;
				3358	}
				3359	}
				3360
				3361	Py_DECREF(u);
				3362	Py_DECREF(v);
				3363	return result;
				3364
				3365	onError:
				3366	Py_XDECREF(u);
				3367	Py_XDECREF(v);
				3368	return -1;
				3369	}
				3370
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3371	/* Concat to string or Unicode object giving a new Unicode object. */
				3372
				3373	PyObject PyUnicode_Concat(PyObject left,
				3374	PyObject *right)
				3375	{
				3376	PyUnicodeObject u = NULL, v = NULL, *w;
				3377
				3378	/* Coerce the two arguments */
				3379	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3380	if (u == NULL)
				3381	goto onError;
				3382	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3383	if (v == NULL)
				3384	goto onError;
				3385
				3386	/* Shortcuts */
				3387	if (v == unicode_empty) {
				3388	Py_DECREF(v);
				3389	return (PyObject *)u;
				3390	}
				3391	if (u == unicode_empty) {
				3392	Py_DECREF(u);
				3393	return (PyObject *)v;
				3394	}
				3395
				3396	/* Concat the two Unicode strings */
				3397	w = _PyUnicode_New(u->length + v->length);
				3398	if (w == NULL)
				3399	goto onError;
				3400	Py_UNICODE_COPY(w->str, u->str, u->length);
				3401	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3402
				3403	Py_DECREF(u);
				3404	Py_DECREF(v);
				3405	return (PyObject *)w;
				3406
				3407	onError:
				3408	Py_XDECREF(u);
				3409	Py_XDECREF(v);
				3410	return NULL;
				3411	}
				3412
				3413	static char count__doc__[] =
				3414	"S.count(sub[, start[, end]]) -> int\n\
				3415	\n\
				3416	Return the number of occurrences of substring sub in Unicode string\n\
				3417	S[start:end]. Optional arguments start and end are\n\
				3418	interpreted as in slice notation.";
				3419
				3420	static PyObject *
				3421	unicode_count(PyUnicodeObject self, PyObject args)
				3422	{
				3423	PyUnicodeObject *substring;
				3424	int start = 0;
				3425	int end = INT_MAX;
				3426	PyObject *result;
				3427
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3428	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3429	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3430	return NULL;
				3431
				3432	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3433	(PyObject *)substring);
				3434	if (substring == NULL)
				3435	return NULL;
				3436
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3437	if (start < 0)
				3438	start += self->length;
				3439	if (start < 0)
				3440	start = 0;
				3441	if (end > self->length)
				3442	end = self->length;
				3443	if (end < 0)
				3444	end += self->length;
				3445	if (end < 0)
				3446	end = 0;
				3447
				3448	result = PyInt_FromLong((long) count(self, start, end, substring));
				3449
				3450	Py_DECREF(substring);
				3451	return result;
				3452	}
				3453
				3454	static char encode__doc__[] =
				3455	"S.encode([encoding[,errors]]) -> string\n\
				3456	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3457	Return an encoded string version of S. Default encoding is the current\n\
				3458	default string encoding. errors may be given to set a different error\n\
				3459	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3460	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3461
				3462	static PyObject *
				3463	unicode_encode(PyUnicodeObject self, PyObject args)
				3464	{
				3465	char *encoding = NULL;
				3466	char *errors = NULL;
				3467	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3468	return NULL;
				3469	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3470	}
				3471
				3472	static char expandtabs__doc__[] =
				3473	"S.expandtabs([tabsize]) -> unicode\n\
				3474	\n\
				3475	Return a copy of S where all tab characters are expanded using spaces.\n\
				3476	If tabsize is not given, a tab size of 8 characters is assumed.";
				3477
				3478	static PyObject*
				3479	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3480	{
				3481	Py_UNICODE *e;
				3482	Py_UNICODE *p;
				3483	Py_UNICODE *q;
				3484	int i, j;
				3485	PyUnicodeObject *u;
				3486	int tabsize = 8;
				3487
				3488	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3489	return NULL;
				3490
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3491	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3492	i = j = 0;
				3493	e = self->str + self->length;
				3494	for (p = self->str; p < e; p++)
				3495	if (*p == '\t') {
				3496	if (tabsize > 0)
				3497	j += tabsize - (j % tabsize);
				3498	}
				3499	else {
				3500	j++;
				3501	if (p == '\n' \|\| p == '\r') {
				3502	i += j;
				3503	j = 0;
				3504	}
				3505	}
				3506
				3507	/* Second pass: create output string and fill it */
				3508	u = _PyUnicode_New(i + j);
				3509	if (!u)
				3510	return NULL;
				3511
				3512	j = 0;
				3513	q = u->str;
				3514
				3515	for (p = self->str; p < e; p++)
				3516	if (*p == '\t') {
				3517	if (tabsize > 0) {
				3518	i = tabsize - (j % tabsize);
				3519	j += i;
				3520	while (i--)
				3521	*q++ = ' ';
				3522	}
				3523	}
				3524	else {
				3525	j++;
				3526	q++ = p;
				3527	if (p == '\n' \|\| p == '\r')
				3528	j = 0;
				3529	}
				3530
				3531	return (PyObject*) u;
				3532	}
				3533
				3534	static char find__doc__[] =
				3535	"S.find(sub [,start [,end]]) -> int\n\
				3536	\n\
				3537	Return the lowest index in S where substring sub is found,\n\
				3538	such that sub is contained within s[start,end]. Optional\n\
				3539	arguments start and end are interpreted as in slice notation.\n\
				3540	\n\
				3541	Return -1 on failure.";
				3542
				3543	static PyObject *
				3544	unicode_find(PyUnicodeObject self, PyObject args)
				3545	{
				3546	PyUnicodeObject *substring;
				3547	int start = 0;
				3548	int end = INT_MAX;
				3549	PyObject *result;
				3550
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3551	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3552	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3553	return NULL;
				3554	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3555	(PyObject *)substring);
				3556	if (substring == NULL)
				3557	return NULL;
				3558
				3559	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3560
				3561	Py_DECREF(substring);
				3562	return result;
				3563	}
				3564
				3565	static PyObject *
				3566	unicode_getitem(PyUnicodeObject *self, int index)
				3567	{
				3568	if (index < 0 \|\| index >= self->length) {
				3569	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3570	return NULL;
				3571	}
				3572
				3573	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3574	}
				3575
				3576	static long
				3577	unicode_hash(PyUnicodeObject *self)
				3578	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3579	/* Since Unicode objects compare equal to their ASCII string
				3580	counterparts, they should use the individual character values
				3581	as basis for their hash value. This is needed to assure that
				3582	strings and Unicode objects behave in the same way as
				3583	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3584
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3585	register int len;
				3586	register Py_UNICODE *p;
				3587	register long x;
				3588
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3589	if (self->hash != -1)
				3590	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3591	len = PyUnicode_GET_SIZE(self);
				3592	p = PyUnicode_AS_UNICODE(self);
				3593	x = *p << 7;
				3594	while (--len >= 0)
				3595	x = (1000003x) ^ p++;
				3596	x ^= PyUnicode_GET_SIZE(self);
				3597	if (x == -1)
				3598	x = -2;
				3599	self->hash = x;
				3600	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3601	}
				3602
				3603	static char index__doc__[] =
				3604	"S.index(sub [,start [,end]]) -> int\n\
				3605	\n\
				3606	Like S.find() but raise ValueError when the substring is not found.";
				3607
				3608	static PyObject *
				3609	unicode_index(PyUnicodeObject self, PyObject args)
				3610	{
				3611	int result;
				3612	PyUnicodeObject *substring;
				3613	int start = 0;
				3614	int end = INT_MAX;
				3615
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3616	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3617	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3618	return NULL;
				3619
				3620	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3621	(PyObject *)substring);
				3622	if (substring == NULL)
				3623	return NULL;
				3624
				3625	result = findstring(self, substring, start, end, 1);
				3626
				3627	Py_DECREF(substring);
				3628	if (result < 0) {
				3629	PyErr_SetString(PyExc_ValueError, "substring not found");
				3630	return NULL;
				3631	}
				3632	return PyInt_FromLong(result);
				3633	}
				3634
				3635	static char islower__doc__[] =
				3636	"S.islower() -> int\n\
				3637	\n\
				3638	Return 1 if all cased characters in S are lowercase and there is\n\
				3639	at least one cased character in S, 0 otherwise.";
				3640
				3641	static PyObject*
				3642	unicode_islower(PyUnicodeObject self, PyObject args)
				3643	{
				3644	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3645	register const Py_UNICODE *e;
				3646	int cased;
				3647
				3648	if (!PyArg_NoArgs(args))
				3649	return NULL;
				3650
				3651	/* Shortcut for single character strings */
				3652	if (PyUnicode_GET_SIZE(self) == 1)
				3653	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3654
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3655	/* Special case for empty strings */
				3656	if (PyString_GET_SIZE(self) == 0)
				3657	return PyInt_FromLong(0);
				3658
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3659	e = p + PyUnicode_GET_SIZE(self);
				3660	cased = 0;
				3661	for (; p < e; p++) {
				3662	register const Py_UNICODE ch = *p;
				3663
				3664	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3665	return PyInt_FromLong(0);
				3666	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3667	cased = 1;
				3668	}
				3669	return PyInt_FromLong(cased);
				3670	}
				3671
				3672	static char isupper__doc__[] =
				3673	"S.isupper() -> int\n\
				3674	\n\
				3675	Return 1 if all cased characters in S are uppercase and there is\n\
				3676	at least one cased character in S, 0 otherwise.";
				3677
				3678	static PyObject*
				3679	unicode_isupper(PyUnicodeObject self, PyObject args)
				3680	{
				3681	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3682	register const Py_UNICODE *e;
				3683	int cased;
				3684
				3685	if (!PyArg_NoArgs(args))
				3686	return NULL;
				3687
				3688	/* Shortcut for single character strings */
				3689	if (PyUnicode_GET_SIZE(self) == 1)
				3690	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3691
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3692	/* Special case for empty strings */
				3693	if (PyString_GET_SIZE(self) == 0)
				3694	return PyInt_FromLong(0);
				3695
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3696	e = p + PyUnicode_GET_SIZE(self);
				3697	cased = 0;
				3698	for (; p < e; p++) {
				3699	register const Py_UNICODE ch = *p;
				3700
				3701	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3702	return PyInt_FromLong(0);
				3703	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3704	cased = 1;
				3705	}
				3706	return PyInt_FromLong(cased);
				3707	}
				3708
				3709	static char istitle__doc__[] =
				3710	"S.istitle() -> int\n\
				3711	\n\
				3712	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3713	may only follow uncased characters and lowercase characters only cased\n\
				3714	ones. Return 0 otherwise.";
				3715
				3716	static PyObject*
				3717	unicode_istitle(PyUnicodeObject self, PyObject args)
				3718	{
				3719	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3720	register const Py_UNICODE *e;
				3721	int cased, previous_is_cased;
				3722
				3723	if (!PyArg_NoArgs(args))
				3724	return NULL;
				3725
				3726	/* Shortcut for single character strings */
				3727	if (PyUnicode_GET_SIZE(self) == 1)
				3728	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3729	(Py_UNICODE_ISUPPER(*p) != 0));
				3730
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3731	/* Special case for empty strings */
				3732	if (PyString_GET_SIZE(self) == 0)
				3733	return PyInt_FromLong(0);
				3734
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3735	e = p + PyUnicode_GET_SIZE(self);
				3736	cased = 0;
				3737	previous_is_cased = 0;
				3738	for (; p < e; p++) {
				3739	register const Py_UNICODE ch = *p;
				3740
				3741	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3742	if (previous_is_cased)
				3743	return PyInt_FromLong(0);
				3744	previous_is_cased = 1;
				3745	cased = 1;
				3746	}
				3747	else if (Py_UNICODE_ISLOWER(ch)) {
				3748	if (!previous_is_cased)
				3749	return PyInt_FromLong(0);
				3750	previous_is_cased = 1;
				3751	cased = 1;
				3752	}
				3753	else
				3754	previous_is_cased = 0;
				3755	}
				3756	return PyInt_FromLong(cased);
				3757	}
				3758
				3759	static char isspace__doc__[] =
				3760	"S.isspace() -> int\n\
				3761	\n\
				3762	Return 1 if there are only whitespace characters in S,\n\
				3763	0 otherwise.";
				3764
				3765	static PyObject*
				3766	unicode_isspace(PyUnicodeObject self, PyObject args)
				3767	{
				3768	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3769	register const Py_UNICODE *e;
				3770
				3771	if (!PyArg_NoArgs(args))
				3772	return NULL;
				3773
				3774	/* Shortcut for single character strings */
				3775	if (PyUnicode_GET_SIZE(self) == 1 &&
				3776	Py_UNICODE_ISSPACE(*p))
				3777	return PyInt_FromLong(1);
				3778
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3779	/* Special case for empty strings */
				3780	if (PyString_GET_SIZE(self) == 0)
				3781	return PyInt_FromLong(0);
				3782
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3783	e = p + PyUnicode_GET_SIZE(self);
				3784	for (; p < e; p++) {
				3785	if (!Py_UNICODE_ISSPACE(*p))
				3786	return PyInt_FromLong(0);
				3787	}
				3788	return PyInt_FromLong(1);
				3789	}
				3790
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3791	static char isalpha__doc__[] =
				3792	"S.isalpha() -> int\n\
				3793	\n\
				3794	Return 1 if all characters in S are alphabetic\n\
				3795	and there is at least one character in S, 0 otherwise.";
				3796
				3797	static PyObject*
				3798	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3799	{
				3800	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3801	register const Py_UNICODE *e;
				3802
				3803	if (!PyArg_NoArgs(args))
				3804	return NULL;
				3805
				3806	/* Shortcut for single character strings */
				3807	if (PyUnicode_GET_SIZE(self) == 1 &&
				3808	Py_UNICODE_ISALPHA(*p))
				3809	return PyInt_FromLong(1);
				3810
				3811	/* Special case for empty strings */
				3812	if (PyString_GET_SIZE(self) == 0)
				3813	return PyInt_FromLong(0);
				3814
				3815	e = p + PyUnicode_GET_SIZE(self);
				3816	for (; p < e; p++) {
				3817	if (!Py_UNICODE_ISALPHA(*p))
				3818	return PyInt_FromLong(0);
				3819	}
				3820	return PyInt_FromLong(1);
				3821	}
				3822
				3823	static char isalnum__doc__[] =
				3824	"S.isalnum() -> int\n\
				3825	\n\
				3826	Return 1 if all characters in S are alphanumeric\n\
				3827	and there is at least one character in S, 0 otherwise.";
				3828
				3829	static PyObject*
				3830	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3831	{
				3832	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3833	register const Py_UNICODE *e;
				3834
				3835	if (!PyArg_NoArgs(args))
				3836	return NULL;
				3837
				3838	/* Shortcut for single character strings */
				3839	if (PyUnicode_GET_SIZE(self) == 1 &&
				3840	Py_UNICODE_ISALNUM(*p))
				3841	return PyInt_FromLong(1);
				3842
				3843	/* Special case for empty strings */
				3844	if (PyString_GET_SIZE(self) == 0)
				3845	return PyInt_FromLong(0);
				3846
				3847	e = p + PyUnicode_GET_SIZE(self);
				3848	for (; p < e; p++) {
				3849	if (!Py_UNICODE_ISALNUM(*p))
				3850	return PyInt_FromLong(0);
				3851	}
				3852	return PyInt_FromLong(1);
				3853	}
				3854
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3855	static char isdecimal__doc__[] =
				3856	"S.isdecimal() -> int\n\
				3857	\n\
				3858	Return 1 if there are only decimal characters in S,\n\
				3859	0 otherwise.";
				3860
				3861	static PyObject*
				3862	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3863	{
				3864	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3865	register const Py_UNICODE *e;
				3866
				3867	if (!PyArg_NoArgs(args))
				3868	return NULL;
				3869
				3870	/* Shortcut for single character strings */
				3871	if (PyUnicode_GET_SIZE(self) == 1 &&
				3872	Py_UNICODE_ISDECIMAL(*p))
				3873	return PyInt_FromLong(1);
				3874
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3875	/* Special case for empty strings */
				3876	if (PyString_GET_SIZE(self) == 0)
				3877	return PyInt_FromLong(0);
				3878
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3879	e = p + PyUnicode_GET_SIZE(self);
				3880	for (; p < e; p++) {
				3881	if (!Py_UNICODE_ISDECIMAL(*p))
				3882	return PyInt_FromLong(0);
				3883	}
				3884	return PyInt_FromLong(1);
				3885	}
				3886
				3887	static char isdigit__doc__[] =
				3888	"S.isdigit() -> int\n\
				3889	\n\
				3890	Return 1 if there are only digit characters in S,\n\
				3891	0 otherwise.";
				3892
				3893	static PyObject*
				3894	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3895	{
				3896	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3897	register const Py_UNICODE *e;
				3898
				3899	if (!PyArg_NoArgs(args))
				3900	return NULL;
				3901
				3902	/* Shortcut for single character strings */
				3903	if (PyUnicode_GET_SIZE(self) == 1 &&
				3904	Py_UNICODE_ISDIGIT(*p))
				3905	return PyInt_FromLong(1);
				3906
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3907	/* Special case for empty strings */
				3908	if (PyString_GET_SIZE(self) == 0)
				3909	return PyInt_FromLong(0);
				3910
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3911	e = p + PyUnicode_GET_SIZE(self);
				3912	for (; p < e; p++) {
				3913	if (!Py_UNICODE_ISDIGIT(*p))
				3914	return PyInt_FromLong(0);
				3915	}
				3916	return PyInt_FromLong(1);
				3917	}
				3918
				3919	static char isnumeric__doc__[] =
				3920	"S.isnumeric() -> int\n\
				3921	\n\
				3922	Return 1 if there are only numeric characters in S,\n\
				3923	0 otherwise.";
				3924
				3925	static PyObject*
				3926	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3927	{
				3928	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3929	register const Py_UNICODE *e;
				3930
				3931	if (!PyArg_NoArgs(args))
				3932	return NULL;
				3933
				3934	/* Shortcut for single character strings */
				3935	if (PyUnicode_GET_SIZE(self) == 1 &&
				3936	Py_UNICODE_ISNUMERIC(*p))
				3937	return PyInt_FromLong(1);
				3938
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3939	/* Special case for empty strings */
				3940	if (PyString_GET_SIZE(self) == 0)
				3941	return PyInt_FromLong(0);
				3942
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3943	e = p + PyUnicode_GET_SIZE(self);
				3944	for (; p < e; p++) {
				3945	if (!Py_UNICODE_ISNUMERIC(*p))
				3946	return PyInt_FromLong(0);
				3947	}
				3948	return PyInt_FromLong(1);
				3949	}
				3950
				3951	static char join__doc__[] =
				3952	"S.join(sequence) -> unicode\n\
				3953	\n\
				3954	Return a string which is the concatenation of the strings in the\n\
				3955	sequence. The separator between elements is S.";
				3956
				3957	static PyObject*
				3958	unicode_join(PyUnicodeObject self, PyObject args)
				3959	{
				3960	PyObject *data;
				3961	if (!PyArg_ParseTuple(args, "O:join", &data))
				3962	return NULL;
				3963
				3964	return PyUnicode_Join((PyObject *)self, data);
				3965	}
				3966
				3967	static int
				3968	unicode_length(PyUnicodeObject *self)
				3969	{
				3970	return self->length;
				3971	}
				3972
				3973	static char ljust__doc__[] =
				3974	"S.ljust(width) -> unicode\n\
				3975	\n\
				3976	Return S left justified in a Unicode string of length width. Padding is\n\
				3977	done using spaces.";
				3978
				3979	static PyObject *
				3980	unicode_ljust(PyUnicodeObject self, PyObject args)
				3981	{
				3982	int width;
				3983	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3984	return NULL;
				3985
				3986	if (self->length >= width) {
				3987	Py_INCREF(self);
				3988	return (PyObject*) self;
				3989	}
				3990
				3991	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3992	}
				3993
				3994	static char lower__doc__[] =
				3995	"S.lower() -> unicode\n\
				3996	\n\
				3997	Return a copy of the string S converted to lowercase.";
				3998
				3999	static PyObject*
				4000	unicode_lower(PyUnicodeObject self, PyObject args)
				4001	{
				4002	if (!PyArg_NoArgs(args))
				4003	return NULL;
				4004	return fixup(self, fixlower);
				4005	}
				4006
				4007	static char lstrip__doc__[] =
				4008	"S.lstrip() -> unicode\n\
				4009	\n\
				4010	Return a copy of the string S with leading whitespace removed.";
				4011
				4012	static PyObject *
				4013	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4014	{
				4015	if (!PyArg_NoArgs(args))
				4016	return NULL;
				4017	return strip(self, 1, 0);
				4018	}
				4019
				4020	static PyObject*
				4021	unicode_repeat(PyUnicodeObject *str, int len)
				4022	{
				4023	PyUnicodeObject *u;
				4024	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4025	int nchars;
				4026	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4027
				4028	if (len < 0)
				4029	len = 0;
				4030
				4031	if (len == 1) {
				4032	/* no repeat, return original string */
				4033	Py_INCREF(str);
				4034	return (PyObject*) str;
				4035	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4036
				4037	/* ensure # of chars needed doesn't overflow int and # of bytes
				4038	* needed doesn't overflow size_t
				4039	*/
				4040	nchars = len * str->length;
				4041	if (len && nchars / len != str->length) {
				4042	PyErr_SetString(PyExc_OverflowError,
				4043	"repeated string is too long");
				4044	return NULL;
				4045	}
				4046	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4047	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4048	PyErr_SetString(PyExc_OverflowError,
				4049	"repeated string is too long");
				4050	return NULL;
				4051	}
				4052	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4053	if (!u)
				4054	return NULL;
				4055
				4056	p = u->str;
				4057
				4058	while (len-- > 0) {
				4059	Py_UNICODE_COPY(p, str->str, str->length);
				4060	p += str->length;
				4061	}
				4062
				4063	return (PyObject*) u;
				4064	}
				4065
				4066	PyObject PyUnicode_Replace(PyObject obj,
				4067	PyObject *subobj,
				4068	PyObject *replobj,
				4069	int maxcount)
				4070	{
				4071	PyObject *self;
				4072	PyObject *str1;
				4073	PyObject *str2;
				4074	PyObject *result;
				4075
				4076	self = PyUnicode_FromObject(obj);
				4077	if (self == NULL)
				4078	return NULL;
				4079	str1 = PyUnicode_FromObject(subobj);
				4080	if (str1 == NULL) {
				4081	Py_DECREF(self);
				4082	return NULL;
				4083	}
				4084	str2 = PyUnicode_FromObject(replobj);
				4085	if (str2 == NULL) {
				4086	Py_DECREF(self);
				4087	Py_DECREF(str1);
				4088	return NULL;
				4089	}
				4090	result = replace((PyUnicodeObject *)self,
				4091	(PyUnicodeObject *)str1,
				4092	(PyUnicodeObject *)str2,
				4093	maxcount);
				4094	Py_DECREF(self);
				4095	Py_DECREF(str1);
				4096	Py_DECREF(str2);
				4097	return result;
				4098	}
				4099
				4100	static char replace__doc__[] =
				4101	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4102	\n\
				4103	Return a copy of S with all occurrences of substring\n\
				4104	old replaced by new. If the optional argument maxsplit is\n\
				4105	given, only the first maxsplit occurrences are replaced.";
				4106
				4107	static PyObject*
				4108	unicode_replace(PyUnicodeObject self, PyObject args)
				4109	{
				4110	PyUnicodeObject *str1;
				4111	PyUnicodeObject *str2;
				4112	int maxcount = -1;
				4113	PyObject *result;
				4114
				4115	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4116	return NULL;
				4117	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4118	if (str1 == NULL)
				4119	return NULL;
				4120	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4121	if (str2 == NULL)
				4122	return NULL;
				4123
				4124	result = replace(self, str1, str2, maxcount);
				4125
				4126	Py_DECREF(str1);
				4127	Py_DECREF(str2);
				4128	return result;
				4129	}
				4130
				4131	static
				4132	PyObject unicode_repr(PyObject unicode)
				4133	{
				4134	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4135	PyUnicode_GET_SIZE(unicode),
				4136	1);
				4137	}
				4138
				4139	static char rfind__doc__[] =
				4140	"S.rfind(sub [,start [,end]]) -> int\n\
				4141	\n\
				4142	Return the highest index in S where substring sub is found,\n\
				4143	such that sub is contained within s[start,end]. Optional\n\
				4144	arguments start and end are interpreted as in slice notation.\n\
				4145	\n\
				4146	Return -1 on failure.";
				4147
				4148	static PyObject *
				4149	unicode_rfind(PyUnicodeObject self, PyObject args)
				4150	{
				4151	PyUnicodeObject *substring;
				4152	int start = 0;
				4153	int end = INT_MAX;
				4154	PyObject *result;
				4155
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4156	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4157	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4158	return NULL;
				4159	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4160	(PyObject *)substring);
				4161	if (substring == NULL)
				4162	return NULL;
				4163
				4164	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4165
				4166	Py_DECREF(substring);
				4167	return result;
				4168	}
				4169
				4170	static char rindex__doc__[] =
				4171	"S.rindex(sub [,start [,end]]) -> int\n\
				4172	\n\
				4173	Like S.rfind() but raise ValueError when the substring is not found.";
				4174
				4175	static PyObject *
				4176	unicode_rindex(PyUnicodeObject self, PyObject args)
				4177	{
				4178	int result;
				4179	PyUnicodeObject *substring;
				4180	int start = 0;
				4181	int end = INT_MAX;
				4182
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4183	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4184	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4185	return NULL;
				4186	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4187	(PyObject *)substring);
				4188	if (substring == NULL)
				4189	return NULL;
				4190
				4191	result = findstring(self, substring, start, end, -1);
				4192
				4193	Py_DECREF(substring);
				4194	if (result < 0) {
				4195	PyErr_SetString(PyExc_ValueError, "substring not found");
				4196	return NULL;
				4197	}
				4198	return PyInt_FromLong(result);
				4199	}
				4200
				4201	static char rjust__doc__[] =
				4202	"S.rjust(width) -> unicode\n\
				4203	\n\
				4204	Return S right justified in a Unicode string of length width. Padding is\n\
				4205	done using spaces.";
				4206
				4207	static PyObject *
				4208	unicode_rjust(PyUnicodeObject self, PyObject args)
				4209	{
				4210	int width;
				4211	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4212	return NULL;
				4213
				4214	if (self->length >= width) {
				4215	Py_INCREF(self);
				4216	return (PyObject*) self;
				4217	}
				4218
				4219	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4220	}
				4221
				4222	static char rstrip__doc__[] =
				4223	"S.rstrip() -> unicode\n\
				4224	\n\
				4225	Return a copy of the string S with trailing whitespace removed.";
				4226
				4227	static PyObject *
				4228	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4229	{
				4230	if (!PyArg_NoArgs(args))
				4231	return NULL;
				4232	return strip(self, 0, 1);
				4233	}
				4234
				4235	static PyObject*
				4236	unicode_slice(PyUnicodeObject *self, int start, int end)
				4237	{
				4238	/* standard clamping */
				4239	if (start < 0)
				4240	start = 0;
				4241	if (end < 0)
				4242	end = 0;
				4243	if (end > self->length)
				4244	end = self->length;
				4245	if (start == 0 && end == self->length) {
				4246	/* full slice, return original string */
				4247	Py_INCREF(self);
				4248	return (PyObject*) self;
				4249	}
				4250	if (start > end)
				4251	start = end;
				4252	/* copy slice */
				4253	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4254	end - start);
				4255	}
				4256
				4257	PyObject PyUnicode_Split(PyObject s,
				4258	PyObject *sep,
				4259	int maxsplit)
				4260	{
				4261	PyObject *result;
				4262
				4263	s = PyUnicode_FromObject(s);
				4264	if (s == NULL)
				4265	return NULL;
				4266	if (sep != NULL) {
				4267	sep = PyUnicode_FromObject(sep);
				4268	if (sep == NULL) {
				4269	Py_DECREF(s);
				4270	return NULL;
				4271	}
				4272	}
				4273
				4274	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4275
				4276	Py_DECREF(s);
				4277	Py_XDECREF(sep);
				4278	return result;
				4279	}
				4280
				4281	static char split__doc__[] =
				4282	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4283	\n\
				4284	Return a list of the words in S, using sep as the\n\
				4285	delimiter string. If maxsplit is given, at most maxsplit\n\
				4286	splits are done. If sep is not specified, any whitespace string\n\
				4287	is a separator.";
				4288
				4289	static PyObject*
				4290	unicode_split(PyUnicodeObject self, PyObject args)
				4291	{
				4292	PyObject *substring = Py_None;
				4293	int maxcount = -1;
				4294
				4295	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4296	return NULL;
				4297
				4298	if (substring == Py_None)
				4299	return split(self, NULL, maxcount);
				4300	else if (PyUnicode_Check(substring))
				4301	return split(self, (PyUnicodeObject *)substring, maxcount);
				4302	else
				4303	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4304	}
				4305
				4306	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4307	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4308	\n\
				4309	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4310	Line breaks are not included in the resulting list unless keepends\n\
				4311	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4312
				4313	static PyObject*
				4314	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4315	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4316	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4317
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4318	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4319	return NULL;
				4320
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4321	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4322	}
				4323
				4324	static
				4325	PyObject unicode_str(PyUnicodeObject self)
				4326	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4327	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4328	}
				4329
				4330	static char strip__doc__[] =
				4331	"S.strip() -> unicode\n\
				4332	\n\
				4333	Return a copy of S with leading and trailing whitespace removed.";
				4334
				4335	static PyObject *
				4336	unicode_strip(PyUnicodeObject self, PyObject args)
				4337	{
				4338	if (!PyArg_NoArgs(args))
				4339	return NULL;
				4340	return strip(self, 1, 1);
				4341	}
				4342
				4343	static char swapcase__doc__[] =
				4344	"S.swapcase() -> unicode\n\
				4345	\n\
				4346	Return a copy of S with uppercase characters converted to lowercase\n\
				4347	and vice versa.";
				4348
				4349	static PyObject*
				4350	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4351	{
				4352	if (!PyArg_NoArgs(args))
				4353	return NULL;
				4354	return fixup(self, fixswapcase);
				4355	}
				4356
				4357	static char translate__doc__[] =
				4358	"S.translate(table) -> unicode\n\
				4359	\n\
				4360	Return a copy of the string S, where all characters have been mapped\n\
				4361	through the given translation table, which must be a mapping of\n\
				4362	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4363	are left untouched. Characters mapped to None are deleted.";
				4364
				4365	static PyObject*
				4366	unicode_translate(PyUnicodeObject self, PyObject args)
				4367	{
				4368	PyObject *table;
				4369
				4370	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4371	return NULL;
				4372	return PyUnicode_TranslateCharmap(self->str,
				4373	self->length,
				4374	table,
				4375	"ignore");
				4376	}
				4377
				4378	static char upper__doc__[] =
				4379	"S.upper() -> unicode\n\
				4380	\n\
				4381	Return a copy of S converted to uppercase.";
				4382
				4383	static PyObject*
				4384	unicode_upper(PyUnicodeObject self, PyObject args)
				4385	{
				4386	if (!PyArg_NoArgs(args))
				4387	return NULL;
				4388	return fixup(self, fixupper);
				4389	}
				4390
				4391	#if 0
				4392	static char zfill__doc__[] =
				4393	"S.zfill(width) -> unicode\n\
				4394	\n\
				4395	Pad a numeric string x with zeros on the left, to fill a field\n\
				4396	of the specified width. The string x is never truncated.";
				4397
				4398	static PyObject *
				4399	unicode_zfill(PyUnicodeObject self, PyObject args)
				4400	{
				4401	int fill;
				4402	PyUnicodeObject *u;
				4403
				4404	int width;
				4405	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4406	return NULL;
				4407
				4408	if (self->length >= width) {
				4409	Py_INCREF(self);
				4410	return (PyObject*) self;
				4411	}
				4412
				4413	fill = width - self->length;
				4414
				4415	u = pad(self, fill, 0, '0');
				4416
				4417	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4418	/* move sign to beginning of string */
				4419	u->str[0] = u->str[fill];
				4420	u->str[fill] = '0';
				4421	}
				4422
				4423	return (PyObject*) u;
				4424	}
				4425	#endif
				4426
				4427	#if 0
				4428	static PyObject*
				4429	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4430	{
				4431	if (!PyArg_NoArgs(args))
				4432	return NULL;
				4433	return PyInt_FromLong(unicode_freelist_size);
				4434	}
				4435	#endif
				4436
				4437	static char startswith__doc__[] =
				4438	"S.startswith(prefix[, start[, end]]) -> int\n\
				4439	\n\
				4440	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4441	optional start, test S beginning at that position. With optional end, stop\n\
				4442	comparing S at that position.";
				4443
				4444	static PyObject *
				4445	unicode_startswith(PyUnicodeObject *self,
				4446	PyObject *args)
				4447	{
				4448	PyUnicodeObject *substring;
				4449	int start = 0;
				4450	int end = INT_MAX;
				4451	PyObject *result;
				4452
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4453	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4454	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4455	return NULL;
				4456	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4457	(PyObject *)substring);
				4458	if (substring == NULL)
				4459	return NULL;
				4460
				4461	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4462
				4463	Py_DECREF(substring);
				4464	return result;
				4465	}
				4466
				4467
				4468	static char endswith__doc__[] =
				4469	"S.endswith(suffix[, start[, end]]) -> int\n\
				4470	\n\
				4471	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4472	optional start, test S beginning at that position. With optional end, stop\n\
				4473	comparing S at that position.";
				4474
				4475	static PyObject *
				4476	unicode_endswith(PyUnicodeObject *self,
				4477	PyObject *args)
				4478	{
				4479	PyUnicodeObject *substring;
				4480	int start = 0;
				4481	int end = INT_MAX;
				4482	PyObject *result;
				4483
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4484	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4485	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4486	return NULL;
				4487	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4488	(PyObject *)substring);
				4489	if (substring == NULL)
				4490	return NULL;
				4491
				4492	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4493
				4494	Py_DECREF(substring);
				4495	return result;
				4496	}
				4497
				4498
				4499	static PyMethodDef unicode_methods[] = {
				4500
				4501	/* Order is according to common usage: often used methods should
				4502	appear first, since lookup is done sequentially. */
				4503
				4504	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4505	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4506	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4507	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4508	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4509	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4510	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4511	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4512	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4513	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4514	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4515	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4516	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4517	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4518	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4519	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4520	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4521	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4522	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4523	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4524	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4525	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4526	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4527	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4528	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4529	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4530	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4531	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4532	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4533	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4534	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4535	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4536	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4537	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4538	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4539	#if 0
				4540	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4541	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4542	#endif
				4543
				4544	#if 0
				4545	/* This one is just used for debugging the implementation. */
				4546	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4547	#endif
				4548
				4549	{NULL, NULL}
				4550	};
				4551
				4552	static PyObject *
				4553	unicode_getattr(PyUnicodeObject self, char name)
				4554	{
				4555	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4556	}
				4557
				4558	static PySequenceMethods unicode_as_sequence = {
				4559	(inquiry) unicode_length, /* sq_length */
				4560	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4561	(intargfunc) unicode_repeat, /* sq_repeat */
				4562	(intargfunc) unicode_getitem, /* sq_item */
				4563	(intintargfunc) unicode_slice, /* sq_slice */
				4564	0, /* sq_ass_item */
				4565	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4566	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4567	};
				4568
				4569	static int
				4570	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4571	int index,
				4572	const void **ptr)
				4573	{
				4574	if (index != 0) {
				4575	PyErr_SetString(PyExc_SystemError,
				4576	"accessing non-existent unicode segment");
				4577	return -1;
				4578	}
				4579	ptr = (void ) self->str;
				4580	return PyUnicode_GET_DATA_SIZE(self);
				4581	}
				4582
				4583	static int
				4584	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4585	const void **ptr)
				4586	{
				4587	PyErr_SetString(PyExc_TypeError,
				4588	"cannot use unicode as modifyable buffer");
				4589	return -1;
				4590	}
				4591
				4592	static int
				4593	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4594	int *lenp)
				4595	{
				4596	if (lenp)
				4597	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4598	return 1;
				4599	}
				4600
				4601	static int
				4602	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4603	int index,
				4604	const void **ptr)
				4605	{
				4606	PyObject *str;
				4607
				4608	if (index != 0) {
				4609	PyErr_SetString(PyExc_SystemError,
				4610	"accessing non-existent unicode segment");
				4611	return -1;
				4612	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4613	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4614	if (str == NULL)
				4615	return -1;
				4616	ptr = (void ) PyString_AS_STRING(str);
				4617	return PyString_GET_SIZE(str);
				4618	}
				4619
				4620	/* Helpers for PyUnicode_Format() */
				4621
				4622	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4623	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4624	{
				4625	int argidx = *p_argidx;
				4626	if (argidx < arglen) {
				4627	(*p_argidx)++;
				4628	if (arglen < 0)
				4629	return args;
				4630	else
				4631	return PyTuple_GetItem(args, argidx);
				4632	}
				4633	PyErr_SetString(PyExc_TypeError,
				4634	"not enough arguments for format string");
				4635	return NULL;
				4636	}
				4637
				4638	#define F_LJUST (1<<0)
				4639	#define F_SIGN (1<<1)
				4640	#define F_BLANK (1<<2)
				4641	#define F_ALT (1<<3)
				4642	#define F_ZERO (1<<4)
				4643
				4644	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4645	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4646	{
				4647	register int i;
				4648	int len;
				4649	va_list va;
				4650	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4651	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4652
				4653	/* First, format the string as char array, then expand to Py_UNICODE
				4654	array. */
				4655	charbuffer = (char *)buffer;
				4656	len = vsprintf(charbuffer, format, va);
				4657	for (i = len - 1; i >= 0; i--)
				4658	buffer[i] = (Py_UNICODE) charbuffer[i];
				4659
				4660	va_end(va);
				4661	return len;
				4662	}
				4663
				4664	static int
				4665	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4666	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4667	int flags,
				4668	int prec,
				4669	int type,
				4670	PyObject *v)
				4671	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4672	/* fmt = '%#.' + `prec` + `type`
				4673	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4674	char fmt[20];
				4675	double x;
				4676
				4677	x = PyFloat_AsDouble(v);
				4678	if (x == -1.0 && PyErr_Occurred())
				4679	return -1;
				4680	if (prec < 0)
				4681	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4682	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4683	type = 'g';
				4684	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4685	/* worst case length calc to ensure no buffer overrun:
				4686	fmt = %#.<prec>g
				4687	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4688	for any double rep.)
				4689	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4690	If prec=0 the effective precision is 1 (the leading digit is
				4691	always given), therefore increase by one to 10+prec. */
				4692	if (buflen <= (size_t)10 + (size_t)prec) {
				4693	PyErr_SetString(PyExc_OverflowError,
				4694	"formatted float is too long (precision too long?)");
				4695	return -1;
				4696	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4697	return usprintf(buf, fmt, x);
				4698	}
				4699
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4700	static PyObject*
				4701	formatlong(PyObject *val, int flags, int prec, int type)
				4702	{
				4703	char *buf;
				4704	int i, len;
				4705	PyObject str; / temporary string object. */
				4706	PyUnicodeObject *result;
				4707
				4708	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4709	if (!str)
				4710	return NULL;
				4711	result = _PyUnicode_New(len);
				4712	for (i = 0; i < len; i++)
				4713	result->str[i] = buf[i];
				4714	result->str[len] = 0;
				4715	Py_DECREF(str);
				4716	return (PyObject*)result;
				4717	}
				4718
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4719	static int
				4720	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4721	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4722	int flags,
				4723	int prec,
				4724	int type,
				4725	PyObject *v)
				4726	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4727	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4728	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4729	+ 1 + 1 = 24*/
				4730	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4731	long x;
				4732
				4733	x = PyInt_AsLong(v);
				4734	if (x == -1 && PyErr_Occurred())
				4735	return -1;
				4736	if (prec < 0)
				4737	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4738	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4739	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4740	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4741	PyErr_SetString(PyExc_OverflowError,
				4742	"formatted integer is too long (precision too long?)");
				4743	return -1;
				4744	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4745	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4746	return usprintf(buf, fmt, x);
				4747	}
				4748
				4749	static int
				4750	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4751	size_t buflen,
				4752	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4753	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4754	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4755	if (PyUnicode_Check(v)) {
				4756	if (PyUnicode_GET_SIZE(v) != 1)
				4757	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4758	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4759	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4760
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4761	else if (PyString_Check(v)) {
				4762	if (PyString_GET_SIZE(v) != 1)
				4763	goto onError;
				4764	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4765	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4766
				4767	else {
				4768	/* Integer input truncated to a character */
				4769	long x;
				4770	x = PyInt_AsLong(v);
				4771	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4772	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4773	buf[0] = (char) x;
				4774	}
				4775	buf[1] = '\0';
				4776	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4777
				4778	onError:
				4779	PyErr_SetString(PyExc_TypeError,
				4780	"%c requires int or char");
				4781	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4782	}
				4783
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
				4793
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4794	PyObject PyUnicode_Format(PyObject format,
				4795	PyObject *args)
				4796	{
				4797	Py_UNICODE fmt, res;
				4798	int fmtcnt, rescnt, reslen, arglen, argidx;
				4799	int args_owned = 0;
				4800	PyUnicodeObject *result = NULL;
				4801	PyObject *dict = NULL;
				4802	PyObject *uformat;
				4803
				4804	if (format == NULL \|\| args == NULL) {
				4805	PyErr_BadInternalCall();
				4806	return NULL;
				4807	}
				4808	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4809	if (uformat == NULL)
				4810	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4811	fmt = PyUnicode_AS_UNICODE(uformat);
				4812	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4813
				4814	reslen = rescnt = fmtcnt + 100;
				4815	result = _PyUnicode_New(reslen);
				4816	if (result == NULL)
				4817	goto onError;
				4818	res = PyUnicode_AS_UNICODE(result);
				4819
				4820	if (PyTuple_Check(args)) {
				4821	arglen = PyTuple_Size(args);
				4822	argidx = 0;
				4823	}
				4824	else {
				4825	arglen = -1;
				4826	argidx = -2;
				4827	}
				4828	if (args->ob_type->tp_as_mapping)
				4829	dict = args;
				4830
				4831	while (--fmtcnt >= 0) {
				4832	if (*fmt != '%') {
				4833	if (--rescnt < 0) {
				4834	rescnt = fmtcnt + 100;
				4835	reslen += rescnt;
				4836	if (_PyUnicode_Resize(result, reslen) < 0)
				4837	return NULL;
				4838	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4839	--rescnt;
				4840	}
				4841	res++ = fmt++;
				4842	}
				4843	else {
				4844	/* Got a format specifier */
				4845	int flags = 0;
				4846	int width = -1;
				4847	int prec = -1;
				4848	int size = 0;
				4849	Py_UNICODE c = '\0';
				4850	Py_UNICODE fill;
				4851	PyObject *v = NULL;
				4852	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4853	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4854	Py_UNICODE sign;
				4855	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4856	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4857
				4858	fmt++;
				4859	if (*fmt == '(') {
				4860	Py_UNICODE *keystart;
				4861	int keylen;
				4862	PyObject *key;
				4863	int pcount = 1;
				4864
				4865	if (dict == NULL) {
				4866	PyErr_SetString(PyExc_TypeError,
				4867	"format requires a mapping");
				4868	goto onError;
				4869	}
				4870	++fmt;
				4871	--fmtcnt;
				4872	keystart = fmt;
				4873	/* Skip over balanced parentheses */
				4874	while (pcount > 0 && --fmtcnt >= 0) {
				4875	if (*fmt == ')')
				4876	--pcount;
				4877	else if (*fmt == '(')
				4878	++pcount;
				4879	fmt++;
				4880	}
				4881	keylen = fmt - keystart - 1;
				4882	if (fmtcnt < 0 \|\| pcount > 0) {
				4883	PyErr_SetString(PyExc_ValueError,
				4884	"incomplete format key");
				4885	goto onError;
				4886	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4887	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4888	then looked up since Python uses strings to hold
				4889	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4890	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4891	key = PyUnicode_EncodeUTF8(keystart,
				4892	keylen,
				4893	NULL);
				4894	if (key == NULL)
				4895	goto onError;
				4896	if (args_owned) {
				4897	Py_DECREF(args);
				4898	args_owned = 0;
				4899	}
				4900	args = PyObject_GetItem(dict, key);
				4901	Py_DECREF(key);
				4902	if (args == NULL) {
				4903	goto onError;
				4904	}
				4905	args_owned = 1;
				4906	arglen = -1;
				4907	argidx = -2;
				4908	}
				4909	while (--fmtcnt >= 0) {
				4910	switch (c = *fmt++) {
				4911	case '-': flags \|= F_LJUST; continue;
				4912	case '+': flags \|= F_SIGN; continue;
				4913	case ' ': flags \|= F_BLANK; continue;
				4914	case '#': flags \|= F_ALT; continue;
				4915	case '0': flags \|= F_ZERO; continue;
				4916	}
				4917	break;
				4918	}
				4919	if (c == '*') {
				4920	v = getnextarg(args, arglen, &argidx);
				4921	if (v == NULL)
				4922	goto onError;
				4923	if (!PyInt_Check(v)) {
				4924	PyErr_SetString(PyExc_TypeError,
				4925	"* wants int");
				4926	goto onError;
				4927	}
				4928	width = PyInt_AsLong(v);
				4929	if (width < 0) {
				4930	flags \|= F_LJUST;
				4931	width = -width;
				4932	}
				4933	if (--fmtcnt >= 0)
				4934	c = *fmt++;
				4935	}
				4936	else if (c >= '0' && c <= '9') {
				4937	width = c - '0';
				4938	while (--fmtcnt >= 0) {
				4939	c = *fmt++;
				4940	if (c < '0' \|\| c > '9')
				4941	break;
				4942	if ((width*10) / 10 != width) {
				4943	PyErr_SetString(PyExc_ValueError,
				4944	"width too big");
				4945	goto onError;
				4946	}
				4947	width = width*10 + (c - '0');
				4948	}
				4949	}
				4950	if (c == '.') {
				4951	prec = 0;
				4952	if (--fmtcnt >= 0)
				4953	c = *fmt++;
				4954	if (c == '*') {
				4955	v = getnextarg(args, arglen, &argidx);
				4956	if (v == NULL)
				4957	goto onError;
				4958	if (!PyInt_Check(v)) {
				4959	PyErr_SetString(PyExc_TypeError,
				4960	"* wants int");
				4961	goto onError;
				4962	}
				4963	prec = PyInt_AsLong(v);
				4964	if (prec < 0)
				4965	prec = 0;
				4966	if (--fmtcnt >= 0)
				4967	c = *fmt++;
				4968	}
				4969	else if (c >= '0' && c <= '9') {
				4970	prec = c - '0';
				4971	while (--fmtcnt >= 0) {
				4972	c = Py_CHARMASK(*fmt++);
				4973	if (c < '0' \|\| c > '9')
				4974	break;
				4975	if ((prec*10) / 10 != prec) {
				4976	PyErr_SetString(PyExc_ValueError,
				4977	"prec too big");
				4978	goto onError;
				4979	}
				4980	prec = prec*10 + (c - '0');
				4981	}
				4982	}
				4983	} /* prec */
				4984	if (fmtcnt >= 0) {
				4985	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4986	size = c;
				4987	if (--fmtcnt >= 0)
				4988	c = *fmt++;
				4989	}
				4990	}
				4991	if (fmtcnt < 0) {
				4992	PyErr_SetString(PyExc_ValueError,
				4993	"incomplete format");
				4994	goto onError;
				4995	}
				4996	if (c != '%') {
				4997	v = getnextarg(args, arglen, &argidx);
				4998	if (v == NULL)
				4999	goto onError;
				5000	}
				5001	sign = 0;
				5002	fill = ' ';
				5003	switch (c) {
				5004
				5005	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5006	pbuf = formatbuf;
				5007	/* presume that buffer length is at least 1 */
				5008	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5009	len = 1;
				5010	break;
				5011
				5012	case 's':
				5013	case 'r':
				5014	if (PyUnicode_Check(v) && c == 's') {
				5015	temp = v;
				5016	Py_INCREF(temp);
				5017	}
				5018	else {
				5019	PyObject *unicode;
				5020	if (c == 's')
				5021	temp = PyObject_Str(v);
				5022	else
				5023	temp = PyObject_Repr(v);
				5024	if (temp == NULL)
				5025	goto onError;
				5026	if (!PyString_Check(temp)) {
				5027	/* XXX Note: this should never happen, since
				5028	PyObject_Repr() and PyObject_Str() assure
				5029	this */
				5030	Py_DECREF(temp);
				5031	PyErr_SetString(PyExc_TypeError,
				5032	"%s argument has non-string str()");
				5033	goto onError;
				5034	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5035	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5036	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5037	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5038	"strict");
				5039	Py_DECREF(temp);
				5040	temp = unicode;
				5041	if (temp == NULL)
				5042	goto onError;
				5043	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5044	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5045	len = PyUnicode_GET_SIZE(temp);
				5046	if (prec >= 0 && len > prec)
				5047	len = prec;
				5048	break;
				5049
				5050	case 'i':
				5051	case 'd':
				5052	case 'u':
				5053	case 'o':
				5054	case 'x':
				5055	case 'X':
				5056	if (c == 'i')
				5057	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5058	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5059	temp = formatlong(v, flags, prec, c);
				5060	if (!temp)
				5061	goto onError;
				5062	pbuf = PyUnicode_AS_UNICODE(temp);
				5063	len = PyUnicode_GET_SIZE(temp);
				5064	/* unbounded ints can always produce
				5065	a sign character! */
				5066	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5067	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5068	else {
				5069	pbuf = formatbuf;
				5070	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5071	flags, prec, c, v);
				5072	if (len < 0)
				5073	goto onError;
				5074	/* only d conversion is signed */
				5075	sign = c == 'd';
				5076	}
				5077	if (flags & F_ZERO)
				5078	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5079	break;
				5080
				5081	case 'e':
				5082	case 'E':
				5083	case 'f':
				5084	case 'g':
				5085	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5086	pbuf = formatbuf;
				5087	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5088	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5089	if (len < 0)
				5090	goto onError;
				5091	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5092	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5093	fill = '0';
				5094	break;
				5095
				5096	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5097	pbuf = formatbuf;
				5098	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5099	if (len < 0)
				5100	goto onError;
				5101	break;
				5102
				5103	default:
				5104	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5105	"unsupported format character '%c' (0x%x) "
				5106	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5107	(31<=c && c<=126) ? c : '?',
				5108	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5109	goto onError;
				5110	}
				5111	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5112	if (pbuf == '-' \|\| pbuf == '+') {
				5113	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5114	len--;
				5115	}
				5116	else if (flags & F_SIGN)
				5117	sign = '+';
				5118	else if (flags & F_BLANK)
				5119	sign = ' ';
				5120	else
				5121	sign = 0;
				5122	}
				5123	if (width < len)
				5124	width = len;
				5125	if (rescnt < width + (sign != 0)) {
				5126	reslen -= rescnt;
				5127	rescnt = width + fmtcnt + 100;
				5128	reslen += rescnt;
				5129	if (_PyUnicode_Resize(result, reslen) < 0)
				5130	return NULL;
				5131	res = PyUnicode_AS_UNICODE(result)
				5132	+ reslen - rescnt;
				5133	}
				5134	if (sign) {
				5135	if (fill != ' ')
				5136	*res++ = sign;
				5137	rescnt--;
				5138	if (width > len)
				5139	width--;
				5140	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5141	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5142	assert(pbuf[0] == '0');
				5143	assert(pbuf[1] == c);
				5144	if (fill != ' ') {
				5145	res++ = pbuf++;
				5146	res++ = pbuf++;
				5147	}
				5148	rescnt -= 2;
				5149	width -= 2;
				5150	if (width < 0)
				5151	width = 0;
				5152	len -= 2;
				5153	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5154	if (width > len && !(flags & F_LJUST)) {
				5155	do {
				5156	--rescnt;
				5157	*res++ = fill;
				5158	} while (--width > len);
				5159	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5160	if (fill == ' ') {
				5161	if (sign)
				5162	*res++ = sign;
				5163	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5164	assert(pbuf[0] == '0');
				5165	assert(pbuf[1] == c);
				5166	res++ = pbuf++;
				5167	res++ = pbuf++;
				5168	}
				5169	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5170	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5171	res += len;
				5172	rescnt -= len;
				5173	while (--width >= len) {
				5174	--rescnt;
				5175	*res++ = ' ';
				5176	}
				5177	if (dict && (argidx < arglen) && c != '%') {
				5178	PyErr_SetString(PyExc_TypeError,
				5179	"not all arguments converted");
				5180	goto onError;
				5181	}
				5182	Py_XDECREF(temp);
				5183	} /* '%' */
				5184	} /* until end */
				5185	if (argidx < arglen && !dict) {
				5186	PyErr_SetString(PyExc_TypeError,
				5187	"not all arguments converted");
				5188	goto onError;
				5189	}
				5190
				5191	if (args_owned) {
				5192	Py_DECREF(args);
				5193	}
				5194	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5195	if (_PyUnicode_Resize(result, reslen - rescnt))
				5196	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5197	return (PyObject *)result;
				5198
				5199	onError:
				5200	Py_XDECREF(result);
				5201	Py_DECREF(uformat);
				5202	if (args_owned) {
				5203	Py_DECREF(args);
				5204	}
				5205	return NULL;
				5206	}
				5207
				5208	static PyBufferProcs unicode_as_buffer = {
				5209	(getreadbufferproc) unicode_buffer_getreadbuf,
				5210	(getwritebufferproc) unicode_buffer_getwritebuf,
				5211	(getsegcountproc) unicode_buffer_getsegcount,
				5212	(getcharbufferproc) unicode_buffer_getcharbuf,
				5213	};
				5214
				5215	PyTypeObject PyUnicode_Type = {
				5216	PyObject_HEAD_INIT(&PyType_Type)
				5217	0, /* ob_size */
				5218	"unicode", /* tp_name */
				5219	sizeof(PyUnicodeObject), /* tp_size */
				5220	0, /* tp_itemsize */
				5221	/* Slots */
				5222	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5223	0, /* tp_print */
				5224	(getattrfunc)unicode_getattr, /* tp_getattr */
				5225	0, /* tp_setattr */
				5226	(cmpfunc) unicode_compare, /* tp_compare */
				5227	(reprfunc) unicode_repr, /* tp_repr */
				5228	0, /* tp_as_number */
				5229	&unicode_as_sequence, /* tp_as_sequence */
				5230	0, /* tp_as_mapping */
				5231	(hashfunc) unicode_hash, /* tp_hash*/
				5232	0, /* tp_call*/
				5233	(reprfunc) unicode_str, /* tp_str */
				5234	(getattrofunc) NULL, /* tp_getattro */
				5235	(setattrofunc) NULL, /* tp_setattro */
				5236	&unicode_as_buffer, /* tp_as_buffer */
				5237	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5238	};
				5239
				5240	/* Initialize the Unicode implementation */
				5241
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5242	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5243	{
				5244	/* Doublecheck the configuration... */
				5245	if (sizeof(Py_UNICODE) != 2)
				5246	Py_FatalError("Unicode configuration error: "
				5247	"sizeof(Py_UNICODE) != 2 bytes");
				5248
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5249	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5250	unicode_freelist = NULL;
				5251	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5252	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5253	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5254	}
				5255
				5256	/* Finalize the Unicode implementation */
				5257
				5258	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5259	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5260	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5261	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5262
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5263	Py_XDECREF(unicode_empty);
				5264	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5265
				5266	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5267	PyUnicodeObject *v = u;
				5268	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5269	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5270	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5271	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5272	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5273	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5274	unicode_freelist = NULL;
				5275	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5276	}