Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 5ee72bd128df8d94638c2113debf50d2acac44e5 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	70	#ifdef MS_WIN32
				71	#include <windows.h>
				72	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	73
/* Upper bound on the number of Unicode objects parked on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   A Unicode object that is put on the free list keeps its character
   buffer allocated as long as its size is below this limit. This
   saves malloc() round trips for small Unicode objects.

   In the worst case the result is MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of memory that is held but unused.

   A limit of 0 effectively switches the optimization off.

   Note: this feature is experimental. If Unicode objects cause core
   dumps, disable it.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				104
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	105	/* --- Globals ------------------------------------------------------------
				106
				107	The globals are initialized by the _PyUnicode_Init() API and should
				108	not be used before calling that API.
				109
				110	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	111
				112	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	113	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	116	static PyUnicodeObject *unicode_freelist;
				117	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	119	/* Default encoding to use and assume when NULL is passed as encoding
				120	parameter; it is initialized by _PyUnicode_Init().
				121
				122	Always use the PyUnicode_SetDefaultEncoding() and
				123	PyUnicode_GetDefaultEncoding() APIs to access this global.
				124
				125	*/
				126
				127	static char unicode_default_encoding[100];
				128
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	/* --- Unicode Object ----------------------------------------------------- */
				130
				131	static
				132	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				133	int length)
				134	{
				135	void *oldstr;
				136
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	137	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	138	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	139	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	140
				141	/* Resizing unicode_empty is not allowed. */
				142	if (unicode == unicode_empty) {
				143	PyErr_SetString(PyExc_SystemError,
				144	"can't resize empty unicode object");
				145	return -1;
				146	}
				147
				148	/* We allocate one more byte to make sure the string is
				149	Ux0000 terminated -- XXX is this needed ? */
				150	oldstr = unicode->str;
				151	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				152	if (!unicode->str) {
				153	unicode->str = oldstr;
				154	PyErr_NoMemory();
				155	return -1;
				156	}
				157	unicode->str[length] = 0;
				158	unicode->length = length;
				159
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	160	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	161	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	162	if (unicode->defenc) {
				163	Py_DECREF(unicode->defenc);
				164	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	165	}
				166	unicode->hash = -1;
				167
				168	return 0;
				169	}
				170
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	171	int PyUnicode_Resize(PyObject **unicode,
				172	int length)
				173	{
				174	PyUnicodeObject *v;
				175
				176	if (unicode == NULL) {
				177	PyErr_BadInternalCall();
				178	return -1;
				179	}
				180	v = (PyUnicodeObject )unicode;
				181	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				182	PyErr_BadInternalCall();
				183	return -1;
				184	}
				185	return _PyUnicode_Resize(v, length);
				186	}
				187
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	/* We allocate one more byte to make sure the string is
				189	Ux0000 terminated -- XXX is this needed ?
				190
				191	XXX This allocator could further be enhanced by assuring that the
				192	free list never reduces its size below 1.
				193
				194	*/
				195
				196	static
				197	PyUnicodeObject *_PyUnicode_New(int length)
				198	{
				199	register PyUnicodeObject *unicode;
				200
				201	/* Optimization for empty strings */
				202	if (length == 0 && unicode_empty != NULL) {
				203	Py_INCREF(unicode_empty);
				204	return unicode_empty;
				205	}
				206
				207	/* Unicode freelist & memory allocation */
				208	if (unicode_freelist) {
				209	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	210	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	212	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	213	/* Keep-Alive optimization: we only upsize the buffer,
				214	never downsize it. */
				215	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	216	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	217	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	218	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	}
				220	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	221	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	223	}
				224	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	else {
				227	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				228	if (unicode == NULL)
				229	return NULL;
				230	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				231	}
				232
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	233	if (!unicode->str) {
				234	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	235	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	236	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str[length] = 0;
				238	unicode->length = length;
				239	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	240	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242
				243	onError:
				244	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	246	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	}
				248
				249	static
				250	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				251	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	253	/* Keep-Alive optimization */
				254	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	255	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	256	unicode->str = NULL;
				257	unicode->length = 0;
				258	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	259	if (unicode->defenc) {
				260	Py_DECREF(unicode->defenc);
				261	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	262	}
				263	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	264	(PyUnicodeObject *)unicode = unicode_freelist;
				265	unicode_freelist = unicode;
				266	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	267	}
				268	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	269	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	270	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	271	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	}
				274
				275	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				276	int size)
				277	{
				278	PyUnicodeObject *unicode;
				279
				280	unicode = _PyUnicode_New(size);
				281	if (!unicode)
				282	return NULL;
				283
				284	/* Copy the Unicode data into the new object */
				285	if (u != NULL)
				286	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				287
				288	return (PyObject *)unicode;
				289	}
				290
				291	#ifdef HAVE_WCHAR_H
				292
				293	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				294	int size)
				295	{
				296	PyUnicodeObject *unicode;
				297
				298	if (w == NULL) {
				299	PyErr_BadInternalCall();
				300	return NULL;
				301	}
				302
				303	unicode = _PyUnicode_New(size);
				304	if (!unicode)
				305	return NULL;
				306
				307	/* Copy the wchar_t data into the new object */
				308	#ifdef HAVE_USABLE_WCHAR_T
				309	memcpy(unicode->str, w, size * sizeof(wchar_t));
				310	#else
				311	{
				312	register Py_UNICODE *u;
				313	register int i;
				314	u = PyUnicode_AS_UNICODE(unicode);
				315	for (i = size; i >= 0; i--)
				316	u++ = w++;
				317	}
				318	#endif
				319
				320	return (PyObject *)unicode;
				321	}
				322
				323	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				324	register wchar_t *w,
				325	int size)
				326	{
				327	if (unicode == NULL) {
				328	PyErr_BadInternalCall();
				329	return -1;
				330	}
				331	if (size > PyUnicode_GET_SIZE(unicode))
				332	size = PyUnicode_GET_SIZE(unicode);
				333	#ifdef HAVE_USABLE_WCHAR_T
				334	memcpy(w, unicode->str, size * sizeof(wchar_t));
				335	#else
				336	{
				337	register Py_UNICODE *u;
				338	register int i;
				339	u = PyUnicode_AS_UNICODE(unicode);
				340	for (i = size; i >= 0; i--)
				341	w++ = u++;
				342	}
				343	#endif
				344
				345	return size;
				346	}
				347
				348	#endif
				349
				350	PyObject PyUnicode_FromObject(register PyObject obj)
				351	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	352	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				353	}
				354
				355	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				356	const char *encoding,
				357	const char *errors)
				358	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	const char *s;
				360	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	361	int owned = 0;
				362	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	363
				364	if (obj == NULL) {
				365	PyErr_BadInternalCall();
				366	return NULL;
				367	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368
				369	/* Coerce object */
				370	if (PyInstance_Check(obj)) {
				371	PyObject *func;
				372	func = PyObject_GetAttrString(obj, "__str__");
				373	if (func == NULL) {
				374	PyErr_SetString(PyExc_TypeError,
				375	"coercing to Unicode: instance doesn't define __str__");
				376	return NULL;
				377	}
				378	obj = PyEval_CallObject(func, NULL);
				379	Py_DECREF(func);
				380	if (obj == NULL)
				381	return NULL;
				382	owned = 1;
				383	}
				384	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	385	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = obj;
				387	if (encoding) {
				388	PyErr_SetString(PyExc_TypeError,
				389	"decoding Unicode is not supported");
				390	return NULL;
				391	}
				392	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	393	}
				394	else if (PyString_Check(obj)) {
				395	s = PyString_AS_STRING(obj);
				396	len = PyString_GET_SIZE(obj);
				397	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	398	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				399	/* Overwrite the error message with something more useful in
				400	case of a TypeError. */
				401	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	402	PyErr_Format(PyExc_TypeError,
				403	"coercing to Unicode: need string or buffer, "
				404	"%.80s found",
				405	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	406	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	407	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	408
				409	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	410	if (len == 0) {
				411	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414	else
				415	v = PyUnicode_Decode(s, len, encoding, errors);
				416	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	417	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	return v;
				421
				422	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	}
				428
				429	PyObject PyUnicode_Decode(const char s,
				430	int size,
				431	const char *encoding,
				432	const char *errors)
				433	{
				434	PyObject buffer = NULL, unicode;
				435
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	436	if (encoding == NULL)
				437	encoding = PyUnicode_GetDefaultEncoding();
				438
				439	/* Shortcuts for common default encodings */
				440	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	441	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	else if (strcmp(encoding, "latin-1") == 0)
				443	return PyUnicode_DecodeLatin1(s, size, errors);
				444	else if (strcmp(encoding, "ascii") == 0)
				445	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	446
				447	/* Decode via the codec registry */
				448	buffer = PyBuffer_FromMemory((void *)s, size);
				449	if (buffer == NULL)
				450	goto onError;
				451	unicode = PyCodec_Decode(buffer, encoding, errors);
				452	if (unicode == NULL)
				453	goto onError;
				454	if (!PyUnicode_Check(unicode)) {
				455	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	456	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	457	unicode->ob_type->tp_name);
				458	Py_DECREF(unicode);
				459	goto onError;
				460	}
				461	Py_DECREF(buffer);
				462	return unicode;
				463
				464	onError:
				465	Py_XDECREF(buffer);
				466	return NULL;
				467	}
				468
				469	PyObject PyUnicode_Encode(const Py_UNICODE s,
				470	int size,
				471	const char *encoding,
				472	const char *errors)
				473	{
				474	PyObject v, unicode;
				475
				476	unicode = PyUnicode_FromUnicode(s, size);
				477	if (unicode == NULL)
				478	return NULL;
				479	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				480	Py_DECREF(unicode);
				481	return v;
				482	}
				483
				484	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				485	const char *encoding,
				486	const char *errors)
				487	{
				488	PyObject *v;
				489
				490	if (!PyUnicode_Check(unicode)) {
				491	PyErr_BadArgument();
				492	goto onError;
				493	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	494
				495	if (encoding == NULL)
				496	encoding = PyUnicode_GetDefaultEncoding();
				497
				498	/* Shortcuts for common default encodings */
				499	if (errors == NULL) {
				500	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	501	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	502	else if (strcmp(encoding, "latin-1") == 0)
				503	return PyUnicode_AsLatin1String(unicode);
				504	else if (strcmp(encoding, "ascii") == 0)
				505	return PyUnicode_AsASCIIString(unicode);
				506	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507
				508	/* Encode via the codec registry */
				509	v = PyCodec_Encode(unicode, encoding, errors);
				510	if (v == NULL)
				511	goto onError;
				512	/* XXX Should we really enforce this ? */
				513	if (!PyString_Check(v)) {
				514	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	515	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	516	v->ob_type->tp_name);
				517	Py_DECREF(v);
				518	goto onError;
				519	}
				520	return v;
				521
				522	onError:
				523	return NULL;
				524	}
				525
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	526	/* Return a Python string holding the default encoded value of the
				527	Unicode object.
				528
				529	The resulting string is cached in the Unicode object for subsequent
				530	usage by this function. The cached version is needed to implement
				531	the character buffer interface and will live (at least) as long as
				532	the Unicode object itself.
				533
				534	The refcount of the string is not incremented.
				535
				536	* Exported for internal use by the interpreter only !!! *
				537
				538	*/
				539
				540	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				541	const char *errors)
				542	{
				543	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				544
				545	if (v)
				546	return v;
				547	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				548	if (v && errors == NULL)
				549	((PyUnicodeObject *)unicode)->defenc = v;
				550	return v;
				551	}
				552
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	553	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				554	{
				555	if (!PyUnicode_Check(unicode)) {
				556	PyErr_BadArgument();
				557	goto onError;
				558	}
				559	return PyUnicode_AS_UNICODE(unicode);
				560
				561	onError:
				562	return NULL;
				563	}
				564
				565	int PyUnicode_GetSize(PyObject *unicode)
				566	{
				567	if (!PyUnicode_Check(unicode)) {
				568	PyErr_BadArgument();
				569	goto onError;
				570	}
				571	return PyUnicode_GET_SIZE(unicode);
				572
				573	onError:
				574	return -1;
				575	}
				576
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	577	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	578	{
				579	return unicode_default_encoding;
				580	}
				581
				582	int PyUnicode_SetDefaultEncoding(const char *encoding)
				583	{
				584	PyObject *v;
				585
				586	/* Make sure the encoding is valid. As side effect, this also
				587	loads the encoding into the codec registry cache. */
				588	v = _PyCodec_Lookup(encoding);
				589	if (v == NULL)
				590	goto onError;
				591	Py_DECREF(v);
				592	strncpy(unicode_default_encoding,
				593	encoding,
				594	sizeof(unicode_default_encoding));
				595	return 0;
				596
				597	onError:
				598	return -1;
				599	}
				600
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	601	/* --- UTF-8 Codec -------------------------------------------------------- */
				602
				603	static
				604	char utf8_code_length[256] = {
				605	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				606	illegal prefix. see RFC 2279 for details */
				607	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				608	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				609	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				610	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				611	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				612	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				616	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				617	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				618	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				619	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				620	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				621	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				622	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				623	};
				624
				625	static
				626	int utf8_decoding_error(const char **source,
				627	Py_UNICODE **dest,
				628	const char *errors,
				629	const char *details)
				630	{
				631	if ((errors == NULL) \|\|
				632	(strcmp(errors,"strict") == 0)) {
				633	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	634	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	635	details);
				636	return -1;
				637	}
				638	else if (strcmp(errors,"ignore") == 0) {
				639	(*source)++;
				640	return 0;
				641	}
				642	else if (strcmp(errors,"replace") == 0) {
				643	(*source)++;
				644	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				645	(*dest)++;
				646	return 0;
				647	}
				648	else {
				649	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	650	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	651	errors);
				652	return -1;
				653	}
				654	}
				655
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	656	PyObject PyUnicode_DecodeUTF8(const char s,
				657	int size,
				658	const char *errors)
				659	{
				660	int n;
				661	const char *e;
				662	PyUnicodeObject *unicode;
				663	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	665
				666	/* Note: size will always be longer than the resulting Unicode
				667	character count */
				668	unicode = _PyUnicode_New(size);
				669	if (!unicode)
				670	return NULL;
				671	if (size == 0)
				672	return (PyObject *)unicode;
				673
				674	/* Unpack UTF-8 encoded data */
				675	p = unicode->str;
				676	e = s + size;
				677
				678	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	679	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	680
				681	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	682	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	683	s++;
				684	continue;
				685	}
				686
				687	n = utf8_code_length[ch];
				688
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	689	if (s + n > e) {
				690	errmsg = "unexpected end of data";
				691	goto utf8Error;
				692	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693
				694	switch (n) {
				695
				696	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	errmsg = "unexpected code byte";
				698	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	break;
				700
				701	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	errmsg = "internal error";
				703	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	704	break;
				705
				706	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	if ((s[1] & 0xc0) != 0x80) {
				708	errmsg = "invalid data";
				709	goto utf8Error;
				710	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	711	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	712	if (ch < 0x80) {
				713	errmsg = "illegal encoding";
				714	goto utf8Error;
				715	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	716	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	717	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	718	break;
				719
				720	case 3:
				721	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	722	(s[2] & 0xc0) != 0x80) {
				723	errmsg = "invalid data";
				724	goto utf8Error;
				725	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	727	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				728	errmsg = "illegal encoding";
				729	goto utf8Error;
				730	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	731	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	732	*p++ = (Py_UNICODE)ch;
				733	break;
				734
				735	case 4:
				736	if ((s[1] & 0xc0) != 0x80 \|\|
				737	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	738	(s[3] & 0xc0) != 0x80) {
				739	errmsg = "invalid data";
				740	goto utf8Error;
				741	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	742	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				743	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				744	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	745	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				746	byte encoding */
				747	(ch > 0x10ffff)) { /* maximum value allowed for
				748	UTF-16 */
				749	errmsg = "illegal encoding";
				750	goto utf8Error;
				751	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	752	/* compute and append the two surrogates: */
				753
				754	/* translate from 10000..10FFFF to 0..FFFF */
				755	ch -= 0x10000;
				756
				757	/* high surrogate = top 10 bits added to D800 */
				758	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				759
				760	/* low surrogate = bottom 10 bits added to DC00 */
				761	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	762	break;
				763
				764	default:
				765	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	766	errmsg = "unsupported Unicode code range";
				767	goto utf8Error;
				768	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	769	}
				770	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	771	continue;
				772
				773	utf8Error:
				774	if (utf8_decoding_error(&s, &p, errors, errmsg))
				775	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	776	}
				777
				778	/* Adjust length */
				779	if (_PyUnicode_Resize(unicode, p - unicode->str))
				780	goto onError;
				781
				782	return (PyObject *)unicode;
				783
				784	onError:
				785	Py_DECREF(unicode);
				786	return NULL;
				787	}
				788
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the requested error policy for a UTF-8 encoding problem.

   errors == NULL or "strict": set UnicodeError (with details), return -1.
   "ignore":  do nothing, return 0.
   "replace": store '?' at *dest, advance *dest by one, return 0.
   anything else: set ValueError naming the unknown policy, return -1.

   source is accepted but not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	822
				823	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				824	int size,
				825	const char *errors)
				826	{
				827	PyObject *v;
				828	char *p;
				829	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	830	Py_UCS4 ch2;
				831	unsigned int cbAllocated = 3 * size;
				832	unsigned int cbWritten = 0;
				833	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	834
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	835	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	836	if (v == NULL)
				837	return NULL;
				838	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	839	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
				841	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	842	while (i < size) {
				843	Py_UCS4 ch = s[i++];
				844	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	845	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	846	cbWritten++;
				847	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	848	else if (ch < 0x0800) {
				849	*p++ = 0xc0 \| (ch >> 6);
				850	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	851	cbWritten += 2;
				852	}
				853	else {
				854	/* Check for high surrogate */
				855	if (0xD800 <= ch && ch <= 0xDBFF) {
				856	if (i != size) {
				857	ch2 = s[i];
				858	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				859
				860	if (cbWritten >= (cbAllocated - 4)) {
				861	/* Provide enough room for some more
				862	surrogates */
				863	cbAllocated += 4*10;
				864	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	865	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	866	}
				867
				868	/* combine the two values */
				869	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				870
				871	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	872	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	873	i++;
				874	cbWritten += 4;
				875	}
				876	}
				877	}
				878	else {
				879	*p++ = (char)(0xe0 \| (ch >> 12));
				880	cbWritten += 3;
				881	}
				882	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				883	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	884	}
				885	}
				886	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	887	if (_PyString_Resize(&v, p - q))
				888	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	889	return v;
				890
				891	onError:
				892	Py_DECREF(v);
				893	return NULL;
				894	}
				895
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	896	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				897	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	898	if (!PyUnicode_Check(unicode)) {
				899	PyErr_BadArgument();
				900	return NULL;
				901	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	902	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				903	PyUnicode_GET_SIZE(unicode),
				904	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	905	}
				906
				907	/* --- UTF-16 Codec ------------------------------------------------------- */
				908
				909	static
				910	int utf16_decoding_error(const Py_UNICODE **source,
				911	Py_UNICODE **dest,
				912	const char *errors,
				913	const char *details)
				914	{
				915	if ((errors == NULL) \|\|
				916	(strcmp(errors,"strict") == 0)) {
				917	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	918	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	919	details);
				920	return -1;
				921	}
				922	else if (strcmp(errors,"ignore") == 0) {
				923	return 0;
				924	}
				925	else if (strcmp(errors,"replace") == 0) {
				926	if (dest) {
				927	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				928	(*dest)++;
				929	}
				930	return 0;
				931	}
				932	else {
				933	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	934	"UTF-16 decoding error; "
				935	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	errors);
				937	return -1;
				938	}
				939	}
				940
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	941	PyObject PyUnicode_DecodeUTF16(const char s,
				942	int size,
				943	const char *errors,
				944	int *byteorder)
				945	{
				946	PyUnicodeObject *unicode;
				947	Py_UNICODE *p;
				948	const Py_UNICODE q, e;
				949	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	950	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951
				952	/* size should be an even number */
				953	if (size % sizeof(Py_UNICODE) != 0) {
				954	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				955	return NULL;
				956	/* The remaining input chars are ignored if we fall through
				957	here... */
				958	}
				959
				960	/* Note: size will always be longer than the resulting Unicode
				961	character count */
				962	unicode = _PyUnicode_New(size);
				963	if (!unicode)
				964	return NULL;
				965	if (size == 0)
				966	return (PyObject *)unicode;
				967
				968	/* Unpack UTF-16 encoded data */
				969	p = unicode->str;
				970	q = (Py_UNICODE *)s;
				971	e = q + (size / sizeof(Py_UNICODE));
				972
				973	if (byteorder)
				974	bo = *byteorder;
				975
				976	while (q < e) {
				977	register Py_UNICODE ch = *q++;
				978
				979	/* Check for BOM marks (U+FEFF) in the input and adjust
				980	current byte order setting accordingly. Swap input
				981	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				982	!) */
				983	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				984	if (ch == 0xFEFF) {
				985	bo = -1;
				986	continue;
				987	} else if (ch == 0xFFFE) {
				988	bo = 1;
				989	continue;
				990	}
				991	if (bo == 1)
				992	ch = (ch >> 8) \| (ch << 8);
				993	#else
				994	if (ch == 0xFEFF) {
				995	bo = 1;
				996	continue;
				997	} else if (ch == 0xFFFE) {
				998	bo = -1;
				999	continue;
				1000	}
				1001	if (bo == -1)
				1002	ch = (ch >> 8) \| (ch << 8);
				1003	#endif
				1004	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1005	*p++ = ch;
				1006	continue;
				1007	}
				1008
				1009	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1010	if (q >= e) {
				1011	errmsg = "unexpected end of data";
				1012	goto utf16Error;
				1013	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1014	if (0xDC00 <= q && q <= 0xDFFF) {
				1015	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1016	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1017	/* This is valid data (a UTF-16 surrogate pair), but
				1018	we are not able to store this information since our
				1019	Py_UNICODE type only has 16 bits... this might
				1020	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1021	errmsg = "code pairs are not supported";
				1022	goto utf16Error;
				1023	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1024	else
				1025	continue;
				1026	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1027	errmsg = "illegal encoding";
				1028	/* Fall through to report the error */
				1029
				1030	utf16Error:
				1031	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1032	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1033	}
				1034
				1035	if (byteorder)
				1036	*byteorder = bo;
				1037
				1038	/* Adjust length */
				1039	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1040	goto onError;
				1041
				1042	return (PyObject *)unicode;
				1043
				1044	onError:
				1045	Py_DECREF(unicode);
				1046	return NULL;
				1047	}
				1048
				1049	#undef UTF16_ERROR
				1050
				1051	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1052	int size,
				1053	const char *errors,
				1054	int byteorder)
				1055	{
				1056	PyObject *v;
				1057	Py_UNICODE *p;
				1058	char *q;
				1059
				1060	/* We don't create UTF-16 pairs... */
				1061	v = PyString_FromStringAndSize(NULL,
				1062	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1063	if (v == NULL)
				1064	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1065
				1066	q = PyString_AS_STRING(v);
				1067	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1068	if (byteorder == 0)
				1069	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1070	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1071	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1072	if (byteorder == 0 \|\|
				1073	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1074	byteorder == -1
				1075	#else
				1076	byteorder == 1
				1077	#endif
				1078	)
				1079	memcpy(p, s, size * sizeof(Py_UNICODE));
				1080	else
				1081	while (size-- > 0) {
				1082	Py_UNICODE ch = *s++;
				1083	*p++ = (ch >> 8) \| (ch << 8);
				1084	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1085	return v;
				1086	}
				1087
				1088	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1089	{
				1090	if (!PyUnicode_Check(unicode)) {
				1091	PyErr_BadArgument();
				1092	return NULL;
				1093	}
				1094	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1095	PyUnicode_GET_SIZE(unicode),
				1096	NULL,
				1097	0);
				1098	}
				1099
				1100	/* --- Unicode Escape Codec ----------------------------------------------- */
				1101
				1102	static
				1103	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1104	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1105	const char *errors,
				1106	const char *details)
				1107	{
				1108	if ((errors == NULL) \|\|
				1109	(strcmp(errors,"strict") == 0)) {
				1110	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1111	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1112	details);
				1113	return -1;
				1114	}
				1115	else if (strcmp(errors,"ignore") == 0) {
				1116	return 0;
				1117	}
				1118	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1119	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1120	return 0;
				1121	}
				1122	else {
				1123	PyErr_Format(PyExc_ValueError,
				1124	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1125	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1126	errors);
				1127	return -1;
				1128	}
				1129	}
				1130
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1131	static _Py_UCNHashAPI *pucnHash = NULL;
				1132
				1133	static
				1134	int mystrnicmp(const char s1, const char s2, size_t count)
				1135	{
				1136	char c1, c2;
				1137
				1138	if (count)
				1139	{
				1140	do
				1141	{
				1142	c1 = tolower(*(s1++));
				1143	c2 = tolower(*(s2++));
				1144	}
				1145	while(--count && c1 == c2);
				1146
				1147	return c1 - c2;
				1148	}
				1149
				1150	return 0;
				1151	}
				1152
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1153	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1154	int size,
				1155	const char *errors)
				1156	{
				1157	PyUnicodeObject *v;
				1158	Py_UNICODE p = NULL, buf = NULL;
				1159	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1160	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161
				1162	/* Escaped strings will always be longer than the resulting
				1163	Unicode string, so we start with size here and then reduce the
				1164	length after conversion to the true value. */
				1165	v = _PyUnicode_New(size);
				1166	if (v == NULL)
				1167	goto onError;
				1168	if (size == 0)
				1169	return (PyObject *)v;
				1170	p = buf = PyUnicode_AS_UNICODE(v);
				1171	end = s + size;
				1172	while (s < end) {
				1173	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1174	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	int i;
				1176
				1177	/* Non-escape characters are interpreted as Unicode ordinals */
				1178	if (*s != '\\') {
				1179	p++ = (unsigned char)s++;
				1180	continue;
				1181	}
				1182
				1183	/* \ - Escapes */
				1184	s++;
				1185	switch (*s++) {
				1186
				1187	/* \x escapes */
				1188	case '\n': break;
				1189	case '\\': *p++ = '\\'; break;
				1190	case '\'': *p++ = '\''; break;
				1191	case '\"': *p++ = '\"'; break;
				1192	case 'b': *p++ = '\b'; break;
				1193	case 'f': p++ = '\014'; break; / FF */
				1194	case 't': *p++ = '\t'; break;
				1195	case 'n': *p++ = '\n'; break;
				1196	case 'r': *p++ = '\r'; break;
				1197	case 'v': p++ = '\013'; break; / VT */
				1198	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1199
				1200	/* \OOO (octal) escapes */
				1201	case '0': case '1': case '2': case '3':
				1202	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1203	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1204	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1205	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1206	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1207	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1208	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1209	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	break;
				1211
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1212	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1214	for (x = 0, i = 0; i < 2; i++) {
				1215	c = (unsigned char)s[i];
				1216	if (!isxdigit(c)) {
				1217	if (unicodeescape_decoding_error(&s, &x, errors,
				1218	"truncated \\xXX"))
				1219	goto onError;
				1220	i++;
				1221	break;
				1222	}
				1223	x = (x<<4) & ~0xF;
				1224	if (c >= '0' && c <= '9')
				1225	x += c - '0';
				1226	else if (c >= 'a' && c <= 'f')
				1227	x += 10 + c - 'a';
				1228	else
				1229	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1230	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1231	s += i;
				1232	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1233	break;
				1234
				1235	/* \uXXXX with 4 hex digits */
				1236	case 'u':
				1237	for (x = 0, i = 0; i < 4; i++) {
				1238	c = (unsigned char)s[i];
				1239	if (!isxdigit(c)) {
				1240	if (unicodeescape_decoding_error(&s, &x, errors,
				1241	"truncated \\uXXXX"))
				1242	goto onError;
				1243	i++;
				1244	break;
				1245	}
				1246	x = (x<<4) & ~0xF;
				1247	if (c >= '0' && c <= '9')
				1248	x += c - '0';
				1249	else if (c >= 'a' && c <= 'f')
				1250	x += 10 + c - 'a';
				1251	else
				1252	x += 10 + c - 'A';
				1253	}
				1254	s += i;
				1255	*p++ = x;
				1256	break;
				1257
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1258	/* \UXXXXXXXX with 8 hex digits */
				1259	case 'U':
				1260	for (chr = 0, i = 0; i < 8; i++) {
				1261	c = (unsigned char)s[i];
				1262	if (!isxdigit(c)) {
				1263	if (unicodeescape_decoding_error(&s, &x, errors,
				1264	"truncated \\uXXXX"))
				1265	goto onError;
				1266	i++;
				1267	break;
				1268	}
				1269	chr = (chr<<4) & ~0xF;
				1270	if (c >= '0' && c <= '9')
				1271	chr += c - '0';
				1272	else if (c >= 'a' && c <= 'f')
				1273	chr += 10 + c - 'a';
				1274	else
				1275	chr += 10 + c - 'A';
				1276	}
				1277	s += i;
				1278	goto store;
				1279
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1280	case 'N':
				1281	/* Ok, we need to deal with Unicode Character Names now,
				1282	* make sure we've imported the hash table data...
				1283	*/
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1284	if (pucnHash == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1285	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1286	mod = PyImport_ImportModule("ucnhash");
				1287	if (mod == NULL)
				1288	goto onError;
				1289	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1290	Py_DECREF(mod);
				1291	if (v == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1292	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1293	pucnHash = PyCObject_AsVoidPtr(v);
				1294	Py_DECREF(v);
				1295	if (pucnHash == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1296	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1297	}
				1298
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1299	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1300	const char *start = s + 1;
				1301	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1302	unsigned long j;
				1303
				1304	/* look for either the closing brace, or we
				1305	* exceed the maximum length of the unicode character names
				1306	*/
				1307	while (*endBrace != '}' &&
				1308	(unsigned int)(endBrace - start) <=
				1309	pucnHash->cchMax &&
				1310	endBrace < end)
				1311	{
				1312	endBrace++;
				1313	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1314	if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1315	j = pucnHash->hash(start, endBrace - start);
				1316	if (j > pucnHash->cKeys \|\|
				1317	mystrnicmp(
				1318	start,
				1319	((_Py_UnicodeCharacterName *)
				1320	(pucnHash->getValue(j)))->pszUCN,
				1321	(int)(endBrace - start)) != 0)
				1322	{
				1323	if (unicodeescape_decoding_error(
				1324	&s, &x, errors,
				1325	"Invalid Unicode Character Name"))
				1326	{
				1327	goto onError;
				1328	}
				1329	goto ucnFallthrough;
				1330	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1331	chr = ((_Py_UnicodeCharacterName *)
				1332	(pucnHash->getValue(j)))->value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1333	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1334	goto store;
				1335	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1336	if (unicodeescape_decoding_error(
				1337	&s, &x, errors,
				1338	"Unicode name missing closing brace"))
				1339	goto onError;
				1340	goto ucnFallthrough;
				1341	}
				1342	break;
				1343	}
				1344	if (unicodeescape_decoding_error(
				1345	&s, &x, errors,
				1346	"Missing opening brace for Unicode Character Name escape"))
				1347	goto onError;
				1348	ucnFallthrough:
				1349	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1350	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1351	*p++ = '\\';
				1352	*p++ = (unsigned char)s[-1];
				1353	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1354	store:
				1355	/* when we get here, chr is a 32-bit unicode character */
				1356	if (chr <= 0xffff)
				1357	/* UCS-2 character */
				1358	*p++ = (Py_UNICODE) chr;
				1359	else if (chr <= 0x10ffff) {
				1360	/* UCS-4 character. store as two surrogate characters */
				1361	chr -= 0x10000L;
				1362	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1363	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1364	} else {
				1365	if (unicodeescape_decoding_error(
				1366	&s, &x, errors,
				1367	"Illegal Unicode character")
				1368	)
				1369	goto onError;
				1370	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1371	}
				1372	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1373	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1374	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1375	return (PyObject *)v;
				1376
				1377	onError:
				1378	Py_XDECREF(v);
				1379	return NULL;
				1380	}
				1381
				1382	/* Return a Unicode-Escape string version of the Unicode object.
				1383
				1384	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1385	appropriate.
				1386
				1387	*/
				1388
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1389	static const Py_UNICODE findchar(const Py_UNICODE s,
				1390	int size,
				1391	Py_UNICODE ch);
				1392
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1393	static
				1394	PyObject unicodeescape_string(const Py_UNICODE s,
				1395	int size,
				1396	int quotes)
				1397	{
				1398	PyObject *repr;
				1399	char *p;
				1400	char *q;
				1401
				1402	static const char *hexdigit = "0123456789ABCDEF";
				1403
				1404	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1405	if (repr == NULL)
				1406	return NULL;
				1407
				1408	p = q = PyString_AS_STRING(repr);
				1409
				1410	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1411	*p++ = 'u';
				1412	*p++ = (findchar(s, size, '\'') &&
				1413	!findchar(s, size, '"')) ? '"' : '\'';
				1414	}
				1415	while (size-- > 0) {
				1416	Py_UNICODE ch = *s++;
				1417	/* Escape quotes */
				1418	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1419	*p++ = '\\';
				1420	*p++ = (char) ch;
				1421	}
				1422	/* Map 16-bit characters to '\uxxxx' */
				1423	else if (ch >= 256) {
				1424	*p++ = '\\';
				1425	*p++ = 'u';
				1426	*p++ = hexdigit[(ch >> 12) & 0xf];
				1427	*p++ = hexdigit[(ch >> 8) & 0xf];
				1428	*p++ = hexdigit[(ch >> 4) & 0xf];
				1429	*p++ = hexdigit[ch & 15];
				1430	}
				1431	/* Map non-printable US ASCII to '\ooo' */
				1432	else if (ch < ' ' \|\| ch >= 128) {
				1433	*p++ = '\\';
				1434	*p++ = hexdigit[(ch >> 6) & 7];
				1435	*p++ = hexdigit[(ch >> 3) & 7];
				1436	*p++ = hexdigit[ch & 7];
				1437	}
				1438	/* Copy everything else as-is */
				1439	else
				1440	*p++ = (char) ch;
				1441	}
				1442	if (quotes)
				1443	*p++ = q[1];
				1444
				1445	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1446	if (_PyString_Resize(&repr, p - q))
				1447	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1448
				1449	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1450
				1451	onError:
				1452	Py_DECREF(repr);
				1453	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1454	}
				1455
				1456	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1457	int size)
				1458	{
				1459	return unicodeescape_string(s, size, 0);
				1460	}
				1461
				1462	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1463	{
				1464	if (!PyUnicode_Check(unicode)) {
				1465	PyErr_BadArgument();
				1466	return NULL;
				1467	}
				1468	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1469	PyUnicode_GET_SIZE(unicode));
				1470	}
				1471
				1472	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1473
				1474	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1475	int size,
				1476	const char *errors)
				1477	{
				1478	PyUnicodeObject *v;
				1479	Py_UNICODE p, buf;
				1480	const char *end;
				1481	const char *bs;
				1482
				1483	/* Escaped strings will always be longer than the resulting
				1484	Unicode string, so we start with size here and then reduce the
				1485	length after conversion to the true value. */
				1486	v = _PyUnicode_New(size);
				1487	if (v == NULL)
				1488	goto onError;
				1489	if (size == 0)
				1490	return (PyObject *)v;
				1491	p = buf = PyUnicode_AS_UNICODE(v);
				1492	end = s + size;
				1493	while (s < end) {
				1494	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1495	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1496	int i;
				1497
				1498	/* Non-escape characters are interpreted as Unicode ordinals */
				1499	if (*s != '\\') {
				1500	p++ = (unsigned char)s++;
				1501	continue;
				1502	}
				1503
				1504	/* \u-escapes are only interpreted iff the number of leading
				1505	backslashes if odd */
				1506	bs = s;
				1507	for (;s < end;) {
				1508	if (*s != '\\')
				1509	break;
				1510	p++ = (unsigned char)s++;
				1511	}
				1512	if (((s - bs) & 1) == 0 \|\|
				1513	s >= end \|\|
				1514	*s != 'u') {
				1515	continue;
				1516	}
				1517	p--;
				1518	s++;
				1519
				1520	/* \uXXXX with 4 hex digits */
				1521	for (x = 0, i = 0; i < 4; i++) {
				1522	c = (unsigned char)s[i];
				1523	if (!isxdigit(c)) {
				1524	if (unicodeescape_decoding_error(&s, &x, errors,
				1525	"truncated \\uXXXX"))
				1526	goto onError;
				1527	i++;
				1528	break;
				1529	}
				1530	x = (x<<4) & ~0xF;
				1531	if (c >= '0' && c <= '9')
				1532	x += c - '0';
				1533	else if (c >= 'a' && c <= 'f')
				1534	x += 10 + c - 'a';
				1535	else
				1536	x += 10 + c - 'A';
				1537	}
				1538	s += i;
				1539	*p++ = x;
				1540	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1541	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1542	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1543	return (PyObject *)v;
				1544
				1545	onError:
				1546	Py_XDECREF(v);
				1547	return NULL;
				1548	}
				1549
				1550	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1551	int size)
				1552	{
				1553	PyObject *repr;
				1554	char *p;
				1555	char *q;
				1556
				1557	static const char *hexdigit = "0123456789ABCDEF";
				1558
				1559	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1560	if (repr == NULL)
				1561	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1562	if (size == 0)
				1563	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1564
				1565	p = q = PyString_AS_STRING(repr);
				1566	while (size-- > 0) {
				1567	Py_UNICODE ch = *s++;
				1568	/* Map 16-bit characters to '\uxxxx' */
				1569	if (ch >= 256) {
				1570	*p++ = '\\';
				1571	*p++ = 'u';
				1572	*p++ = hexdigit[(ch >> 12) & 0xf];
				1573	*p++ = hexdigit[(ch >> 8) & 0xf];
				1574	*p++ = hexdigit[(ch >> 4) & 0xf];
				1575	*p++ = hexdigit[ch & 15];
				1576	}
				1577	/* Copy everything else as-is */
				1578	else
				1579	*p++ = (char) ch;
				1580	}
				1581	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1582	if (_PyString_Resize(&repr, p - q))
				1583	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1584
				1585	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1586
				1587	onError:
				1588	Py_DECREF(repr);
				1589	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	}
				1591
				1592	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1593	{
				1594	if (!PyUnicode_Check(unicode)) {
				1595	PyErr_BadArgument();
				1596	return NULL;
				1597	}
				1598	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1599	PyUnicode_GET_SIZE(unicode));
				1600	}
				1601
				1602	/* --- Latin-1 Codec ------------------------------------------------------ */
				1603
				1604	PyObject PyUnicode_DecodeLatin1(const char s,
				1605	int size,
				1606	const char *errors)
				1607	{
				1608	PyUnicodeObject *v;
				1609	Py_UNICODE *p;
				1610
				1611	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1612	v = _PyUnicode_New(size);
				1613	if (v == NULL)
				1614	goto onError;
				1615	if (size == 0)
				1616	return (PyObject *)v;
				1617	p = PyUnicode_AS_UNICODE(v);
				1618	while (size-- > 0)
				1619	p++ = (unsigned char)s++;
				1620	return (PyObject *)v;
				1621
				1622	onError:
				1623	Py_XDECREF(v);
				1624	return NULL;
				1625	}
				1626
				1627	static
				1628	int latin1_encoding_error(const Py_UNICODE **source,
				1629	char **dest,
				1630	const char *errors,
				1631	const char *details)
				1632	{
				1633	if ((errors == NULL) \|\|
				1634	(strcmp(errors,"strict") == 0)) {
				1635	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1636	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1637	details);
				1638	return -1;
				1639	}
				1640	else if (strcmp(errors,"ignore") == 0) {
				1641	return 0;
				1642	}
				1643	else if (strcmp(errors,"replace") == 0) {
				1644	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1645	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1646	return 0;
				1647	}
				1648	else {
				1649	PyErr_Format(PyExc_ValueError,
				1650	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1651	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1652	errors);
				1653	return -1;
				1654	}
				1655	}
				1656
				1657	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1658	int size,
				1659	const char *errors)
				1660	{
				1661	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1662	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1663
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1664	repr = PyString_FromStringAndSize(NULL, size);
				1665	if (repr == NULL)
				1666	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1667	if (size == 0)
				1668	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1669
				1670	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1671	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1672	while (size-- > 0) {
				1673	Py_UNICODE ch = *p++;
				1674	if (ch >= 256) {
				1675	if (latin1_encoding_error(&p, &s, errors,
				1676	"ordinal not in range(256)"))
				1677	goto onError;
				1678	}
				1679	else
				1680	*s++ = (char)ch;
				1681	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1682	/* Resize if error handling skipped some characters */
				1683	if (s - start < PyString_GET_SIZE(repr))
				1684	if (_PyString_Resize(&repr, s - start))
				1685	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1686	return repr;
				1687
				1688	onError:
				1689	Py_DECREF(repr);
				1690	return NULL;
				1691	}
				1692
				1693	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1694	{
				1695	if (!PyUnicode_Check(unicode)) {
				1696	PyErr_BadArgument();
				1697	return NULL;
				1698	}
				1699	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1700	PyUnicode_GET_SIZE(unicode),
				1701	NULL);
				1702	}
				1703
				1704	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1705
				1706	static
				1707	int ascii_decoding_error(const char **source,
				1708	Py_UNICODE **dest,
				1709	const char *errors,
				1710	const char *details)
				1711	{
				1712	if ((errors == NULL) \|\|
				1713	(strcmp(errors,"strict") == 0)) {
				1714	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1715	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1716	details);
				1717	return -1;
				1718	}
				1719	else if (strcmp(errors,"ignore") == 0) {
				1720	return 0;
				1721	}
				1722	else if (strcmp(errors,"replace") == 0) {
				1723	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1724	(*dest)++;
				1725	return 0;
				1726	}
				1727	else {
				1728	PyErr_Format(PyExc_ValueError,
				1729	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1730	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	errors);
				1732	return -1;
				1733	}
				1734	}
				1735
				1736	PyObject PyUnicode_DecodeASCII(const char s,
				1737	int size,
				1738	const char *errors)
				1739	{
				1740	PyUnicodeObject *v;
				1741	Py_UNICODE *p;
				1742
				1743	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1744	v = _PyUnicode_New(size);
				1745	if (v == NULL)
				1746	goto onError;
				1747	if (size == 0)
				1748	return (PyObject *)v;
				1749	p = PyUnicode_AS_UNICODE(v);
				1750	while (size-- > 0) {
				1751	register unsigned char c;
				1752
				1753	c = (unsigned char)*s++;
				1754	if (c < 128)
				1755	*p++ = c;
				1756	else if (ascii_decoding_error(&s, &p, errors,
				1757	"ordinal not in range(128)"))
				1758	goto onError;
				1759	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1760	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1761	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1762	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	return (PyObject *)v;
				1764
				1765	onError:
				1766	Py_XDECREF(v);
				1767	return NULL;
				1768	}
				1769
				1770	static
				1771	int ascii_encoding_error(const Py_UNICODE **source,
				1772	char **dest,
				1773	const char *errors,
				1774	const char *details)
				1775	{
				1776	if ((errors == NULL) \|\|
				1777	(strcmp(errors,"strict") == 0)) {
				1778	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1779	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1780	details);
				1781	return -1;
				1782	}
				1783	else if (strcmp(errors,"ignore") == 0) {
				1784	return 0;
				1785	}
				1786	else if (strcmp(errors,"replace") == 0) {
				1787	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1788	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1789	return 0;
				1790	}
				1791	else {
				1792	PyErr_Format(PyExc_ValueError,
				1793	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1794	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	errors);
				1796	return -1;
				1797	}
				1798	}
				1799
				1800	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1801	int size,
				1802	const char *errors)
				1803	{
				1804	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1805	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1806
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1807	repr = PyString_FromStringAndSize(NULL, size);
				1808	if (repr == NULL)
				1809	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1810	if (size == 0)
				1811	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1812
				1813	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1814	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1815	while (size-- > 0) {
				1816	Py_UNICODE ch = *p++;
				1817	if (ch >= 128) {
				1818	if (ascii_encoding_error(&p, &s, errors,
				1819	"ordinal not in range(128)"))
				1820	goto onError;
				1821	}
				1822	else
				1823	*s++ = (char)ch;
				1824	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1825	/* Resize if error handling skipped some characters */
				1826	if (s - start < PyString_GET_SIZE(repr))
				1827	if (_PyString_Resize(&repr, s - start))
				1828	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1829	return repr;
				1830
				1831	onError:
				1832	Py_DECREF(repr);
				1833	return NULL;
				1834	}
				1835
				1836	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1837	{
				1838	if (!PyUnicode_Check(unicode)) {
				1839	PyErr_BadArgument();
				1840	return NULL;
				1841	}
				1842	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1843	PyUnicode_GET_SIZE(unicode),
				1844	NULL);
				1845	}
				1846
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1847	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1848
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1849	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1850
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1851	PyObject PyUnicode_DecodeMBCS(const char s,
				1852	int size,
				1853	const char *errors)
				1854	{
				1855	PyUnicodeObject *v;
				1856	Py_UNICODE *p;
				1857
				1858	/* First get the size of the result */
				1859	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1860	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1861	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1862
				1863	v = _PyUnicode_New(usize);
				1864	if (v == NULL)
				1865	return NULL;
				1866	if (usize == 0)
				1867	return (PyObject *)v;
				1868	p = PyUnicode_AS_UNICODE(v);
				1869	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1870	Py_DECREF(v);
				1871	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1872	}
				1873
				1874	return (PyObject *)v;
				1875	}
				1876
				1877	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1878	int size,
				1879	const char *errors)
				1880	{
				1881	PyObject *repr;
				1882	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1883	DWORD mbcssize;
				1884
				1885	/* If there are no characters, bail now! */
				1886	if (size==0)
				1887	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1888
				1889	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1890	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1891	if (mbcssize==0)
				1892	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1893
				1894	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1895	if (repr == NULL)
				1896	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1897	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1898	return repr;
				1899
				1900	/* Do the conversion */
				1901	s = PyString_AS_STRING(repr);
				1902	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1903	Py_DECREF(repr);
				1904	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1905	}
				1906	return repr;
				1907	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1908
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1909	#endif /* MS_WIN32 */
				1910
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1911	/* --- Character Mapping Codec -------------------------------------------- */
				1912
				1913	static
				1914	int charmap_decoding_error(const char **source,
				1915	Py_UNICODE **dest,
				1916	const char *errors,
				1917	const char *details)
				1918	{
				1919	if ((errors == NULL) \|\|
				1920	(strcmp(errors,"strict") == 0)) {
				1921	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1922	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1923	details);
				1924	return -1;
				1925	}
				1926	else if (strcmp(errors,"ignore") == 0) {
				1927	return 0;
				1928	}
				1929	else if (strcmp(errors,"replace") == 0) {
				1930	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1931	(*dest)++;
				1932	return 0;
				1933	}
				1934	else {
				1935	PyErr_Format(PyExc_ValueError,
				1936	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1937	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1938	errors);
				1939	return -1;
				1940	}
				1941	}
				1942
				1943	PyObject PyUnicode_DecodeCharmap(const char s,
				1944	int size,
				1945	PyObject *mapping,
				1946	const char *errors)
				1947	{
				1948	PyUnicodeObject *v;
				1949	Py_UNICODE *p;
				1950
				1951	/* Default to Latin-1 */
				1952	if (mapping == NULL)
				1953	return PyUnicode_DecodeLatin1(s, size, errors);
				1954
				1955	v = _PyUnicode_New(size);
				1956	if (v == NULL)
				1957	goto onError;
				1958	if (size == 0)
				1959	return (PyObject *)v;
				1960	p = PyUnicode_AS_UNICODE(v);
				1961	while (size-- > 0) {
				1962	unsigned char ch = *s++;
				1963	PyObject w, x;
				1964
				1965	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1966	w = PyInt_FromLong((long)ch);
				1967	if (w == NULL)
				1968	goto onError;
				1969	x = PyObject_GetItem(mapping, w);
				1970	Py_DECREF(w);
				1971	if (x == NULL) {
				1972	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1973	/* No mapping found: default to Latin-1 mapping */
				1974	PyErr_Clear();
				1975	*p++ = (Py_UNICODE)ch;
				1976	continue;
				1977	}
				1978	goto onError;
				1979	}
				1980
				1981	/* Apply mapping */
				1982	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1983	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1984	if (value < 0 \|\| value > 65535) {
				1985	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1986	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1987	Py_DECREF(x);
				1988	goto onError;
				1989	}
				1990	*p++ = (Py_UNICODE)value;
				1991	}
				1992	else if (x == Py_None) {
				1993	/* undefined mapping */
				1994	if (charmap_decoding_error(&s, &p, errors,
				1995	"character maps to <undefined>")) {
				1996	Py_DECREF(x);
				1997	goto onError;
				1998	}
				1999	}
				2000	else if (PyUnicode_Check(x)) {
				2001	if (PyUnicode_GET_SIZE(x) != 1) {
				2002	/* 1-n mapping */
				2003	PyErr_SetString(PyExc_NotImplementedError,
				2004	"1-n mappings are currently not implemented");
				2005	Py_DECREF(x);
				2006	goto onError;
				2007	}
				2008	p++ = PyUnicode_AS_UNICODE(x);
				2009	}
				2010	else {
				2011	/* wrong return value */
				2012	PyErr_SetString(PyExc_TypeError,
				2013	"character mapping must return integer, None or unicode");
				2014	Py_DECREF(x);
				2015	goto onError;
				2016	}
				2017	Py_DECREF(x);
				2018	}
				2019	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2020	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2021	goto onError;
				2022	return (PyObject *)v;
				2023
				2024	onError:
				2025	Py_XDECREF(v);
				2026	return NULL;
				2027	}
				2028
				2029	static
				2030	int charmap_encoding_error(const Py_UNICODE **source,
				2031	char **dest,
				2032	const char *errors,
				2033	const char *details)
				2034	{
				2035	if ((errors == NULL) \|\|
				2036	(strcmp(errors,"strict") == 0)) {
				2037	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2038	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2039	details);
				2040	return -1;
				2041	}
				2042	else if (strcmp(errors,"ignore") == 0) {
				2043	return 0;
				2044	}
				2045	else if (strcmp(errors,"replace") == 0) {
				2046	**dest = '?';
				2047	(*dest)++;
				2048	return 0;
				2049	}
				2050	else {
				2051	PyErr_Format(PyExc_ValueError,
				2052	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2053	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2054	errors);
				2055	return -1;
				2056	}
				2057	}
				2058
				2059	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2060	int size,
				2061	PyObject *mapping,
				2062	const char *errors)
				2063	{
				2064	PyObject *v;
				2065	char *s;
				2066
				2067	/* Default to Latin-1 */
				2068	if (mapping == NULL)
				2069	return PyUnicode_EncodeLatin1(p, size, errors);
				2070
				2071	v = PyString_FromStringAndSize(NULL, size);
				2072	if (v == NULL)
				2073	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2074	if (size == 0)
				2075	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2076	s = PyString_AS_STRING(v);
				2077	while (size-- > 0) {
				2078	Py_UNICODE ch = *p++;
				2079	PyObject w, x;
				2080
				2081	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2082	w = PyInt_FromLong((long)ch);
				2083	if (w == NULL)
				2084	goto onError;
				2085	x = PyObject_GetItem(mapping, w);
				2086	Py_DECREF(w);
				2087	if (x == NULL) {
				2088	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2089	/* No mapping found: default to Latin-1 mapping if possible */
				2090	PyErr_Clear();
				2091	if (ch < 256) {
				2092	*s++ = (char)ch;
				2093	continue;
				2094	}
				2095	else if (!charmap_encoding_error(&p, &s, errors,
				2096	"missing character mapping"))
				2097	continue;
				2098	}
				2099	goto onError;
				2100	}
				2101
				2102	/* Apply mapping */
				2103	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2104	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2105	if (value < 0 \|\| value > 255) {
				2106	PyErr_SetString(PyExc_TypeError,
				2107	"character mapping must be in range(256)");
				2108	Py_DECREF(x);
				2109	goto onError;
				2110	}
				2111	*s++ = (char)value;
				2112	}
				2113	else if (x == Py_None) {
				2114	/* undefined mapping */
				2115	if (charmap_encoding_error(&p, &s, errors,
				2116	"character maps to <undefined>")) {
				2117	Py_DECREF(x);
				2118	goto onError;
				2119	}
				2120	}
				2121	else if (PyString_Check(x)) {
				2122	if (PyString_GET_SIZE(x) != 1) {
				2123	/* 1-n mapping */
				2124	PyErr_SetString(PyExc_NotImplementedError,
				2125	"1-n mappings are currently not implemented");
				2126	Py_DECREF(x);
				2127	goto onError;
				2128	}
				2129	s++ = PyString_AS_STRING(x);
				2130	}
				2131	else {
				2132	/* wrong return value */
				2133	PyErr_SetString(PyExc_TypeError,
				2134	"character mapping must return integer, None or unicode");
				2135	Py_DECREF(x);
				2136	goto onError;
				2137	}
				2138	Py_DECREF(x);
				2139	}
				2140	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2141	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2142	goto onError;
				2143	return v;
				2144
				2145	onError:
				2146	Py_DECREF(v);
				2147	return NULL;
				2148	}
				2149
				2150	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2151	PyObject *mapping)
				2152	{
				2153	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2154	PyErr_BadArgument();
				2155	return NULL;
				2156	}
				2157	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2158	PyUnicode_GET_SIZE(unicode),
				2159	mapping,
				2160	NULL);
				2161	}
				2162
				2163	static
				2164	int translate_error(const Py_UNICODE **source,
				2165	Py_UNICODE **dest,
				2166	const char *errors,
				2167	const char *details)
				2168	{
				2169	if ((errors == NULL) \|\|
				2170	(strcmp(errors,"strict") == 0)) {
				2171	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2172	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2173	details);
				2174	return -1;
				2175	}
				2176	else if (strcmp(errors,"ignore") == 0) {
				2177	return 0;
				2178	}
				2179	else if (strcmp(errors,"replace") == 0) {
				2180	**dest = '?';
				2181	(*dest)++;
				2182	return 0;
				2183	}
				2184	else {
				2185	PyErr_Format(PyExc_ValueError,
				2186	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2187	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2188	errors);
				2189	return -1;
				2190	}
				2191	}
				2192
				2193	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2194	int size,
				2195	PyObject *mapping,
				2196	const char *errors)
				2197	{
				2198	PyUnicodeObject *v;
				2199	Py_UNICODE *p;
				2200
				2201	if (mapping == NULL) {
				2202	PyErr_BadArgument();
				2203	return NULL;
				2204	}
				2205
				2206	/* Output will never be longer than input */
				2207	v = _PyUnicode_New(size);
				2208	if (v == NULL)
				2209	goto onError;
				2210	if (size == 0)
				2211	goto done;
				2212	p = PyUnicode_AS_UNICODE(v);
				2213	while (size-- > 0) {
				2214	Py_UNICODE ch = *s++;
				2215	PyObject w, x;
				2216
				2217	/* Get mapping */
				2218	w = PyInt_FromLong(ch);
				2219	if (w == NULL)
				2220	goto onError;
				2221	x = PyObject_GetItem(mapping, w);
				2222	Py_DECREF(w);
				2223	if (x == NULL) {
				2224	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2225	/* No mapping found: default to 1-1 mapping */
				2226	PyErr_Clear();
				2227	*p++ = ch;
				2228	continue;
				2229	}
				2230	goto onError;
				2231	}
				2232
				2233	/* Apply mapping */
				2234	if (PyInt_Check(x))
				2235	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2236	else if (x == Py_None) {
				2237	/* undefined mapping */
				2238	if (translate_error(&s, &p, errors,
				2239	"character maps to <undefined>")) {
				2240	Py_DECREF(x);
				2241	goto onError;
				2242	}
				2243	}
				2244	else if (PyUnicode_Check(x)) {
				2245	if (PyUnicode_GET_SIZE(x) != 1) {
				2246	/* 1-n mapping */
				2247	PyErr_SetString(PyExc_NotImplementedError,
				2248	"1-n mappings are currently not implemented");
				2249	Py_DECREF(x);
				2250	goto onError;
				2251	}
				2252	p++ = PyUnicode_AS_UNICODE(x);
				2253	}
				2254	else {
				2255	/* wrong return value */
				2256	PyErr_SetString(PyExc_TypeError,
				2257	"translate mapping must return integer, None or unicode");
				2258	Py_DECREF(x);
				2259	goto onError;
				2260	}
				2261	Py_DECREF(x);
				2262	}
				2263	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2264	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2265	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2266
				2267	done:
				2268	return (PyObject *)v;
				2269
				2270	onError:
				2271	Py_XDECREF(v);
				2272	return NULL;
				2273	}
				2274
				2275	PyObject PyUnicode_Translate(PyObject str,
				2276	PyObject *mapping,
				2277	const char *errors)
				2278	{
				2279	PyObject *result;
				2280
				2281	str = PyUnicode_FromObject(str);
				2282	if (str == NULL)
				2283	goto onError;
				2284	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2285	PyUnicode_GET_SIZE(str),
				2286	mapping,
				2287	errors);
				2288	Py_DECREF(str);
				2289	return result;
				2290
				2291	onError:
				2292	Py_XDECREF(str);
				2293	return NULL;
				2294	}
				2295
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2296	/* --- Decimal Encoder ---------------------------------------------------- */
				2297
				2298	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2299	int length,
				2300	char *output,
				2301	const char *errors)
				2302	{
				2303	Py_UNICODE p, end;
				2304
				2305	if (output == NULL) {
				2306	PyErr_BadArgument();
				2307	return -1;
				2308	}
				2309
				2310	p = s;
				2311	end = s + length;
				2312	while (p < end) {
				2313	register Py_UNICODE ch = *p++;
				2314	int decimal;
				2315
				2316	if (Py_UNICODE_ISSPACE(ch)) {
				2317	*output++ = ' ';
				2318	continue;
				2319	}
				2320	decimal = Py_UNICODE_TODECIMAL(ch);
				2321	if (decimal >= 0) {
				2322	*output++ = '0' + decimal;
				2323	continue;
				2324	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2325	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2326	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2327	continue;
				2328	}
				2329	/* All other characters are considered invalid */
				2330	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2331	PyErr_SetString(PyExc_ValueError,
				2332	"invalid decimal Unicode string");
				2333	goto onError;
				2334	}
				2335	else if (strcmp(errors, "ignore") == 0)
				2336	continue;
				2337	else if (strcmp(errors, "replace") == 0) {
				2338	*output++ = '?';
				2339	continue;
				2340	}
				2341	}
				2342	/* 0-terminate the output string */
				2343	*output++ = '\0';
				2344	return 0;
				2345
				2346	onError:
				2347	return -1;
				2348	}
				2349
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2350	/* --- Helpers ------------------------------------------------------------ */
				2351
				2352	static
				2353	int count(PyUnicodeObject *self,
				2354	int start,
				2355	int end,
				2356	PyUnicodeObject *substring)
				2357	{
				2358	int count = 0;
				2359
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2360	if (substring->length == 0)
				2361	return (end - start + 1);
				2362
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2363	end -= substring->length;
				2364
				2365	while (start <= end)
				2366	if (Py_UNICODE_MATCH(self, start, substring)) {
				2367	count++;
				2368	start += substring->length;
				2369	} else
				2370	start++;
				2371
				2372	return count;
				2373	}
				2374
				2375	int PyUnicode_Count(PyObject *str,
				2376	PyObject *substr,
				2377	int start,
				2378	int end)
				2379	{
				2380	int result;
				2381
				2382	str = PyUnicode_FromObject(str);
				2383	if (str == NULL)
				2384	return -1;
				2385	substr = PyUnicode_FromObject(substr);
				2386	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2387	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2388	return -1;
				2389	}
				2390
				2391	result = count((PyUnicodeObject *)str,
				2392	start, end,
				2393	(PyUnicodeObject *)substr);
				2394
				2395	Py_DECREF(str);
				2396	Py_DECREF(substr);
				2397	return result;
				2398	}
				2399
				2400	static
				2401	int findstring(PyUnicodeObject *self,
				2402	PyUnicodeObject *substring,
				2403	int start,
				2404	int end,
				2405	int direction)
				2406	{
				2407	if (start < 0)
				2408	start += self->length;
				2409	if (start < 0)
				2410	start = 0;
				2411
				2412	if (substring->length == 0)
				2413	return start;
				2414
				2415	if (end > self->length)
				2416	end = self->length;
				2417	if (end < 0)
				2418	end += self->length;
				2419	if (end < 0)
				2420	end = 0;
				2421
				2422	end -= substring->length;
				2423
				2424	if (direction < 0) {
				2425	for (; end >= start; end--)
				2426	if (Py_UNICODE_MATCH(self, end, substring))
				2427	return end;
				2428	} else {
				2429	for (; start <= end; start++)
				2430	if (Py_UNICODE_MATCH(self, start, substring))
				2431	return start;
				2432	}
				2433
				2434	return -1;
				2435	}
				2436
				2437	int PyUnicode_Find(PyObject *str,
				2438	PyObject *substr,
				2439	int start,
				2440	int end,
				2441	int direction)
				2442	{
				2443	int result;
				2444
				2445	str = PyUnicode_FromObject(str);
				2446	if (str == NULL)
				2447	return -1;
				2448	substr = PyUnicode_FromObject(substr);
				2449	if (substr == NULL) {
				2450	Py_DECREF(substr);
				2451	return -1;
				2452	}
				2453
				2454	result = findstring((PyUnicodeObject *)str,
				2455	(PyUnicodeObject *)substr,
				2456	start, end, direction);
				2457	Py_DECREF(str);
				2458	Py_DECREF(substr);
				2459	return result;
				2460	}
				2461
				2462	static
				2463	int tailmatch(PyUnicodeObject *self,
				2464	PyUnicodeObject *substring,
				2465	int start,
				2466	int end,
				2467	int direction)
				2468	{
				2469	if (start < 0)
				2470	start += self->length;
				2471	if (start < 0)
				2472	start = 0;
				2473
				2474	if (substring->length == 0)
				2475	return 1;
				2476
				2477	if (end > self->length)
				2478	end = self->length;
				2479	if (end < 0)
				2480	end += self->length;
				2481	if (end < 0)
				2482	end = 0;
				2483
				2484	end -= substring->length;
				2485	if (end < start)
				2486	return 0;
				2487
				2488	if (direction > 0) {
				2489	if (Py_UNICODE_MATCH(self, end, substring))
				2490	return 1;
				2491	} else {
				2492	if (Py_UNICODE_MATCH(self, start, substring))
				2493	return 1;
				2494	}
				2495
				2496	return 0;
				2497	}
				2498
				2499	int PyUnicode_Tailmatch(PyObject *str,
				2500	PyObject *substr,
				2501	int start,
				2502	int end,
				2503	int direction)
				2504	{
				2505	int result;
				2506
				2507	str = PyUnicode_FromObject(str);
				2508	if (str == NULL)
				2509	return -1;
				2510	substr = PyUnicode_FromObject(substr);
				2511	if (substr == NULL) {
				2512	Py_DECREF(substr);
				2513	return -1;
				2514	}
				2515
				2516	result = tailmatch((PyUnicodeObject *)str,
				2517	(PyUnicodeObject *)substr,
				2518	start, end, direction);
				2519	Py_DECREF(str);
				2520	Py_DECREF(substr);
				2521	return result;
				2522	}
				2523
				2524	static
				2525	const Py_UNICODE findchar(const Py_UNICODE s,
				2526	int size,
				2527	Py_UNICODE ch)
				2528	{
				2529	/* like wcschr, but doesn't stop at NULL characters */
				2530
				2531	while (size-- > 0) {
				2532	if (*s == ch)
				2533	return s;
				2534	s++;
				2535	}
				2536
				2537	return NULL;
				2538	}
				2539
				2540	/* Apply fixfct filter to the Unicode object self and return a
				2541	reference to the modified object */
				2542
				2543	static
				2544	PyObject fixup(PyUnicodeObject self,
				2545	int (fixfct)(PyUnicodeObject s))
				2546	{
				2547
				2548	PyUnicodeObject *u;
				2549
				2550	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2551	self->length);
				2552	if (u == NULL)
				2553	return NULL;
				2554	if (!fixfct(u)) {
				2555	/* fixfct should return TRUE if it modified the buffer. If
				2556	FALSE, return a reference to the original buffer instead
				2557	(to save space, not time) */
				2558	Py_INCREF(self);
				2559	Py_DECREF(u);
				2560	return (PyObject*) self;
				2561	}
				2562	return (PyObject*) u;
				2563	}
				2564
				2565	static
				2566	int fixupper(PyUnicodeObject *self)
				2567	{
				2568	int len = self->length;
				2569	Py_UNICODE *s = self->str;
				2570	int status = 0;
				2571
				2572	while (len-- > 0) {
				2573	register Py_UNICODE ch;
				2574
				2575	ch = Py_UNICODE_TOUPPER(*s);
				2576	if (ch != *s) {
				2577	status = 1;
				2578	*s = ch;
				2579	}
				2580	s++;
				2581	}
				2582
				2583	return status;
				2584	}
				2585
				2586	static
				2587	int fixlower(PyUnicodeObject *self)
				2588	{
				2589	int len = self->length;
				2590	Py_UNICODE *s = self->str;
				2591	int status = 0;
				2592
				2593	while (len-- > 0) {
				2594	register Py_UNICODE ch;
				2595
				2596	ch = Py_UNICODE_TOLOWER(*s);
				2597	if (ch != *s) {
				2598	status = 1;
				2599	*s = ch;
				2600	}
				2601	s++;
				2602	}
				2603
				2604	return status;
				2605	}
				2606
				2607	static
				2608	int fixswapcase(PyUnicodeObject *self)
				2609	{
				2610	int len = self->length;
				2611	Py_UNICODE *s = self->str;
				2612	int status = 0;
				2613
				2614	while (len-- > 0) {
				2615	if (Py_UNICODE_ISUPPER(*s)) {
				2616	s = Py_UNICODE_TOLOWER(s);
				2617	status = 1;
				2618	} else if (Py_UNICODE_ISLOWER(*s)) {
				2619	s = Py_UNICODE_TOUPPER(s);
				2620	status = 1;
				2621	}
				2622	s++;
				2623	}
				2624
				2625	return status;
				2626	}
				2627
				2628	static
				2629	int fixcapitalize(PyUnicodeObject *self)
				2630	{
				2631	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2632	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2633	return 1;
				2634	}
				2635	return 0;
				2636	}
				2637
				2638	static
				2639	int fixtitle(PyUnicodeObject *self)
				2640	{
				2641	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2642	register Py_UNICODE *e;
				2643	int previous_is_cased;
				2644
				2645	/* Shortcut for single character strings */
				2646	if (PyUnicode_GET_SIZE(self) == 1) {
				2647	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2648	if (*p != ch) {
				2649	*p = ch;
				2650	return 1;
				2651	}
				2652	else
				2653	return 0;
				2654	}
				2655
				2656	e = p + PyUnicode_GET_SIZE(self);
				2657	previous_is_cased = 0;
				2658	for (; p < e; p++) {
				2659	register const Py_UNICODE ch = *p;
				2660
				2661	if (previous_is_cased)
				2662	*p = Py_UNICODE_TOLOWER(ch);
				2663	else
				2664	*p = Py_UNICODE_TOTITLE(ch);
				2665
				2666	if (Py_UNICODE_ISLOWER(ch) \|\|
				2667	Py_UNICODE_ISUPPER(ch) \|\|
				2668	Py_UNICODE_ISTITLE(ch))
				2669	previous_is_cased = 1;
				2670	else
				2671	previous_is_cased = 0;
				2672	}
				2673	return 1;
				2674	}
				2675
				2676	PyObject PyUnicode_Join(PyObject separator,
				2677	PyObject *seq)
				2678	{
				2679	Py_UNICODE *sep;
				2680	int seplen;
				2681	PyUnicodeObject *res = NULL;
				2682	int reslen = 0;
				2683	Py_UNICODE *p;
				2684	int seqlen = 0;
				2685	int sz = 100;
				2686	int i;
				2687
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2688	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2689	if (seqlen < 0 && PyErr_Occurred())
				2690	return NULL;
				2691
				2692	if (separator == NULL) {
				2693	Py_UNICODE blank = ' ';
				2694	sep = &blank;
				2695	seplen = 1;
				2696	}
				2697	else {
				2698	separator = PyUnicode_FromObject(separator);
				2699	if (separator == NULL)
				2700	return NULL;
				2701	sep = PyUnicode_AS_UNICODE(separator);
				2702	seplen = PyUnicode_GET_SIZE(separator);
				2703	}
				2704
				2705	res = _PyUnicode_New(sz);
				2706	if (res == NULL)
				2707	goto onError;
				2708	p = PyUnicode_AS_UNICODE(res);
				2709	reslen = 0;
				2710
				2711	for (i = 0; i < seqlen; i++) {
				2712	int itemlen;
				2713	PyObject *item;
				2714
				2715	item = PySequence_GetItem(seq, i);
				2716	if (item == NULL)
				2717	goto onError;
				2718	if (!PyUnicode_Check(item)) {
				2719	PyObject *v;
				2720	v = PyUnicode_FromObject(item);
				2721	Py_DECREF(item);
				2722	item = v;
				2723	if (item == NULL)
				2724	goto onError;
				2725	}
				2726	itemlen = PyUnicode_GET_SIZE(item);
				2727	while (reslen + itemlen + seplen >= sz) {
				2728	if (_PyUnicode_Resize(res, sz*2))
				2729	goto onError;
				2730	sz *= 2;
				2731	p = PyUnicode_AS_UNICODE(res) + reslen;
				2732	}
				2733	if (i > 0) {
				2734	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2735	p += seplen;
				2736	reslen += seplen;
				2737	}
				2738	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2739	p += itemlen;
				2740	reslen += itemlen;
				2741	Py_DECREF(item);
				2742	}
				2743	if (_PyUnicode_Resize(res, reslen))
				2744	goto onError;
				2745
				2746	Py_XDECREF(separator);
				2747	return (PyObject *)res;
				2748
				2749	onError:
				2750	Py_XDECREF(separator);
				2751	Py_DECREF(res);
				2752	return NULL;
				2753	}
				2754
				2755	static
				2756	PyUnicodeObject pad(PyUnicodeObject self,
				2757	int left,
				2758	int right,
				2759	Py_UNICODE fill)
				2760	{
				2761	PyUnicodeObject *u;
				2762
				2763	if (left < 0)
				2764	left = 0;
				2765	if (right < 0)
				2766	right = 0;
				2767
				2768	if (left == 0 && right == 0) {
				2769	Py_INCREF(self);
				2770	return self;
				2771	}
				2772
				2773	u = _PyUnicode_New(left + self->length + right);
				2774	if (u) {
				2775	if (left)
				2776	Py_UNICODE_FILL(u->str, fill, left);
				2777	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2778	if (right)
				2779	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2780	}
				2781
				2782	return u;
				2783	}
				2784
				/* Append the slice data[left:right] to `list` as a new Unicode object.

				   Expects the enclosing function to provide a `PyObject *str`
				   scratch variable, a `list` variable and an `onError` label, which
				   is jumped to if creating the slice or appending it fails.

				   Arguments are parenthesized and the body is wrapped in
				   do { } while (0) so the macro behaves like a single statement. */
				#define SPLIT_APPEND(data, left, right)                                 \
				    do {                                                                \
				        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
				        if (!str)                                                       \
				            goto onError;                                               \
				        if (PyList_Append(list, str)) {                                 \
				            Py_DECREF(str);                                             \
				            goto onError;                                               \
				        }                                                               \
				        Py_DECREF(str);                                                 \
				    } while (0)
				2795
				2796	static
				2797	PyObject split_whitespace(PyUnicodeObject self,
				2798	PyObject *list,
				2799	int maxcount)
				2800	{
				2801	register int i;
				2802	register int j;
				2803	int len = self->length;
				2804	PyObject *str;
				2805
				2806	for (i = j = 0; i < len; ) {
				2807	/* find a token */
				2808	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2809	i++;
				2810	j = i;
				2811	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2812	i++;
				2813	if (j < i) {
				2814	if (maxcount-- <= 0)
				2815	break;
				2816	SPLIT_APPEND(self->str, j, i);
				2817	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2818	i++;
				2819	j = i;
				2820	}
				2821	}
				2822	if (j < len) {
				2823	SPLIT_APPEND(self->str, j, len);
				2824	}
				2825	return list;
				2826
				2827	onError:
				2828	Py_DECREF(list);
				2829	return NULL;
				2830	}
				2831
				2832	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2833	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2834	{
				2835	register int i;
				2836	register int j;
				2837	int len;
				2838	PyObject *list;
				2839	PyObject *str;
				2840	Py_UNICODE *data;
				2841
				2842	string = PyUnicode_FromObject(string);
				2843	if (string == NULL)
				2844	return NULL;
				2845	data = PyUnicode_AS_UNICODE(string);
				2846	len = PyUnicode_GET_SIZE(string);
				2847
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2848	list = PyList_New(0);
				2849	if (!list)
				2850	goto onError;
				2851
				2852	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2853	int eol;
				2854
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2855	/* Find a line and append it */
				2856	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2857	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2858
				2859	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2860	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2861	if (i < len) {
				2862	if (data[i] == '\r' && i + 1 < len &&
				2863	data[i+1] == '\n')
				2864	i += 2;
				2865	else
				2866	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2867	if (keepends)
				2868	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2869	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2870	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2871	j = i;
				2872	}
				2873	if (j < len) {
				2874	SPLIT_APPEND(data, j, len);
				2875	}
				2876
				2877	Py_DECREF(string);
				2878	return list;
				2879
				2880	onError:
				2881	Py_DECREF(list);
				2882	Py_DECREF(string);
				2883	return NULL;
				2884	}
				2885
				2886	static
				2887	PyObject split_char(PyUnicodeObject self,
				2888	PyObject *list,
				2889	Py_UNICODE ch,
				2890	int maxcount)
				2891	{
				2892	register int i;
				2893	register int j;
				2894	int len = self->length;
				2895	PyObject *str;
				2896
				2897	for (i = j = 0; i < len; ) {
				2898	if (self->str[i] == ch) {
				2899	if (maxcount-- <= 0)
				2900	break;
				2901	SPLIT_APPEND(self->str, j, i);
				2902	i = j = i + 1;
				2903	} else
				2904	i++;
				2905	}
				2906	if (j <= len) {
				2907	SPLIT_APPEND(self->str, j, len);
				2908	}
				2909	return list;
				2910
				2911	onError:
				2912	Py_DECREF(list);
				2913	return NULL;
				2914	}
				2915
				2916	static
				2917	PyObject split_substring(PyUnicodeObject self,
				2918	PyObject *list,
				2919	PyUnicodeObject *substring,
				2920	int maxcount)
				2921	{
				2922	register int i;
				2923	register int j;
				2924	int len = self->length;
				2925	int sublen = substring->length;
				2926	PyObject *str;
				2927
				2928	for (i = j = 0; i < len - sublen; ) {
				2929	if (Py_UNICODE_MATCH(self, i, substring)) {
				2930	if (maxcount-- <= 0)
				2931	break;
				2932	SPLIT_APPEND(self->str, j, i);
				2933	i = j = i + sublen;
				2934	} else
				2935	i++;
				2936	}
				2937	if (j <= len) {
				2938	SPLIT_APPEND(self->str, j, len);
				2939	}
				2940	return list;
				2941
				2942	onError:
				2943	Py_DECREF(list);
				2944	return NULL;
				2945	}
				2946
				2947	#undef SPLIT_APPEND
				2948
				2949	static
				2950	PyObject split(PyUnicodeObject self,
				2951	PyUnicodeObject *substring,
				2952	int maxcount)
				2953	{
				2954	PyObject *list;
				2955
				2956	if (maxcount < 0)
				2957	maxcount = INT_MAX;
				2958
				2959	list = PyList_New(0);
				2960	if (!list)
				2961	return NULL;
				2962
				2963	if (substring == NULL)
				2964	return split_whitespace(self,list,maxcount);
				2965
				2966	else if (substring->length == 1)
				2967	return split_char(self,list,substring->str[0],maxcount);
				2968
				2969	else if (substring->length == 0) {
				2970	Py_DECREF(list);
				2971	PyErr_SetString(PyExc_ValueError, "empty separator");
				2972	return NULL;
				2973	}
				2974	else
				2975	return split_substring(self,list,substring,maxcount);
				2976	}
				2977
				2978	static
				2979	PyObject strip(PyUnicodeObject self,
				2980	int left,
				2981	int right)
				2982	{
				2983	Py_UNICODE *p = self->str;
				2984	int start = 0;
				2985	int end = self->length;
				2986
				2987	if (left)
				2988	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2989	start++;
				2990
				2991	if (right)
				2992	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2993	end--;
				2994
				2995	if (start == 0 && end == self->length) {
				2996	/* couldn't strip anything off, return original string */
				2997	Py_INCREF(self);
				2998	return (PyObject*) self;
				2999	}
				3000
				3001	return (PyObject*) PyUnicode_FromUnicode(
				3002	self->str + start,
				3003	end - start
				3004	);
				3005	}
				3006
				3007	static
				3008	PyObject replace(PyUnicodeObject self,
				3009	PyUnicodeObject *str1,
				3010	PyUnicodeObject *str2,
				3011	int maxcount)
				3012	{
				3013	PyUnicodeObject *u;
				3014
				3015	if (maxcount < 0)
				3016	maxcount = INT_MAX;
				3017
				3018	if (str1->length == 1 && str2->length == 1) {
				3019	int i;
				3020
				3021	/* replace characters */
				3022	if (!findchar(self->str, self->length, str1->str[0])) {
				3023	/* nothing to replace, return original string */
				3024	Py_INCREF(self);
				3025	u = self;
				3026	} else {
				3027	Py_UNICODE u1 = str1->str[0];
				3028	Py_UNICODE u2 = str2->str[0];
				3029
				3030	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3031	self->str,
				3032	self->length
				3033	);
				3034	if (u)
				3035	for (i = 0; i < u->length; i++)
				3036	if (u->str[i] == u1) {
				3037	if (--maxcount < 0)
				3038	break;
				3039	u->str[i] = u2;
				3040	}
				3041	}
				3042
				3043	} else {
				3044	int n, i;
				3045	Py_UNICODE *p;
				3046
				3047	/* replace strings */
				3048	n = count(self, 0, self->length, str1);
				3049	if (n > maxcount)
				3050	n = maxcount;
				3051	if (n == 0) {
				3052	/* nothing to replace, return original string */
				3053	Py_INCREF(self);
				3054	u = self;
				3055	} else {
				3056	u = _PyUnicode_New(
				3057	self->length + n * (str2->length - str1->length));
				3058	if (u) {
				3059	i = 0;
				3060	p = u->str;
				3061	while (i <= self->length - str1->length)
				3062	if (Py_UNICODE_MATCH(self, i, str1)) {
				3063	/* replace string segment */
				3064	Py_UNICODE_COPY(p, str2->str, str2->length);
				3065	p += str2->length;
				3066	i += str1->length;
				3067	if (--n <= 0) {
				3068	/* copy remaining part */
				3069	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3070	break;
				3071	}
				3072	} else
				3073	*p++ = self->str[i++];
				3074	}
				3075	}
				3076	}
				3077
				3078	return (PyObject *) u;
				3079	}
				3080
				3081	/* --- Unicode Object Methods --------------------------------------------- */
				3082
				3083	static char title__doc__[] =
				3084	"S.title() -> unicode\n\
				3085	\n\
				3086	Return a titlecased version of S, i.e. words start with title case\n\
				3087	characters, all remaining cased characters have lower case.";
				3088
				3089	static PyObject*
				3090	unicode_title(PyUnicodeObject self, PyObject args)
				3091	{
				3092	if (!PyArg_NoArgs(args))
				3093	return NULL;
				3094	return fixup(self, fixtitle);
				3095	}
				3096
				3097	static char capitalize__doc__[] =
				3098	"S.capitalize() -> unicode\n\
				3099	\n\
				3100	Return a capitalized version of S, i.e. make the first character\n\
				3101	have upper case.";
				3102
				3103	static PyObject*
				3104	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3105	{
				3106	if (!PyArg_NoArgs(args))
				3107	return NULL;
				3108	return fixup(self, fixcapitalize);
				3109	}
				3110
				#if 0
				static char capwords__doc__[] =
				    "S.capwords() -> unicode\n"
				    "\n"
				    "Apply .capitalize() to all words in S and return the result with\n"
				    "normalized whitespace (all whitespace strings are replaced by ' ').";

				/* S.capwords() (currently compiled out): split self at whitespace,
				   capitalize each word, and join the words with single spaces.
				   Returns NULL on failure. */
				static PyObject*
				unicode_capwords(PyUnicodeObject *self, PyObject *args)
				{
				    PyObject *list;
				    PyObject *item;
				    int i;

				    if (!PyArg_NoArgs(args))
				        return NULL;

				    /* Split into words */
				    list = split(self, NULL, -1);
				    if (!list)
				        return NULL;

				    /* Capitalize each word */
				    for (i = 0; i < PyList_GET_SIZE(list); i++) {
				        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				                     fixcapitalize);
				        if (item == NULL)
				            goto onError;
				        /* Drop the old word before the slot is overwritten. */
				        Py_DECREF(PyList_GET_ITEM(list, i));
				        PyList_SET_ITEM(list, i, item);
				    }

				    /* Join the words to form a new string */
				    item = PyUnicode_Join(NULL, list);

				 onError:
				    /* Reached on success as well; item is NULL on the failure path. */
				    Py_DECREF(list);
				    return (PyObject *)item;
				}
				#endif
				3151
				3152	static char center__doc__[] =
				3153	"S.center(width) -> unicode\n\
				3154	\n\
				3155	Return S centered in a Unicode string of length width. Padding is done\n\
				3156	using spaces.";
				3157
				3158	static PyObject *
				3159	unicode_center(PyUnicodeObject self, PyObject args)
				3160	{
				3161	int marg, left;
				3162	int width;
				3163
				3164	if (!PyArg_ParseTuple(args, "i:center", &width))
				3165	return NULL;
				3166
				3167	if (self->length >= width) {
				3168	Py_INCREF(self);
				3169	return (PyObject*) self;
				3170	}
				3171
				3172	marg = width - self->length;
				3173	left = marg / 2 + (marg & width & 1);
				3174
				3175	return (PyObject*) pad(self, left, marg - left, ' ');
				3176	}
				3177
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3178	#if 0
				3179
				3180	/* This code should go into some future Unicode collation support
				3181	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3182	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3183
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3184	/* speedy UTF-16 code point order comparison */
				3185	/* gleaned from: */
				3186	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3187
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3188	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3189	{
				3190	0, 0, 0, 0, 0, 0, 0, 0,
				3191	0, 0, 0, 0, 0, 0, 0, 0,
				3192	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3193	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3194	};
				3195
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3196	static int
				3197	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3198	{
				3199	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3200
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3201	Py_UNICODE *s1 = str1->str;
				3202	Py_UNICODE *s2 = str2->str;
				3203
				3204	len1 = str1->length;
				3205	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3206
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3207	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3208	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3209	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3210
				3211	c1 = *s1++;
				3212	c2 = *s2++;
				3213	if (c1 > (1<<11) * 26)
				3214	c1 += utf16Fixup[c1>>11];
				3215	if (c2 > (1<<11) * 26)
				3216	c2 += utf16Fixup[c2>>11];
				3217
				3218	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3219	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3220	if (diff)
				3221	return (diff < 0) ? -1 : (diff != 0);
				3222	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3223	}
				3224
				3225	return (len1 < len2) ? -1 : (len1 != len2);
				3226	}
				3227
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3228	#else
				3229
				3230	static int
				3231	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3232	{
				3233	register int len1, len2;
				3234
				3235	Py_UNICODE *s1 = str1->str;
				3236	Py_UNICODE *s2 = str2->str;
				3237
				3238	len1 = str1->length;
				3239	len2 = str2->length;
				3240
				3241	while (len1 > 0 && len2 > 0) {
				3242	register long diff;
				3243
				3244	diff = (long)s1++ - (long)s2++;
				3245	if (diff)
				3246	return (diff < 0) ? -1 : (diff != 0);
				3247	len1--; len2--;
				3248	}
				3249
				3250	return (len1 < len2) ? -1 : (len1 != len2);
				3251	}
				3252
				3253	#endif
				3254
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3255	int PyUnicode_Compare(PyObject *left,
				3256	PyObject *right)
				3257	{
				3258	PyUnicodeObject u = NULL, v = NULL;
				3259	int result;
				3260
				3261	/* Coerce the two arguments */
				3262	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3263	if (u == NULL)
				3264	goto onError;
				3265	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3266	if (v == NULL)
				3267	goto onError;
				3268
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3269	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3270	if (v == u) {
				3271	Py_DECREF(u);
				3272	Py_DECREF(v);
				3273	return 0;
				3274	}
				3275
				3276	result = unicode_compare(u, v);
				3277
				3278	Py_DECREF(u);
				3279	Py_DECREF(v);
				3280	return result;
				3281
				3282	onError:
				3283	Py_XDECREF(u);
				3284	Py_XDECREF(v);
				3285	return -1;
				3286	}
				3287
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3288	int PyUnicode_Contains(PyObject *container,
				3289	PyObject *element)
				3290	{
				3291	PyUnicodeObject u = NULL, v = NULL;
				3292	int result;
				3293	register const Py_UNICODE p, e;
				3294	register Py_UNICODE ch;
				3295
				3296	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3297	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3298	if (v == NULL) {
				3299	PyErr_SetString(PyExc_TypeError,
				3300	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3301	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3302	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3303	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3304	if (u == NULL) {
				3305	Py_DECREF(v);
				3306	goto onError;
				3307	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3308
				3309	/* Check v in u */
				3310	if (PyUnicode_GET_SIZE(v) != 1) {
				3311	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3312	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3313	goto onError;
				3314	}
				3315	ch = *PyUnicode_AS_UNICODE(v);
				3316	p = PyUnicode_AS_UNICODE(u);
				3317	e = p + PyUnicode_GET_SIZE(u);
				3318	result = 0;
				3319	while (p < e) {
				3320	if (*p++ == ch) {
				3321	result = 1;
				3322	break;
				3323	}
				3324	}
				3325
				3326	Py_DECREF(u);
				3327	Py_DECREF(v);
				3328	return result;
				3329
				3330	onError:
				3331	Py_XDECREF(u);
				3332	Py_XDECREF(v);
				3333	return -1;
				3334	}
				3335
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3336	/* Concat to string or Unicode object giving a new Unicode object. */
				3337
				3338	PyObject PyUnicode_Concat(PyObject left,
				3339	PyObject *right)
				3340	{
				3341	PyUnicodeObject u = NULL, v = NULL, *w;
				3342
				3343	/* Coerce the two arguments */
				3344	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3345	if (u == NULL)
				3346	goto onError;
				3347	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3348	if (v == NULL)
				3349	goto onError;
				3350
				3351	/* Shortcuts */
				3352	if (v == unicode_empty) {
				3353	Py_DECREF(v);
				3354	return (PyObject *)u;
				3355	}
				3356	if (u == unicode_empty) {
				3357	Py_DECREF(u);
				3358	return (PyObject *)v;
				3359	}
				3360
				3361	/* Concat the two Unicode strings */
				3362	w = _PyUnicode_New(u->length + v->length);
				3363	if (w == NULL)
				3364	goto onError;
				3365	Py_UNICODE_COPY(w->str, u->str, u->length);
				3366	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3367
				3368	Py_DECREF(u);
				3369	Py_DECREF(v);
				3370	return (PyObject *)w;
				3371
				3372	onError:
				3373	Py_XDECREF(u);
				3374	Py_XDECREF(v);
				3375	return NULL;
				3376	}
				3377
				3378	static char count__doc__[] =
				3379	"S.count(sub[, start[, end]]) -> int\n\
				3380	\n\
				3381	Return the number of occurrences of substring sub in Unicode string\n\
				3382	S[start:end]. Optional arguments start and end are\n\
				3383	interpreted as in slice notation.";
				3384
				3385	static PyObject *
				3386	unicode_count(PyUnicodeObject self, PyObject args)
				3387	{
				3388	PyUnicodeObject *substring;
				3389	int start = 0;
				3390	int end = INT_MAX;
				3391	PyObject *result;
				3392
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3393	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3394	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3395	return NULL;
				3396
				3397	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3398	(PyObject *)substring);
				3399	if (substring == NULL)
				3400	return NULL;
				3401
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3402	if (start < 0)
				3403	start += self->length;
				3404	if (start < 0)
				3405	start = 0;
				3406	if (end > self->length)
				3407	end = self->length;
				3408	if (end < 0)
				3409	end += self->length;
				3410	if (end < 0)
				3411	end = 0;
				3412
				3413	result = PyInt_FromLong((long) count(self, start, end, substring));
				3414
				3415	Py_DECREF(substring);
				3416	return result;
				3417	}
				3418
				3419	static char encode__doc__[] =
				3420	"S.encode([encoding[,errors]]) -> string\n\
				3421	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3422	Return an encoded string version of S. Default encoding is the current\n\
				3423	default string encoding. errors may be given to set a different error\n\
				3424	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3425	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3426
				3427	static PyObject *
				3428	unicode_encode(PyUnicodeObject self, PyObject args)
				3429	{
				3430	char *encoding = NULL;
				3431	char *errors = NULL;
				3432	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3433	return NULL;
				3434	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3435	}
				3436
				3437	static char expandtabs__doc__[] =
				3438	"S.expandtabs([tabsize]) -> unicode\n\
				3439	\n\
				3440	Return a copy of S where all tab characters are expanded using spaces.\n\
				3441	If tabsize is not given, a tab size of 8 characters is assumed.";
				3442
				3443	static PyObject*
				3444	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3445	{
				3446	Py_UNICODE *e;
				3447	Py_UNICODE *p;
				3448	Py_UNICODE *q;
				3449	int i, j;
				3450	PyUnicodeObject *u;
				3451	int tabsize = 8;
				3452
				3453	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3454	return NULL;
				3455
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3456	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3457	i = j = 0;
				3458	e = self->str + self->length;
				3459	for (p = self->str; p < e; p++)
				3460	if (*p == '\t') {
				3461	if (tabsize > 0)
				3462	j += tabsize - (j % tabsize);
				3463	}
				3464	else {
				3465	j++;
				3466	if (p == '\n' \|\| p == '\r') {
				3467	i += j;
				3468	j = 0;
				3469	}
				3470	}
				3471
				3472	/* Second pass: create output string and fill it */
				3473	u = _PyUnicode_New(i + j);
				3474	if (!u)
				3475	return NULL;
				3476
				3477	j = 0;
				3478	q = u->str;
				3479
				3480	for (p = self->str; p < e; p++)
				3481	if (*p == '\t') {
				3482	if (tabsize > 0) {
				3483	i = tabsize - (j % tabsize);
				3484	j += i;
				3485	while (i--)
				3486	*q++ = ' ';
				3487	}
				3488	}
				3489	else {
				3490	j++;
				3491	q++ = p;
				3492	if (p == '\n' \|\| p == '\r')
				3493	j = 0;
				3494	}
				3495
				3496	return (PyObject*) u;
				3497	}
				3498
				3499	static char find__doc__[] =
				3500	"S.find(sub [,start [,end]]) -> int\n\
				3501	\n\
				3502	Return the lowest index in S where substring sub is found,\n\
				3503	such that sub is contained within s[start,end]. Optional\n\
				3504	arguments start and end are interpreted as in slice notation.\n\
				3505	\n\
				3506	Return -1 on failure.";
				3507
				3508	static PyObject *
				3509	unicode_find(PyUnicodeObject self, PyObject args)
				3510	{
				3511	PyUnicodeObject *substring;
				3512	int start = 0;
				3513	int end = INT_MAX;
				3514	PyObject *result;
				3515
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3516	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3517	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3518	return NULL;
				3519	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3520	(PyObject *)substring);
				3521	if (substring == NULL)
				3522	return NULL;
				3523
				3524	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3525
				3526	Py_DECREF(substring);
				3527	return result;
				3528	}
				3529
				3530	static PyObject *
				3531	unicode_getitem(PyUnicodeObject *self, int index)
				3532	{
				3533	if (index < 0 \|\| index >= self->length) {
				3534	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3535	return NULL;
				3536	}
				3537
				3538	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3539	}
				3540
				3541	static long
				3542	unicode_hash(PyUnicodeObject *self)
				3543	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3544	/* Since Unicode objects compare equal to their ASCII string
				3545	counterparts, they should use the individual character values
				3546	as basis for their hash value. This is needed to assure that
				3547	strings and Unicode objects behave in the same way as
				3548	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3549
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3550	register int len;
				3551	register Py_UNICODE *p;
				3552	register long x;
				3553
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3554	if (self->hash != -1)
				3555	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3556	len = PyUnicode_GET_SIZE(self);
				3557	p = PyUnicode_AS_UNICODE(self);
				3558	x = *p << 7;
				3559	while (--len >= 0)
				3560	x = (1000003x) ^ p++;
				3561	x ^= PyUnicode_GET_SIZE(self);
				3562	if (x == -1)
				3563	x = -2;
				3564	self->hash = x;
				3565	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3566	}
				3567
				3568	static char index__doc__[] =
				3569	"S.index(sub [,start [,end]]) -> int\n\
				3570	\n\
				3571	Like S.find() but raise ValueError when the substring is not found.";
				3572
				3573	static PyObject *
				3574	unicode_index(PyUnicodeObject self, PyObject args)
				3575	{
				3576	int result;
				3577	PyUnicodeObject *substring;
				3578	int start = 0;
				3579	int end = INT_MAX;
				3580
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3581	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3582	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3583	return NULL;
				3584
				3585	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3586	(PyObject *)substring);
				3587	if (substring == NULL)
				3588	return NULL;
				3589
				3590	result = findstring(self, substring, start, end, 1);
				3591
				3592	Py_DECREF(substring);
				3593	if (result < 0) {
				3594	PyErr_SetString(PyExc_ValueError, "substring not found");
				3595	return NULL;
				3596	}
				3597	return PyInt_FromLong(result);
				3598	}
				3599
				3600	static char islower__doc__[] =
				3601	"S.islower() -> int\n\
				3602	\n\
				3603	Return 1 if all cased characters in S are lowercase and there is\n\
				3604	at least one cased character in S, 0 otherwise.";
				3605
				3606	static PyObject*
				3607	unicode_islower(PyUnicodeObject self, PyObject args)
				3608	{
				3609	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3610	register const Py_UNICODE *e;
				3611	int cased;
				3612
				3613	if (!PyArg_NoArgs(args))
				3614	return NULL;
				3615
				3616	/* Shortcut for single character strings */
				3617	if (PyUnicode_GET_SIZE(self) == 1)
				3618	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3619
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3620	/* Special case for empty strings */
				3621	if (PyString_GET_SIZE(self) == 0)
				3622	return PyInt_FromLong(0);
				3623
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3624	e = p + PyUnicode_GET_SIZE(self);
				3625	cased = 0;
				3626	for (; p < e; p++) {
				3627	register const Py_UNICODE ch = *p;
				3628
				3629	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3630	return PyInt_FromLong(0);
				3631	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3632	cased = 1;
				3633	}
				3634	return PyInt_FromLong(cased);
				3635	}
				3636
				3637	static char isupper__doc__[] =
				3638	"S.isupper() -> int\n\
				3639	\n\
				3640	Return 1 if all cased characters in S are uppercase and there is\n\
				3641	at least one cased character in S, 0 otherwise.";
				3642
				3643	static PyObject*
				3644	unicode_isupper(PyUnicodeObject self, PyObject args)
				3645	{
				3646	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3647	register const Py_UNICODE *e;
				3648	int cased;
				3649
				3650	if (!PyArg_NoArgs(args))
				3651	return NULL;
				3652
				3653	/* Shortcut for single character strings */
				3654	if (PyUnicode_GET_SIZE(self) == 1)
				3655	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3656
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3657	/* Special case for empty strings */
				3658	if (PyString_GET_SIZE(self) == 0)
				3659	return PyInt_FromLong(0);
				3660
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3661	e = p + PyUnicode_GET_SIZE(self);
				3662	cased = 0;
				3663	for (; p < e; p++) {
				3664	register const Py_UNICODE ch = *p;
				3665
				3666	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3667	return PyInt_FromLong(0);
				3668	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3669	cased = 1;
				3670	}
				3671	return PyInt_FromLong(cased);
				3672	}
				3673
				3674	static char istitle__doc__[] =
				3675	"S.istitle() -> int\n\
				3676	\n\
				3677	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3678	may only follow uncased characters and lowercase characters only cased\n\
				3679	ones. Return 0 otherwise.";
				3680
				3681	static PyObject*
				3682	unicode_istitle(PyUnicodeObject self, PyObject args)
				3683	{
				3684	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3685	register const Py_UNICODE *e;
				3686	int cased, previous_is_cased;
				3687
				3688	if (!PyArg_NoArgs(args))
				3689	return NULL;
				3690
				3691	/* Shortcut for single character strings */
				3692	if (PyUnicode_GET_SIZE(self) == 1)
				3693	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3694	(Py_UNICODE_ISUPPER(*p) != 0));
				3695
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3696	/* Special case for empty strings */
				3697	if (PyString_GET_SIZE(self) == 0)
				3698	return PyInt_FromLong(0);
				3699
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3700	e = p + PyUnicode_GET_SIZE(self);
				3701	cased = 0;
				3702	previous_is_cased = 0;
				3703	for (; p < e; p++) {
				3704	register const Py_UNICODE ch = *p;
				3705
				3706	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3707	if (previous_is_cased)
				3708	return PyInt_FromLong(0);
				3709	previous_is_cased = 1;
				3710	cased = 1;
				3711	}
				3712	else if (Py_UNICODE_ISLOWER(ch)) {
				3713	if (!previous_is_cased)
				3714	return PyInt_FromLong(0);
				3715	previous_is_cased = 1;
				3716	cased = 1;
				3717	}
				3718	else
				3719	previous_is_cased = 0;
				3720	}
				3721	return PyInt_FromLong(cased);
				3722	}
				3723
				3724	static char isspace__doc__[] =
				3725	"S.isspace() -> int\n\
				3726	\n\
				3727	Return 1 if there are only whitespace characters in S,\n\
				3728	0 otherwise.";
				3729
				3730	static PyObject*
				3731	unicode_isspace(PyUnicodeObject self, PyObject args)
				3732	{
				3733	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3734	register const Py_UNICODE *e;
				3735
				3736	if (!PyArg_NoArgs(args))
				3737	return NULL;
				3738
				3739	/* Shortcut for single character strings */
				3740	if (PyUnicode_GET_SIZE(self) == 1 &&
				3741	Py_UNICODE_ISSPACE(*p))
				3742	return PyInt_FromLong(1);
				3743
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3744	/* Special case for empty strings */
				3745	if (PyString_GET_SIZE(self) == 0)
				3746	return PyInt_FromLong(0);
				3747
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3748	e = p + PyUnicode_GET_SIZE(self);
				3749	for (; p < e; p++) {
				3750	if (!Py_UNICODE_ISSPACE(*p))
				3751	return PyInt_FromLong(0);
				3752	}
				3753	return PyInt_FromLong(1);
				3754	}
				3755
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3756	static char isalpha__doc__[] =
				3757	"S.isalpha() -> int\n\
				3758	\n\
				3759	Return 1 if all characters in S are alphabetic\n\
				3760	and there is at least one character in S, 0 otherwise.";
				3761
				3762	static PyObject*
				3763	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3764	{
				3765	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3766	register const Py_UNICODE *e;
				3767
				3768	if (!PyArg_NoArgs(args))
				3769	return NULL;
				3770
				3771	/* Shortcut for single character strings */
				3772	if (PyUnicode_GET_SIZE(self) == 1 &&
				3773	Py_UNICODE_ISALPHA(*p))
				3774	return PyInt_FromLong(1);
				3775
				3776	/* Special case for empty strings */
				3777	if (PyString_GET_SIZE(self) == 0)
				3778	return PyInt_FromLong(0);
				3779
				3780	e = p + PyUnicode_GET_SIZE(self);
				3781	for (; p < e; p++) {
				3782	if (!Py_UNICODE_ISALPHA(*p))
				3783	return PyInt_FromLong(0);
				3784	}
				3785	return PyInt_FromLong(1);
				3786	}
				3787
				3788	static char isalnum__doc__[] =
				3789	"S.isalnum() -> int\n\
				3790	\n\
				3791	Return 1 if all characters in S are alphanumeric\n\
				3792	and there is at least one character in S, 0 otherwise.";
				3793
				3794	static PyObject*
				3795	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3796	{
				3797	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3798	register const Py_UNICODE *e;
				3799
				3800	if (!PyArg_NoArgs(args))
				3801	return NULL;
				3802
				3803	/* Shortcut for single character strings */
				3804	if (PyUnicode_GET_SIZE(self) == 1 &&
				3805	Py_UNICODE_ISALNUM(*p))
				3806	return PyInt_FromLong(1);
				3807
				3808	/* Special case for empty strings */
				3809	if (PyString_GET_SIZE(self) == 0)
				3810	return PyInt_FromLong(0);
				3811
				3812	e = p + PyUnicode_GET_SIZE(self);
				3813	for (; p < e; p++) {
				3814	if (!Py_UNICODE_ISALNUM(*p))
				3815	return PyInt_FromLong(0);
				3816	}
				3817	return PyInt_FromLong(1);
				3818	}
				3819
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3820	static char isdecimal__doc__[] =
				3821	"S.isdecimal() -> int\n\
				3822	\n\
				3823	Return 1 if there are only decimal characters in S,\n\
				3824	0 otherwise.";
				3825
				3826	static PyObject*
				3827	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3828	{
				3829	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3830	register const Py_UNICODE *e;
				3831
				3832	if (!PyArg_NoArgs(args))
				3833	return NULL;
				3834
				3835	/* Shortcut for single character strings */
				3836	if (PyUnicode_GET_SIZE(self) == 1 &&
				3837	Py_UNICODE_ISDECIMAL(*p))
				3838	return PyInt_FromLong(1);
				3839
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3840	/* Special case for empty strings */
				3841	if (PyString_GET_SIZE(self) == 0)
				3842	return PyInt_FromLong(0);
				3843
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3844	e = p + PyUnicode_GET_SIZE(self);
				3845	for (; p < e; p++) {
				3846	if (!Py_UNICODE_ISDECIMAL(*p))
				3847	return PyInt_FromLong(0);
				3848	}
				3849	return PyInt_FromLong(1);
				3850	}
				3851
				3852	static char isdigit__doc__[] =
				3853	"S.isdigit() -> int\n\
				3854	\n\
				3855	Return 1 if there are only digit characters in S,\n\
				3856	0 otherwise.";
				3857
				3858	static PyObject*
				3859	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3860	{
				3861	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3862	register const Py_UNICODE *e;
				3863
				3864	if (!PyArg_NoArgs(args))
				3865	return NULL;
				3866
				3867	/* Shortcut for single character strings */
				3868	if (PyUnicode_GET_SIZE(self) == 1 &&
				3869	Py_UNICODE_ISDIGIT(*p))
				3870	return PyInt_FromLong(1);
				3871
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3872	/* Special case for empty strings */
				3873	if (PyString_GET_SIZE(self) == 0)
				3874	return PyInt_FromLong(0);
				3875
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3876	e = p + PyUnicode_GET_SIZE(self);
				3877	for (; p < e; p++) {
				3878	if (!Py_UNICODE_ISDIGIT(*p))
				3879	return PyInt_FromLong(0);
				3880	}
				3881	return PyInt_FromLong(1);
				3882	}
				3883
				3884	static char isnumeric__doc__[] =
				3885	"S.isnumeric() -> int\n\
				3886	\n\
				3887	Return 1 if there are only numeric characters in S,\n\
				3888	0 otherwise.";
				3889
				3890	static PyObject*
				3891	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3892	{
				3893	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3894	register const Py_UNICODE *e;
				3895
				3896	if (!PyArg_NoArgs(args))
				3897	return NULL;
				3898
				3899	/* Shortcut for single character strings */
				3900	if (PyUnicode_GET_SIZE(self) == 1 &&
				3901	Py_UNICODE_ISNUMERIC(*p))
				3902	return PyInt_FromLong(1);
				3903
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3904	/* Special case for empty strings */
				3905	if (PyString_GET_SIZE(self) == 0)
				3906	return PyInt_FromLong(0);
				3907
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3908	e = p + PyUnicode_GET_SIZE(self);
				3909	for (; p < e; p++) {
				3910	if (!Py_UNICODE_ISNUMERIC(*p))
				3911	return PyInt_FromLong(0);
				3912	}
				3913	return PyInt_FromLong(1);
				3914	}
				3915
				3916	static char join__doc__[] =
				3917	"S.join(sequence) -> unicode\n\
				3918	\n\
				3919	Return a string which is the concatenation of the strings in the\n\
				3920	sequence. The separator between elements is S.";
				3921
				3922	static PyObject*
				3923	unicode_join(PyUnicodeObject self, PyObject args)
				3924	{
				3925	PyObject *data;
				3926	if (!PyArg_ParseTuple(args, "O:join", &data))
				3927	return NULL;
				3928
				3929	return PyUnicode_Join((PyObject *)self, data);
				3930	}
				3931
				3932	static int
				3933	unicode_length(PyUnicodeObject *self)
				3934	{
				3935	return self->length;
				3936	}
				3937
				3938	static char ljust__doc__[] =
				3939	"S.ljust(width) -> unicode\n\
				3940	\n\
				3941	Return S left justified in a Unicode string of length width. Padding is\n\
				3942	done using spaces.";
				3943
				3944	static PyObject *
				3945	unicode_ljust(PyUnicodeObject self, PyObject args)
				3946	{
				3947	int width;
				3948	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3949	return NULL;
				3950
				3951	if (self->length >= width) {
				3952	Py_INCREF(self);
				3953	return (PyObject*) self;
				3954	}
				3955
				3956	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3957	}
				3958
				3959	static char lower__doc__[] =
				3960	"S.lower() -> unicode\n\
				3961	\n\
				3962	Return a copy of the string S converted to lowercase.";
				3963
				3964	static PyObject*
				3965	unicode_lower(PyUnicodeObject self, PyObject args)
				3966	{
				3967	if (!PyArg_NoArgs(args))
				3968	return NULL;
				3969	return fixup(self, fixlower);
				3970	}
				3971
				3972	static char lstrip__doc__[] =
				3973	"S.lstrip() -> unicode\n\
				3974	\n\
				3975	Return a copy of the string S with leading whitespace removed.";
				3976
				3977	static PyObject *
				3978	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3979	{
				3980	if (!PyArg_NoArgs(args))
				3981	return NULL;
				3982	return strip(self, 1, 0);
				3983	}
				3984
				3985	static PyObject*
				3986	unicode_repeat(PyUnicodeObject *str, int len)
				3987	{
				3988	PyUnicodeObject *u;
				3989	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3990	int nchars;
				3991	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3992
				3993	if (len < 0)
				3994	len = 0;
				3995
				3996	if (len == 1) {
				3997	/* no repeat, return original string */
				3998	Py_INCREF(str);
				3999	return (PyObject*) str;
				4000	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4001
				4002	/* ensure # of chars needed doesn't overflow int and # of bytes
				4003	* needed doesn't overflow size_t
				4004	*/
				4005	nchars = len * str->length;
				4006	if (len && nchars / len != str->length) {
				4007	PyErr_SetString(PyExc_OverflowError,
				4008	"repeated string is too long");
				4009	return NULL;
				4010	}
				4011	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4012	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4013	PyErr_SetString(PyExc_OverflowError,
				4014	"repeated string is too long");
				4015	return NULL;
				4016	}
				4017	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4018	if (!u)
				4019	return NULL;
				4020
				4021	p = u->str;
				4022
				4023	while (len-- > 0) {
				4024	Py_UNICODE_COPY(p, str->str, str->length);
				4025	p += str->length;
				4026	}
				4027
				4028	return (PyObject*) u;
				4029	}
				4030
				4031	PyObject PyUnicode_Replace(PyObject obj,
				4032	PyObject *subobj,
				4033	PyObject *replobj,
				4034	int maxcount)
				4035	{
				4036	PyObject *self;
				4037	PyObject *str1;
				4038	PyObject *str2;
				4039	PyObject *result;
				4040
				4041	self = PyUnicode_FromObject(obj);
				4042	if (self == NULL)
				4043	return NULL;
				4044	str1 = PyUnicode_FromObject(subobj);
				4045	if (str1 == NULL) {
				4046	Py_DECREF(self);
				4047	return NULL;
				4048	}
				4049	str2 = PyUnicode_FromObject(replobj);
				4050	if (str2 == NULL) {
				4051	Py_DECREF(self);
				4052	Py_DECREF(str1);
				4053	return NULL;
				4054	}
				4055	result = replace((PyUnicodeObject *)self,
				4056	(PyUnicodeObject *)str1,
				4057	(PyUnicodeObject *)str2,
				4058	maxcount);
				4059	Py_DECREF(self);
				4060	Py_DECREF(str1);
				4061	Py_DECREF(str2);
				4062	return result;
				4063	}
				4064
				4065	static char replace__doc__[] =
				4066	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4067	\n\
				4068	Return a copy of S with all occurrences of substring\n\
				4069	old replaced by new. If the optional argument maxsplit is\n\
				4070	given, only the first maxsplit occurrences are replaced.";
				4071
				4072	static PyObject*
				4073	unicode_replace(PyUnicodeObject self, PyObject args)
				4074	{
				4075	PyUnicodeObject *str1;
				4076	PyUnicodeObject *str2;
				4077	int maxcount = -1;
				4078	PyObject *result;
				4079
				4080	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4081	return NULL;
				4082	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4083	if (str1 == NULL)
				4084	return NULL;
				4085	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4086	if (str2 == NULL)
				4087	return NULL;
				4088
				4089	result = replace(self, str1, str2, maxcount);
				4090
				4091	Py_DECREF(str1);
				4092	Py_DECREF(str2);
				4093	return result;
				4094	}
				4095
				4096	static
				4097	PyObject unicode_repr(PyObject unicode)
				4098	{
				4099	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4100	PyUnicode_GET_SIZE(unicode),
				4101	1);
				4102	}
				4103
				4104	static char rfind__doc__[] =
				4105	"S.rfind(sub [,start [,end]]) -> int\n\
				4106	\n\
				4107	Return the highest index in S where substring sub is found,\n\
				4108	such that sub is contained within s[start,end]. Optional\n\
				4109	arguments start and end are interpreted as in slice notation.\n\
				4110	\n\
				4111	Return -1 on failure.";
				4112
				4113	static PyObject *
				4114	unicode_rfind(PyUnicodeObject self, PyObject args)
				4115	{
				4116	PyUnicodeObject *substring;
				4117	int start = 0;
				4118	int end = INT_MAX;
				4119	PyObject *result;
				4120
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4121	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4122	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4123	return NULL;
				4124	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4125	(PyObject *)substring);
				4126	if (substring == NULL)
				4127	return NULL;
				4128
				4129	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4130
				4131	Py_DECREF(substring);
				4132	return result;
				4133	}
				4134
				4135	static char rindex__doc__[] =
				4136	"S.rindex(sub [,start [,end]]) -> int\n\
				4137	\n\
				4138	Like S.rfind() but raise ValueError when the substring is not found.";
				4139
				4140	static PyObject *
				4141	unicode_rindex(PyUnicodeObject self, PyObject args)
				4142	{
				4143	int result;
				4144	PyUnicodeObject *substring;
				4145	int start = 0;
				4146	int end = INT_MAX;
				4147
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4148	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4149	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4150	return NULL;
				4151	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4152	(PyObject *)substring);
				4153	if (substring == NULL)
				4154	return NULL;
				4155
				4156	result = findstring(self, substring, start, end, -1);
				4157
				4158	Py_DECREF(substring);
				4159	if (result < 0) {
				4160	PyErr_SetString(PyExc_ValueError, "substring not found");
				4161	return NULL;
				4162	}
				4163	return PyInt_FromLong(result);
				4164	}
				4165
				4166	static char rjust__doc__[] =
				4167	"S.rjust(width) -> unicode\n\
				4168	\n\
				4169	Return S right justified in a Unicode string of length width. Padding is\n\
				4170	done using spaces.";
				4171
				4172	static PyObject *
				4173	unicode_rjust(PyUnicodeObject self, PyObject args)
				4174	{
				4175	int width;
				4176	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4177	return NULL;
				4178
				4179	if (self->length >= width) {
				4180	Py_INCREF(self);
				4181	return (PyObject*) self;
				4182	}
				4183
				4184	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4185	}
				4186
				4187	static char rstrip__doc__[] =
				4188	"S.rstrip() -> unicode\n\
				4189	\n\
				4190	Return a copy of the string S with trailing whitespace removed.";
				4191
				4192	static PyObject *
				4193	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4194	{
				4195	if (!PyArg_NoArgs(args))
				4196	return NULL;
				4197	return strip(self, 0, 1);
				4198	}
				4199
				4200	static PyObject*
				4201	unicode_slice(PyUnicodeObject *self, int start, int end)
				4202	{
				4203	/* standard clamping */
				4204	if (start < 0)
				4205	start = 0;
				4206	if (end < 0)
				4207	end = 0;
				4208	if (end > self->length)
				4209	end = self->length;
				4210	if (start == 0 && end == self->length) {
				4211	/* full slice, return original string */
				4212	Py_INCREF(self);
				4213	return (PyObject*) self;
				4214	}
				4215	if (start > end)
				4216	start = end;
				4217	/* copy slice */
				4218	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4219	end - start);
				4220	}
				4221
				4222	PyObject PyUnicode_Split(PyObject s,
				4223	PyObject *sep,
				4224	int maxsplit)
				4225	{
				4226	PyObject *result;
				4227
				4228	s = PyUnicode_FromObject(s);
				4229	if (s == NULL)
				4230	return NULL;
				4231	if (sep != NULL) {
				4232	sep = PyUnicode_FromObject(sep);
				4233	if (sep == NULL) {
				4234	Py_DECREF(s);
				4235	return NULL;
				4236	}
				4237	}
				4238
				4239	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4240
				4241	Py_DECREF(s);
				4242	Py_XDECREF(sep);
				4243	return result;
				4244	}
				4245
				4246	static char split__doc__[] =
				4247	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4248	\n\
				4249	Return a list of the words in S, using sep as the\n\
				4250	delimiter string. If maxsplit is given, at most maxsplit\n\
				4251	splits are done. If sep is not specified, any whitespace string\n\
				4252	is a separator.";
				4253
				4254	static PyObject*
				4255	unicode_split(PyUnicodeObject self, PyObject args)
				4256	{
				4257	PyObject *substring = Py_None;
				4258	int maxcount = -1;
				4259
				4260	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4261	return NULL;
				4262
				4263	if (substring == Py_None)
				4264	return split(self, NULL, maxcount);
				4265	else if (PyUnicode_Check(substring))
				4266	return split(self, (PyUnicodeObject *)substring, maxcount);
				4267	else
				4268	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4269	}
				4270
				4271	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4272	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4273	\n\
				4274	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4275	Line breaks are not included in the resulting list unless keepends\n\
				4276	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4277
				4278	static PyObject*
				4279	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4280	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4281	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4282
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4283	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4284	return NULL;
				4285
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4286	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4287	}
				4288
				4289	static
				4290	PyObject unicode_str(PyUnicodeObject self)
				4291	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4292	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4293	}
				4294
				4295	static char strip__doc__[] =
				4296	"S.strip() -> unicode\n\
				4297	\n\
				4298	Return a copy of S with leading and trailing whitespace removed.";
				4299
				4300	static PyObject *
				4301	unicode_strip(PyUnicodeObject self, PyObject args)
				4302	{
				4303	if (!PyArg_NoArgs(args))
				4304	return NULL;
				4305	return strip(self, 1, 1);
				4306	}
				4307
				4308	static char swapcase__doc__[] =
				4309	"S.swapcase() -> unicode\n\
				4310	\n\
				4311	Return a copy of S with uppercase characters converted to lowercase\n\
				4312	and vice versa.";
				4313
				4314	static PyObject*
				4315	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4316	{
				4317	if (!PyArg_NoArgs(args))
				4318	return NULL;
				4319	return fixup(self, fixswapcase);
				4320	}
				4321
				4322	static char translate__doc__[] =
				4323	"S.translate(table) -> unicode\n\
				4324	\n\
				4325	Return a copy of the string S, where all characters have been mapped\n\
				4326	through the given translation table, which must be a mapping of\n\
				4327	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4328	are left untouched. Characters mapped to None are deleted.";
				4329
				4330	static PyObject*
				4331	unicode_translate(PyUnicodeObject self, PyObject args)
				4332	{
				4333	PyObject *table;
				4334
				4335	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4336	return NULL;
				4337	return PyUnicode_TranslateCharmap(self->str,
				4338	self->length,
				4339	table,
				4340	"ignore");
				4341	}
				4342
				4343	static char upper__doc__[] =
				4344	"S.upper() -> unicode\n\
				4345	\n\
				4346	Return a copy of S converted to uppercase.";
				4347
				4348	static PyObject*
				4349	unicode_upper(PyUnicodeObject self, PyObject args)
				4350	{
				4351	if (!PyArg_NoArgs(args))
				4352	return NULL;
				4353	return fixup(self, fixupper);
				4354	}
				4355
				4356	#if 0
				4357	static char zfill__doc__[] =
				4358	"S.zfill(width) -> unicode\n\
				4359	\n\
				4360	Pad a numeric string x with zeros on the left, to fill a field\n\
				4361	of the specified width. The string x is never truncated.";
				4362
				4363	static PyObject *
				4364	unicode_zfill(PyUnicodeObject self, PyObject args)
				4365	{
				4366	int fill;
				4367	PyUnicodeObject *u;
				4368
				4369	int width;
				4370	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4371	return NULL;
				4372
				4373	if (self->length >= width) {
				4374	Py_INCREF(self);
				4375	return (PyObject*) self;
				4376	}
				4377
				4378	fill = width - self->length;
				4379
				4380	u = pad(self, fill, 0, '0');
				4381
				4382	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4383	/* move sign to beginning of string */
				4384	u->str[0] = u->str[fill];
				4385	u->str[fill] = '0';
				4386	}
				4387
				4388	return (PyObject*) u;
				4389	}
				4390	#endif
				4391
				4392	#if 0
				4393	static PyObject*
				4394	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4395	{
				4396	if (!PyArg_NoArgs(args))
				4397	return NULL;
				4398	return PyInt_FromLong(unicode_freelist_size);
				4399	}
				4400	#endif
				4401
				4402	static char startswith__doc__[] =
				4403	"S.startswith(prefix[, start[, end]]) -> int\n\
				4404	\n\
				4405	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4406	optional start, test S beginning at that position. With optional end, stop\n\
				4407	comparing S at that position.";
				4408
				4409	static PyObject *
				4410	unicode_startswith(PyUnicodeObject *self,
				4411	PyObject *args)
				4412	{
				4413	PyUnicodeObject *substring;
				4414	int start = 0;
				4415	int end = INT_MAX;
				4416	PyObject *result;
				4417
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4418	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4419	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4420	return NULL;
				4421	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4422	(PyObject *)substring);
				4423	if (substring == NULL)
				4424	return NULL;
				4425
				4426	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4427
				4428	Py_DECREF(substring);
				4429	return result;
				4430	}
				4431
				4432
				4433	static char endswith__doc__[] =
				4434	"S.endswith(suffix[, start[, end]]) -> int\n\
				4435	\n\
				4436	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4437	optional start, test S beginning at that position. With optional end, stop\n\
				4438	comparing S at that position.";
				4439
				4440	static PyObject *
				4441	unicode_endswith(PyUnicodeObject *self,
				4442	PyObject *args)
				4443	{
				4444	PyUnicodeObject *substring;
				4445	int start = 0;
				4446	int end = INT_MAX;
				4447	PyObject *result;
				4448
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4449	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4450	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4451	return NULL;
				4452	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4453	(PyObject *)substring);
				4454	if (substring == NULL)
				4455	return NULL;
				4456
				4457	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4458
				4459	Py_DECREF(substring);
				4460	return result;
				4461	}
				4462
				4463
				4464	static PyMethodDef unicode_methods[] = {
				4465
				4466	/* Order is according to common usage: often used methods should
				4467	appear first, since lookup is done sequentially. */
				4468
				4469	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4470	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4471	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4472	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4473	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4474	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4475	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4476	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4477	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4478	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4479	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4480	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4481	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4482	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4483	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4484	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4485	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4486	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4487	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4488	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4489	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4490	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4491	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4492	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4493	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4494	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4495	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4496	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4497	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4498	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4499	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4500	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4501	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4502	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4503	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4504	#if 0
				4505	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4506	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4507	#endif
				4508
				4509	#if 0
				4510	/* This one is just used for debugging the implementation. */
				4511	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4512	#endif
				4513
				4514	{NULL, NULL}
				4515	};
				4516
				4517	static PyObject *
				4518	unicode_getattr(PyUnicodeObject self, char name)
				4519	{
				4520	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4521	}
				4522
				4523	static PySequenceMethods unicode_as_sequence = {
				4524	(inquiry) unicode_length, /* sq_length */
				4525	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4526	(intargfunc) unicode_repeat, /* sq_repeat */
				4527	(intargfunc) unicode_getitem, /* sq_item */
				4528	(intintargfunc) unicode_slice, /* sq_slice */
				4529	0, /* sq_ass_item */
				4530	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4531	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4532	};
				4533
				4534	static int
				4535	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4536	int index,
				4537	const void **ptr)
				4538	{
				4539	if (index != 0) {
				4540	PyErr_SetString(PyExc_SystemError,
				4541	"accessing non-existent unicode segment");
				4542	return -1;
				4543	}
				4544	ptr = (void ) self->str;
				4545	return PyUnicode_GET_DATA_SIZE(self);
				4546	}
				4547
				4548	static int
				4549	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4550	const void **ptr)
				4551	{
				4552	PyErr_SetString(PyExc_TypeError,
				4553	"cannot use unicode as modifyable buffer");
				4554	return -1;
				4555	}
				4556
				4557	static int
				4558	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4559	int *lenp)
				4560	{
				4561	if (lenp)
				4562	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4563	return 1;
				4564	}
				4565
				4566	static int
				4567	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4568	int index,
				4569	const void **ptr)
				4570	{
				4571	PyObject *str;
				4572
				4573	if (index != 0) {
				4574	PyErr_SetString(PyExc_SystemError,
				4575	"accessing non-existent unicode segment");
				4576	return -1;
				4577	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4578	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4579	if (str == NULL)
				4580	return -1;
				4581	ptr = (void ) PyString_AS_STRING(str);
				4582	return PyString_GET_SIZE(str);
				4583	}
				4584
				4585	/* Helpers for PyUnicode_Format() */
				4586
				4587	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4588	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4589	{
				4590	int argidx = *p_argidx;
				4591	if (argidx < arglen) {
				4592	(*p_argidx)++;
				4593	if (arglen < 0)
				4594	return args;
				4595	else
				4596	return PyTuple_GetItem(args, argidx);
				4597	}
				4598	PyErr_SetString(PyExc_TypeError,
				4599	"not enough arguments for format string");
				4600	return NULL;
				4601	}
				4602
				4603	#define F_LJUST (1<<0)
				4604	#define F_SIGN (1<<1)
				4605	#define F_BLANK (1<<2)
				4606	#define F_ALT (1<<3)
				4607	#define F_ZERO (1<<4)
				4608
				4609	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4610	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4611	{
				4612	register int i;
				4613	int len;
				4614	va_list va;
				4615	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4616	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4617
				4618	/* First, format the string as char array, then expand to Py_UNICODE
				4619	array. */
				4620	charbuffer = (char *)buffer;
				4621	len = vsprintf(charbuffer, format, va);
				4622	for (i = len - 1; i >= 0; i--)
				4623	buffer[i] = (Py_UNICODE) charbuffer[i];
				4624
				4625	va_end(va);
				4626	return len;
				4627	}
				4628
				4629	static int
				4630	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4631	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4632	int flags,
				4633	int prec,
				4634	int type,
				4635	PyObject *v)
				4636	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4637	/* fmt = '%#.' + `prec` + `type`
				4638	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4639	char fmt[20];
				4640	double x;
				4641
				4642	x = PyFloat_AsDouble(v);
				4643	if (x == -1.0 && PyErr_Occurred())
				4644	return -1;
				4645	if (prec < 0)
				4646	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4647	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4648	type = 'g';
				4649	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4650	/* worst case length calc to ensure no buffer overrun:
				4651	fmt = %#.<prec>g
				4652	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4653	for any double rep.)
				4654	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4655	If prec=0 the effective precision is 1 (the leading digit is
				4656	always given), therefore increase by one to 10+prec. */
				4657	if (buflen <= (size_t)10 + (size_t)prec) {
				4658	PyErr_SetString(PyExc_OverflowError,
				4659	"formatted float is too long (precision too long?)");
				4660	return -1;
				4661	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4662	return usprintf(buf, fmt, x);
				4663	}
				4664
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4665	static PyObject*
				4666	formatlong(PyObject *val, int flags, int prec, int type)
				4667	{
				4668	char *buf;
				4669	int i, len;
				4670	PyObject str; / temporary string object. */
				4671	PyUnicodeObject *result;
				4672
				4673	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4674	if (!str)
				4675	return NULL;
				4676	result = _PyUnicode_New(len);
				4677	for (i = 0; i < len; i++)
				4678	result->str[i] = buf[i];
				4679	result->str[len] = 0;
				4680	Py_DECREF(str);
				4681	return (PyObject*)result;
				4682	}
				4683
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4684	static int
				4685	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4686	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4687	int flags,
				4688	int prec,
				4689	int type,
				4690	PyObject *v)
				4691	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4692	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4693	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4694	+ 1 + 1 = 24*/
				4695	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4696	long x;
				4697
				4698	x = PyInt_AsLong(v);
				4699	if (x == -1 && PyErr_Occurred())
				4700	return -1;
				4701	if (prec < 0)
				4702	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4703	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4704	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4705	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4706	PyErr_SetString(PyExc_OverflowError,
				4707	"formatted integer is too long (precision too long?)");
				4708	return -1;
				4709	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4710	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4711	return usprintf(buf, fmt, x);
				4712	}
				4713
				4714	static int
				4715	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4716	size_t buflen,
				4717	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4718	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4719	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4720	if (PyUnicode_Check(v)) {
				4721	if (PyUnicode_GET_SIZE(v) != 1)
				4722	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4723	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4724	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4725
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4726	else if (PyString_Check(v)) {
				4727	if (PyString_GET_SIZE(v) != 1)
				4728	goto onError;
				4729	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4730	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4731
				4732	else {
				4733	/* Integer input truncated to a character */
				4734	long x;
				4735	x = PyInt_AsLong(v);
				4736	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4737	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4738	buf[0] = (char) x;
				4739	}
				4740	buf[1] = '\0';
				4741	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4742
				4743	onError:
				4744	PyErr_SetString(PyExc_TypeError,
				4745	"%c requires int or char");
				4746	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4747	}
				4748
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4749	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4750
				4751	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4752	chars are formatted. XXX This is a magic number. Each formatting
				4753	routine does bounds checking to ensure no overflow, but a better
				4754	solution may be to malloc a buffer of appropriate size for each
				4755	format. For now, the current solution is sufficient.
				4756	*/
				4757	#define FORMATBUFLEN (size_t)120
				4758
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4759	PyObject PyUnicode_Format(PyObject format,
				4760	PyObject *args)
				4761	{
				4762	Py_UNICODE fmt, res;
				4763	int fmtcnt, rescnt, reslen, arglen, argidx;
				4764	int args_owned = 0;
				4765	PyUnicodeObject *result = NULL;
				4766	PyObject *dict = NULL;
				4767	PyObject *uformat;
				4768
				4769	if (format == NULL \|\| args == NULL) {
				4770	PyErr_BadInternalCall();
				4771	return NULL;
				4772	}
				4773	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4774	if (uformat == NULL)
				4775	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4776	fmt = PyUnicode_AS_UNICODE(uformat);
				4777	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4778
				4779	reslen = rescnt = fmtcnt + 100;
				4780	result = _PyUnicode_New(reslen);
				4781	if (result == NULL)
				4782	goto onError;
				4783	res = PyUnicode_AS_UNICODE(result);
				4784
				4785	if (PyTuple_Check(args)) {
				4786	arglen = PyTuple_Size(args);
				4787	argidx = 0;
				4788	}
				4789	else {
				4790	arglen = -1;
				4791	argidx = -2;
				4792	}
				4793	if (args->ob_type->tp_as_mapping)
				4794	dict = args;
				4795
				4796	while (--fmtcnt >= 0) {
				4797	if (*fmt != '%') {
				4798	if (--rescnt < 0) {
				4799	rescnt = fmtcnt + 100;
				4800	reslen += rescnt;
				4801	if (_PyUnicode_Resize(result, reslen) < 0)
				4802	return NULL;
				4803	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4804	--rescnt;
				4805	}
				4806	res++ = fmt++;
				4807	}
				4808	else {
				4809	/* Got a format specifier */
				4810	int flags = 0;
				4811	int width = -1;
				4812	int prec = -1;
				4813	int size = 0;
				4814	Py_UNICODE c = '\0';
				4815	Py_UNICODE fill;
				4816	PyObject *v = NULL;
				4817	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4818	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4819	Py_UNICODE sign;
				4820	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4821	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4822
				4823	fmt++;
				4824	if (*fmt == '(') {
				4825	Py_UNICODE *keystart;
				4826	int keylen;
				4827	PyObject *key;
				4828	int pcount = 1;
				4829
				4830	if (dict == NULL) {
				4831	PyErr_SetString(PyExc_TypeError,
				4832	"format requires a mapping");
				4833	goto onError;
				4834	}
				4835	++fmt;
				4836	--fmtcnt;
				4837	keystart = fmt;
				4838	/* Skip over balanced parentheses */
				4839	while (pcount > 0 && --fmtcnt >= 0) {
				4840	if (*fmt == ')')
				4841	--pcount;
				4842	else if (*fmt == '(')
				4843	++pcount;
				4844	fmt++;
				4845	}
				4846	keylen = fmt - keystart - 1;
				4847	if (fmtcnt < 0 \|\| pcount > 0) {
				4848	PyErr_SetString(PyExc_ValueError,
				4849	"incomplete format key");
				4850	goto onError;
				4851	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4852	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4853	then looked up since Python uses strings to hold
				4854	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4855	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4856	key = PyUnicode_EncodeUTF8(keystart,
				4857	keylen,
				4858	NULL);
				4859	if (key == NULL)
				4860	goto onError;
				4861	if (args_owned) {
				4862	Py_DECREF(args);
				4863	args_owned = 0;
				4864	}
				4865	args = PyObject_GetItem(dict, key);
				4866	Py_DECREF(key);
				4867	if (args == NULL) {
				4868	goto onError;
				4869	}
				4870	args_owned = 1;
				4871	arglen = -1;
				4872	argidx = -2;
				4873	}
				4874	while (--fmtcnt >= 0) {
				4875	switch (c = *fmt++) {
				4876	case '-': flags \|= F_LJUST; continue;
				4877	case '+': flags \|= F_SIGN; continue;
				4878	case ' ': flags \|= F_BLANK; continue;
				4879	case '#': flags \|= F_ALT; continue;
				4880	case '0': flags \|= F_ZERO; continue;
				4881	}
				4882	break;
				4883	}
				4884	if (c == '*') {
				4885	v = getnextarg(args, arglen, &argidx);
				4886	if (v == NULL)
				4887	goto onError;
				4888	if (!PyInt_Check(v)) {
				4889	PyErr_SetString(PyExc_TypeError,
				4890	"* wants int");
				4891	goto onError;
				4892	}
				4893	width = PyInt_AsLong(v);
				4894	if (width < 0) {
				4895	flags \|= F_LJUST;
				4896	width = -width;
				4897	}
				4898	if (--fmtcnt >= 0)
				4899	c = *fmt++;
				4900	}
				4901	else if (c >= '0' && c <= '9') {
				4902	width = c - '0';
				4903	while (--fmtcnt >= 0) {
				4904	c = *fmt++;
				4905	if (c < '0' \|\| c > '9')
				4906	break;
				4907	if ((width*10) / 10 != width) {
				4908	PyErr_SetString(PyExc_ValueError,
				4909	"width too big");
				4910	goto onError;
				4911	}
				4912	width = width*10 + (c - '0');
				4913	}
				4914	}
				4915	if (c == '.') {
				4916	prec = 0;
				4917	if (--fmtcnt >= 0)
				4918	c = *fmt++;
				4919	if (c == '*') {
				4920	v = getnextarg(args, arglen, &argidx);
				4921	if (v == NULL)
				4922	goto onError;
				4923	if (!PyInt_Check(v)) {
				4924	PyErr_SetString(PyExc_TypeError,
				4925	"* wants int");
				4926	goto onError;
				4927	}
				4928	prec = PyInt_AsLong(v);
				4929	if (prec < 0)
				4930	prec = 0;
				4931	if (--fmtcnt >= 0)
				4932	c = *fmt++;
				4933	}
				4934	else if (c >= '0' && c <= '9') {
				4935	prec = c - '0';
				4936	while (--fmtcnt >= 0) {
				4937	c = Py_CHARMASK(*fmt++);
				4938	if (c < '0' \|\| c > '9')
				4939	break;
				4940	if ((prec*10) / 10 != prec) {
				4941	PyErr_SetString(PyExc_ValueError,
				4942	"prec too big");
				4943	goto onError;
				4944	}
				4945	prec = prec*10 + (c - '0');
				4946	}
				4947	}
				4948	} /* prec */
				4949	if (fmtcnt >= 0) {
				4950	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4951	size = c;
				4952	if (--fmtcnt >= 0)
				4953	c = *fmt++;
				4954	}
				4955	}
				4956	if (fmtcnt < 0) {
				4957	PyErr_SetString(PyExc_ValueError,
				4958	"incomplete format");
				4959	goto onError;
				4960	}
				4961	if (c != '%') {
				4962	v = getnextarg(args, arglen, &argidx);
				4963	if (v == NULL)
				4964	goto onError;
				4965	}
				4966	sign = 0;
				4967	fill = ' ';
				4968	switch (c) {
				4969
				4970	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4971	pbuf = formatbuf;
				4972	/* presume that buffer length is at least 1 */
				4973	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4974	len = 1;
				4975	break;
				4976
				4977	case 's':
				4978	case 'r':
				4979	if (PyUnicode_Check(v) && c == 's') {
				4980	temp = v;
				4981	Py_INCREF(temp);
				4982	}
				4983	else {
				4984	PyObject *unicode;
				4985	if (c == 's')
				4986	temp = PyObject_Str(v);
				4987	else
				4988	temp = PyObject_Repr(v);
				4989	if (temp == NULL)
				4990	goto onError;
				4991	if (!PyString_Check(temp)) {
				4992	/* XXX Note: this should never happen, since
				4993	PyObject_Repr() and PyObject_Str() assure
				4994	this */
				4995	Py_DECREF(temp);
				4996	PyErr_SetString(PyExc_TypeError,
				4997	"%s argument has non-string str()");
				4998	goto onError;
				4999	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5000	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5001	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5002	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5003	"strict");
				5004	Py_DECREF(temp);
				5005	temp = unicode;
				5006	if (temp == NULL)
				5007	goto onError;
				5008	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5009	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5010	len = PyUnicode_GET_SIZE(temp);
				5011	if (prec >= 0 && len > prec)
				5012	len = prec;
				5013	break;
				5014
				5015	case 'i':
				5016	case 'd':
				5017	case 'u':
				5018	case 'o':
				5019	case 'x':
				5020	case 'X':
				5021	if (c == 'i')
				5022	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5023	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5024	temp = formatlong(v, flags, prec, c);
				5025	if (!temp)
				5026	goto onError;
				5027	pbuf = PyUnicode_AS_UNICODE(temp);
				5028	len = PyUnicode_GET_SIZE(temp);
				5029	/* unbounded ints can always produce
				5030	a sign character! */
				5031	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5032	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5033	else {
				5034	pbuf = formatbuf;
				5035	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5036	flags, prec, c, v);
				5037	if (len < 0)
				5038	goto onError;
				5039	/* only d conversion is signed */
				5040	sign = c == 'd';
				5041	}
				5042	if (flags & F_ZERO)
				5043	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5044	break;
				5045
				5046	case 'e':
				5047	case 'E':
				5048	case 'f':
				5049	case 'g':
				5050	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5051	pbuf = formatbuf;
				5052	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5053	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5054	if (len < 0)
				5055	goto onError;
				5056	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5057	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5058	fill = '0';
				5059	break;
				5060
				5061	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5062	pbuf = formatbuf;
				5063	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5064	if (len < 0)
				5065	goto onError;
				5066	break;
				5067
				5068	default:
				5069	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame^]	5070	"unsupported format character '%c' (0x%x) "
				5071	"at index %i",
				5072	c, c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5073	goto onError;
				5074	}
				5075	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5076	if (pbuf == '-' \|\| pbuf == '+') {
				5077	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5078	len--;
				5079	}
				5080	else if (flags & F_SIGN)
				5081	sign = '+';
				5082	else if (flags & F_BLANK)
				5083	sign = ' ';
				5084	else
				5085	sign = 0;
				5086	}
				5087	if (width < len)
				5088	width = len;
				5089	if (rescnt < width + (sign != 0)) {
				5090	reslen -= rescnt;
				5091	rescnt = width + fmtcnt + 100;
				5092	reslen += rescnt;
				5093	if (_PyUnicode_Resize(result, reslen) < 0)
				5094	return NULL;
				5095	res = PyUnicode_AS_UNICODE(result)
				5096	+ reslen - rescnt;
				5097	}
				5098	if (sign) {
				5099	if (fill != ' ')
				5100	*res++ = sign;
				5101	rescnt--;
				5102	if (width > len)
				5103	width--;
				5104	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5105	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5106	assert(pbuf[0] == '0');
				5107	assert(pbuf[1] == c);
				5108	if (fill != ' ') {
				5109	res++ = pbuf++;
				5110	res++ = pbuf++;
				5111	}
				5112	rescnt -= 2;
				5113	width -= 2;
				5114	if (width < 0)
				5115	width = 0;
				5116	len -= 2;
				5117	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5118	if (width > len && !(flags & F_LJUST)) {
				5119	do {
				5120	--rescnt;
				5121	*res++ = fill;
				5122	} while (--width > len);
				5123	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5124	if (fill == ' ') {
				5125	if (sign)
				5126	*res++ = sign;
				5127	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5128	assert(pbuf[0] == '0');
				5129	assert(pbuf[1] == c);
				5130	res++ = pbuf++;
				5131	res++ = pbuf++;
				5132	}
				5133	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5134	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5135	res += len;
				5136	rescnt -= len;
				5137	while (--width >= len) {
				5138	--rescnt;
				5139	*res++ = ' ';
				5140	}
				5141	if (dict && (argidx < arglen) && c != '%') {
				5142	PyErr_SetString(PyExc_TypeError,
				5143	"not all arguments converted");
				5144	goto onError;
				5145	}
				5146	Py_XDECREF(temp);
				5147	} /* '%' */
				5148	} /* until end */
				5149	if (argidx < arglen && !dict) {
				5150	PyErr_SetString(PyExc_TypeError,
				5151	"not all arguments converted");
				5152	goto onError;
				5153	}
				5154
				5155	if (args_owned) {
				5156	Py_DECREF(args);
				5157	}
				5158	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5159	if (_PyUnicode_Resize(result, reslen - rescnt))
				5160	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5161	return (PyObject *)result;
				5162
				5163	onError:
				5164	Py_XDECREF(result);
				5165	Py_DECREF(uformat);
				5166	if (args_owned) {
				5167	Py_DECREF(args);
				5168	}
				5169	return NULL;
				5170	}
				5171
				5172	static PyBufferProcs unicode_as_buffer = {
				5173	(getreadbufferproc) unicode_buffer_getreadbuf,
				5174	(getwritebufferproc) unicode_buffer_getwritebuf,
				5175	(getsegcountproc) unicode_buffer_getsegcount,
				5176	(getcharbufferproc) unicode_buffer_getcharbuf,
				5177	};
				5178
				5179	PyTypeObject PyUnicode_Type = {
				5180	PyObject_HEAD_INIT(&PyType_Type)
				5181	0, /* ob_size */
				5182	"unicode", /* tp_name */
				5183	sizeof(PyUnicodeObject), /* tp_size */
				5184	0, /* tp_itemsize */
				5185	/* Slots */
				5186	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5187	0, /* tp_print */
				5188	(getattrfunc)unicode_getattr, /* tp_getattr */
				5189	0, /* tp_setattr */
				5190	(cmpfunc) unicode_compare, /* tp_compare */
				5191	(reprfunc) unicode_repr, /* tp_repr */
				5192	0, /* tp_as_number */
				5193	&unicode_as_sequence, /* tp_as_sequence */
				5194	0, /* tp_as_mapping */
				5195	(hashfunc) unicode_hash, /* tp_hash*/
				5196	0, /* tp_call*/
				5197	(reprfunc) unicode_str, /* tp_str */
				5198	(getattrofunc) NULL, /* tp_getattro */
				5199	(setattrofunc) NULL, /* tp_setattro */
				5200	&unicode_as_buffer, /* tp_as_buffer */
				5201	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5202	};
				5203
				5204	/* Initialize the Unicode implementation */
				5205
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5206	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5207	{
				5208	/* Doublecheck the configuration... */
				5209	if (sizeof(Py_UNICODE) != 2)
				5210	Py_FatalError("Unicode configuration error: "
				5211	"sizeof(Py_UNICODE) != 2 bytes");
				5212
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5213	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5214	unicode_freelist = NULL;
				5215	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5216	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5217	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5218	}
				5219
				5220	/* Finalize the Unicode implementation */
				5221
				5222	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5223	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5224	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5225	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5226
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5227	Py_XDECREF(unicode_empty);
				5228	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5229
				5230	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5231	PyUnicodeObject *v = u;
				5232	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5233	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5234	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5235	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5236	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5237	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5238	unicode_freelist = NULL;
				5239	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5240	}