Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: a3678d52e8c52e69cb55ba6cebf5c4baf1952b95 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	70	#ifdef MS_WIN32
				71	#include <windows.h>
				72	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	73
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	74	/* Limit for the Unicode object free list */
				75
				76	#define MAX_UNICODE_FREELIST_SIZE 1024
				77
				78	/* Limit for the Unicode object free list stay alive optimization.
				79
				80	The implementation will keep allocated Unicode memory intact for
				81	all objects on the free list having a size less than this
				82	limit. This reduces malloc() overhead for small Unicode objects.
				83
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	84	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	85	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	malloc()-overhead) bytes of unused garbage.
				87
				88	Setting the limit to 0 effectively turns the feature off.
				89
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	90	Note: This is an experimental feature ! If you get core dumps when
				91	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
				93	*/
				94
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	95	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	96
				97	/* Endianness switches; defaults to little endian */
				98
				99	#ifdef WORDS_BIGENDIAN
				100	# define BYTEORDER_IS_BIG_ENDIAN
				101	#else
				102	# define BYTEORDER_IS_LITTLE_ENDIAN
				103	#endif
				104
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	105	/* --- Globals ------------------------------------------------------------
				106
				107	The globals are initialized by the _PyUnicode_Init() API and should
				108	not be used before calling that API.
				109
				110	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	111
				112	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	113	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	116	static PyUnicodeObject *unicode_freelist;
				117	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	119	/* Default encoding to use and assume when NULL is passed as encoding
				120	parameter; it is initialized by _PyUnicode_Init().
				121
				122	Always use the PyUnicode_SetDefaultEncoding() and
				123	PyUnicode_GetDefaultEncoding() APIs to access this global.
				124
				125	*/
				126
				127	static char unicode_default_encoding[100];
				128
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	/* --- Unicode Object ----------------------------------------------------- */
				130
				131	static
				132	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				133	int length)
				134	{
				135	void *oldstr;
				136
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	137	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	138	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	139	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	140
				141	/* Resizing unicode_empty is not allowed. */
				142	if (unicode == unicode_empty) {
				143	PyErr_SetString(PyExc_SystemError,
				144	"can't resize empty unicode object");
				145	return -1;
				146	}
				147
				148	/* We allocate one more byte to make sure the string is
				149	Ux0000 terminated -- XXX is this needed ? */
				150	oldstr = unicode->str;
				151	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				152	if (!unicode->str) {
				153	unicode->str = oldstr;
				154	PyErr_NoMemory();
				155	return -1;
				156	}
				157	unicode->str[length] = 0;
				158	unicode->length = length;
				159
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	160	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	161	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	162	if (unicode->defenc) {
				163	Py_DECREF(unicode->defenc);
				164	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	165	}
				166	unicode->hash = -1;
				167
				168	return 0;
				169	}
				170
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	171	int PyUnicode_Resize(PyObject **unicode,
				172	int length)
				173	{
				174	PyUnicodeObject *v;
				175
				176	if (unicode == NULL) {
				177	PyErr_BadInternalCall();
				178	return -1;
				179	}
				180	v = (PyUnicodeObject )unicode;
				181	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				182	PyErr_BadInternalCall();
				183	return -1;
				184	}
				185	return _PyUnicode_Resize(v, length);
				186	}
				187
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	/* We allocate one more byte to make sure the string is
				189	Ux0000 terminated -- XXX is this needed ?
				190
				191	XXX This allocator could further be enhanced by assuring that the
				192	free list never reduces its size below 1.
				193
				194	*/
				195
				196	static
				197	PyUnicodeObject *_PyUnicode_New(int length)
				198	{
				199	register PyUnicodeObject *unicode;
				200
				201	/* Optimization for empty strings */
				202	if (length == 0 && unicode_empty != NULL) {
				203	Py_INCREF(unicode_empty);
				204	return unicode_empty;
				205	}
				206
				207	/* Unicode freelist & memory allocation */
				208	if (unicode_freelist) {
				209	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	210	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	212	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	213	/* Keep-Alive optimization: we only upsize the buffer,
				214	never downsize it. */
				215	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	216	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	217	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	218	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	}
				220	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	221	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	223	}
				224	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	else {
				227	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				228	if (unicode == NULL)
				229	return NULL;
				230	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				231	}
				232
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	233	if (!unicode->str) {
				234	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	235	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	236	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str[length] = 0;
				238	unicode->length = length;
				239	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	240	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242
				243	onError:
				244	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	246	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	}
				248
				249	static
				250	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				251	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	253	/* Keep-Alive optimization */
				254	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	255	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	256	unicode->str = NULL;
				257	unicode->length = 0;
				258	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	259	if (unicode->defenc) {
				260	Py_DECREF(unicode->defenc);
				261	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	262	}
				263	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	264	(PyUnicodeObject *)unicode = unicode_freelist;
				265	unicode_freelist = unicode;
				266	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	267	}
				268	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	269	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	270	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	271	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	}
				274
				275	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				276	int size)
				277	{
				278	PyUnicodeObject *unicode;
				279
				280	unicode = _PyUnicode_New(size);
				281	if (!unicode)
				282	return NULL;
				283
				284	/* Copy the Unicode data into the new object */
				285	if (u != NULL)
				286	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				287
				288	return (PyObject *)unicode;
				289	}
				290
				291	#ifdef HAVE_WCHAR_H
				292
				293	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				294	int size)
				295	{
				296	PyUnicodeObject *unicode;
				297
				298	if (w == NULL) {
				299	PyErr_BadInternalCall();
				300	return NULL;
				301	}
				302
				303	unicode = _PyUnicode_New(size);
				304	if (!unicode)
				305	return NULL;
				306
				307	/* Copy the wchar_t data into the new object */
				308	#ifdef HAVE_USABLE_WCHAR_T
				309	memcpy(unicode->str, w, size * sizeof(wchar_t));
				310	#else
				311	{
				312	register Py_UNICODE *u;
				313	register int i;
				314	u = PyUnicode_AS_UNICODE(unicode);
				315	for (i = size; i >= 0; i--)
				316	u++ = w++;
				317	}
				318	#endif
				319
				320	return (PyObject *)unicode;
				321	}
				322
				323	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				324	register wchar_t *w,
				325	int size)
				326	{
				327	if (unicode == NULL) {
				328	PyErr_BadInternalCall();
				329	return -1;
				330	}
				331	if (size > PyUnicode_GET_SIZE(unicode))
				332	size = PyUnicode_GET_SIZE(unicode);
				333	#ifdef HAVE_USABLE_WCHAR_T
				334	memcpy(w, unicode->str, size * sizeof(wchar_t));
				335	#else
				336	{
				337	register Py_UNICODE *u;
				338	register int i;
				339	u = PyUnicode_AS_UNICODE(unicode);
				340	for (i = size; i >= 0; i--)
				341	w++ = u++;
				342	}
				343	#endif
				344
				345	return size;
				346	}
				347
				348	#endif
				349
				350	PyObject PyUnicode_FromObject(register PyObject obj)
				351	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	352	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				353	}
				354
				355	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				356	const char *encoding,
				357	const char *errors)
				358	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	const char *s;
				360	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	361	int owned = 0;
				362	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	363
				364	if (obj == NULL) {
				365	PyErr_BadInternalCall();
				366	return NULL;
				367	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368
				369	/* Coerce object */
				370	if (PyInstance_Check(obj)) {
				371	PyObject *func;
				372	func = PyObject_GetAttrString(obj, "__str__");
				373	if (func == NULL) {
				374	PyErr_SetString(PyExc_TypeError,
				375	"coercing to Unicode: instance doesn't define __str__");
				376	return NULL;
				377	}
				378	obj = PyEval_CallObject(func, NULL);
				379	Py_DECREF(func);
				380	if (obj == NULL)
				381	return NULL;
				382	owned = 1;
				383	}
				384	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	385	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = obj;
				387	if (encoding) {
				388	PyErr_SetString(PyExc_TypeError,
				389	"decoding Unicode is not supported");
				390	return NULL;
				391	}
				392	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	393	}
				394	else if (PyString_Check(obj)) {
				395	s = PyString_AS_STRING(obj);
				396	len = PyString_GET_SIZE(obj);
				397	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	398	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				399	/* Overwrite the error message with something more useful in
				400	case of a TypeError. */
				401	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	402	PyErr_Format(PyExc_TypeError,
				403	"coercing to Unicode: need string or buffer, "
				404	"%.80s found",
				405	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	406	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	407	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	408
				409	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	410	if (len == 0) {
				411	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414	else
				415	v = PyUnicode_Decode(s, len, encoding, errors);
				416	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	417	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	return v;
				421
				422	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	}
				428
				429	PyObject PyUnicode_Decode(const char s,
				430	int size,
				431	const char *encoding,
				432	const char *errors)
				433	{
				434	PyObject buffer = NULL, unicode;
				435
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	436	if (encoding == NULL)
				437	encoding = PyUnicode_GetDefaultEncoding();
				438
				439	/* Shortcuts for common default encodings */
				440	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	441	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	else if (strcmp(encoding, "latin-1") == 0)
				443	return PyUnicode_DecodeLatin1(s, size, errors);
				444	else if (strcmp(encoding, "ascii") == 0)
				445	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	446
				447	/* Decode via the codec registry */
				448	buffer = PyBuffer_FromMemory((void *)s, size);
				449	if (buffer == NULL)
				450	goto onError;
				451	unicode = PyCodec_Decode(buffer, encoding, errors);
				452	if (unicode == NULL)
				453	goto onError;
				454	if (!PyUnicode_Check(unicode)) {
				455	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	456	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	457	unicode->ob_type->tp_name);
				458	Py_DECREF(unicode);
				459	goto onError;
				460	}
				461	Py_DECREF(buffer);
				462	return unicode;
				463
				464	onError:
				465	Py_XDECREF(buffer);
				466	return NULL;
				467	}
				468
				469	PyObject PyUnicode_Encode(const Py_UNICODE s,
				470	int size,
				471	const char *encoding,
				472	const char *errors)
				473	{
				474	PyObject v, unicode;
				475
				476	unicode = PyUnicode_FromUnicode(s, size);
				477	if (unicode == NULL)
				478	return NULL;
				479	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				480	Py_DECREF(unicode);
				481	return v;
				482	}
				483
				484	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				485	const char *encoding,
				486	const char *errors)
				487	{
				488	PyObject *v;
				489
				490	if (!PyUnicode_Check(unicode)) {
				491	PyErr_BadArgument();
				492	goto onError;
				493	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	494
				495	if (encoding == NULL)
				496	encoding = PyUnicode_GetDefaultEncoding();
				497
				498	/* Shortcuts for common default encodings */
				499	if (errors == NULL) {
				500	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	501	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	502	else if (strcmp(encoding, "latin-1") == 0)
				503	return PyUnicode_AsLatin1String(unicode);
				504	else if (strcmp(encoding, "ascii") == 0)
				505	return PyUnicode_AsASCIIString(unicode);
				506	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507
				508	/* Encode via the codec registry */
				509	v = PyCodec_Encode(unicode, encoding, errors);
				510	if (v == NULL)
				511	goto onError;
				512	/* XXX Should we really enforce this ? */
				513	if (!PyString_Check(v)) {
				514	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	515	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	516	v->ob_type->tp_name);
				517	Py_DECREF(v);
				518	goto onError;
				519	}
				520	return v;
				521
				522	onError:
				523	return NULL;
				524	}
				525
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	526	/* Return a Python string holding the default encoded value of the
				527	Unicode object.
				528
				529	The resulting string is cached in the Unicode object for subsequent
				530	usage by this function. The cached version is needed to implement
				531	the character buffer interface and will live (at least) as long as
				532	the Unicode object itself.
				533
				534	The refcount of the string is not incremented.
				535
				536	* Exported for internal use by the interpreter only !!! *
				537
				538	*/
				539
				540	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				541	const char *errors)
				542	{
				543	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				544
				545	if (v)
				546	return v;
				547	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				548	if (v && errors == NULL)
				549	((PyUnicodeObject *)unicode)->defenc = v;
				550	return v;
				551	}
				552
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	553	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				554	{
				555	if (!PyUnicode_Check(unicode)) {
				556	PyErr_BadArgument();
				557	goto onError;
				558	}
				559	return PyUnicode_AS_UNICODE(unicode);
				560
				561	onError:
				562	return NULL;
				563	}
				564
				565	int PyUnicode_GetSize(PyObject *unicode)
				566	{
				567	if (!PyUnicode_Check(unicode)) {
				568	PyErr_BadArgument();
				569	goto onError;
				570	}
				571	return PyUnicode_GET_SIZE(unicode);
				572
				573	onError:
				574	return -1;
				575	}
				576
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	577	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	578	{
				579	return unicode_default_encoding;
				580	}
				581
				582	int PyUnicode_SetDefaultEncoding(const char *encoding)
				583	{
				584	PyObject *v;
				585
				586	/* Make sure the encoding is valid. As side effect, this also
				587	loads the encoding into the codec registry cache. */
				588	v = _PyCodec_Lookup(encoding);
				589	if (v == NULL)
				590	goto onError;
				591	Py_DECREF(v);
				592	strncpy(unicode_default_encoding,
				593	encoding,
				594	sizeof(unicode_default_encoding));
				595	return 0;
				596
				597	onError:
				598	return -1;
				599	}
				600
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	601	/* --- UTF-8 Codec -------------------------------------------------------- */
				602
				603	static
				604	char utf8_code_length[256] = {
				605	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				606	illegal prefix. see RFC 2279 for details */
				607	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				608	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				609	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				610	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				611	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				612	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				616	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				617	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				618	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				619	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				620	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				621	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				622	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				623	};
				624
				625	static
				626	int utf8_decoding_error(const char **source,
				627	Py_UNICODE **dest,
				628	const char *errors,
				629	const char *details)
				630	{
				631	if ((errors == NULL) \|\|
				632	(strcmp(errors,"strict") == 0)) {
				633	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	634	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	635	details);
				636	return -1;
				637	}
				638	else if (strcmp(errors,"ignore") == 0) {
				639	(*source)++;
				640	return 0;
				641	}
				642	else if (strcmp(errors,"replace") == 0) {
				643	(*source)++;
				644	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				645	(*dest)++;
				646	return 0;
				647	}
				648	else {
				649	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	650	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	651	errors);
				652	return -1;
				653	}
				654	}
				655
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	656	PyObject PyUnicode_DecodeUTF8(const char s,
				657	int size,
				658	const char *errors)
				659	{
				660	int n;
				661	const char *e;
				662	PyUnicodeObject *unicode;
				663	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	665
				666	/* Note: size will always be longer than the resulting Unicode
				667	character count */
				668	unicode = _PyUnicode_New(size);
				669	if (!unicode)
				670	return NULL;
				671	if (size == 0)
				672	return (PyObject *)unicode;
				673
				674	/* Unpack UTF-8 encoded data */
				675	p = unicode->str;
				676	e = s + size;
				677
				678	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	679	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	680
				681	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	682	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	683	s++;
				684	continue;
				685	}
				686
				687	n = utf8_code_length[ch];
				688
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	689	if (s + n > e) {
				690	errmsg = "unexpected end of data";
				691	goto utf8Error;
				692	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693
				694	switch (n) {
				695
				696	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	errmsg = "unexpected code byte";
				698	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	break;
				700
				701	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	errmsg = "internal error";
				703	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	704	break;
				705
				706	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	if ((s[1] & 0xc0) != 0x80) {
				708	errmsg = "invalid data";
				709	goto utf8Error;
				710	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	711	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	712	if (ch < 0x80) {
				713	errmsg = "illegal encoding";
				714	goto utf8Error;
				715	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	716	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	717	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	718	break;
				719
				720	case 3:
				721	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	722	(s[2] & 0xc0) != 0x80) {
				723	errmsg = "invalid data";
				724	goto utf8Error;
				725	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	727	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				728	errmsg = "illegal encoding";
				729	goto utf8Error;
				730	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	731	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	732	*p++ = (Py_UNICODE)ch;
				733	break;
				734
				735	case 4:
				736	if ((s[1] & 0xc0) != 0x80 \|\|
				737	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	738	(s[3] & 0xc0) != 0x80) {
				739	errmsg = "invalid data";
				740	goto utf8Error;
				741	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	742	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				743	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				744	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	745	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				746	byte encoding */
				747	(ch > 0x10ffff)) { /* maximum value allowed for
				748	UTF-16 */
				749	errmsg = "illegal encoding";
				750	goto utf8Error;
				751	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	752	/* compute and append the two surrogates: */
				753
				754	/* translate from 10000..10FFFF to 0..FFFF */
				755	ch -= 0x10000;
				756
				757	/* high surrogate = top 10 bits added to D800 */
				758	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				759
				760	/* low surrogate = bottom 10 bits added to DC00 */
				761	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	762	break;
				763
				764	default:
				765	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	766	errmsg = "unsupported Unicode code range";
				767	goto utf8Error;
				768	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	769	}
				770	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	771	continue;
				772
				773	utf8Error:
				774	if (utf8_decoding_error(&s, &p, errors, errmsg))
				775	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	776	}
				777
				778	/* Adjust length */
				779	if (_PyUnicode_Resize(unicode, p - unicode->str))
				780	goto onError;
				781
				782	return (PyObject *)unicode;
				783
				784	onError:
				785	Py_DECREF(unicode);
				786	return NULL;
				787	}
				788
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors' policy to a UTF-8 encoding problem.

   errors == NULL or "strict": set UnicodeError built from details,
                               return -1.
   "ignore":                   return 0 without writing anything.
   "replace":                  store '?' at **dest, advance *dest,
                               return 0.
   anything else:              set ValueError, return -1.

   source is accepted for symmetry with the other handlers but is not
   read here. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        /* write the substitute and step past it */
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	822
				823	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				824	int size,
				825	const char *errors)
				826	{
				827	PyObject *v;
				828	char *p;
				829	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	830	Py_UCS4 ch2;
				831	unsigned int cbAllocated = 3 * size;
				832	unsigned int cbWritten = 0;
				833	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	834
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	835	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	836	if (v == NULL)
				837	return NULL;
				838	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	839	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
				841	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	842	while (i < size) {
				843	Py_UCS4 ch = s[i++];
				844	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	845	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	846	cbWritten++;
				847	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	848	else if (ch < 0x0800) {
				849	*p++ = 0xc0 \| (ch >> 6);
				850	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	851	cbWritten += 2;
				852	}
				853	else {
				854	/* Check for high surrogate */
				855	if (0xD800 <= ch && ch <= 0xDBFF) {
				856	if (i != size) {
				857	ch2 = s[i];
				858	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				859
				860	if (cbWritten >= (cbAllocated - 4)) {
				861	/* Provide enough room for some more
				862	surrogates */
				863	cbAllocated += 4*10;
				864	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	865	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	866	}
				867
				868	/* combine the two values */
				869	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				870
				871	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	872	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	873	i++;
				874	cbWritten += 4;
				875	}
				876	}
				877	}
				878	else {
				879	*p++ = (char)(0xe0 \| (ch >> 12));
				880	cbWritten += 3;
				881	}
				882	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				883	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	884	}
				885	}
				886	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	887	if (_PyString_Resize(&v, p - q))
				888	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	889	return v;
				890
				891	onError:
				892	Py_DECREF(v);
				893	return NULL;
				894	}
				895
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	896	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				897	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	898	if (!PyUnicode_Check(unicode)) {
				899	PyErr_BadArgument();
				900	return NULL;
				901	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	902	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				903	PyUnicode_GET_SIZE(unicode),
				904	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	905	}
				906
				907	/* --- UTF-16 Codec ------------------------------------------------------- */
				908
				909	static
				910	int utf16_decoding_error(const Py_UNICODE **source,
				911	Py_UNICODE **dest,
				912	const char *errors,
				913	const char *details)
				914	{
				915	if ((errors == NULL) \|\|
				916	(strcmp(errors,"strict") == 0)) {
				917	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	918	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	919	details);
				920	return -1;
				921	}
				922	else if (strcmp(errors,"ignore") == 0) {
				923	return 0;
				924	}
				925	else if (strcmp(errors,"replace") == 0) {
				926	if (dest) {
				927	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				928	(*dest)++;
				929	}
				930	return 0;
				931	}
				932	else {
				933	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	934	"UTF-16 decoding error; "
				935	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	errors);
				937	return -1;
				938	}
				939	}
				940
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	941	PyObject PyUnicode_DecodeUTF16(const char s,
				942	int size,
				943	const char *errors,
				944	int *byteorder)
				945	{
				946	PyUnicodeObject *unicode;
				947	Py_UNICODE *p;
				948	const Py_UNICODE q, e;
				949	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	950	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951
				952	/* size should be an even number */
				953	if (size % sizeof(Py_UNICODE) != 0) {
				954	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				955	return NULL;
				956	/* The remaining input chars are ignored if we fall through
				957	here... */
				958	}
				959
				960	/* Note: size will always be longer than the resulting Unicode
				961	character count */
				962	unicode = _PyUnicode_New(size);
				963	if (!unicode)
				964	return NULL;
				965	if (size == 0)
				966	return (PyObject *)unicode;
				967
				968	/* Unpack UTF-16 encoded data */
				969	p = unicode->str;
				970	q = (Py_UNICODE *)s;
				971	e = q + (size / sizeof(Py_UNICODE));
				972
				973	if (byteorder)
				974	bo = *byteorder;
				975
				976	while (q < e) {
				977	register Py_UNICODE ch = *q++;
				978
				979	/* Check for BOM marks (U+FEFF) in the input and adjust
				980	current byte order setting accordingly. Swap input
				981	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				982	!) */
				983	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				984	if (ch == 0xFEFF) {
				985	bo = -1;
				986	continue;
				987	} else if (ch == 0xFFFE) {
				988	bo = 1;
				989	continue;
				990	}
				991	if (bo == 1)
				992	ch = (ch >> 8) \| (ch << 8);
				993	#else
				994	if (ch == 0xFEFF) {
				995	bo = 1;
				996	continue;
				997	} else if (ch == 0xFFFE) {
				998	bo = -1;
				999	continue;
				1000	}
				1001	if (bo == -1)
				1002	ch = (ch >> 8) \| (ch << 8);
				1003	#endif
				1004	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1005	*p++ = ch;
				1006	continue;
				1007	}
				1008
				1009	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1010	if (q >= e) {
				1011	errmsg = "unexpected end of data";
				1012	goto utf16Error;
				1013	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1014	if (0xDC00 <= q && q <= 0xDFFF) {
				1015	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1016	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1017	/* This is valid data (a UTF-16 surrogate pair), but
				1018	we are not able to store this information since our
				1019	Py_UNICODE type only has 16 bits... this might
				1020	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1021	errmsg = "code pairs are not supported";
				1022	goto utf16Error;
				1023	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1024	else
				1025	continue;
				1026	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1027	errmsg = "illegal encoding";
				1028	/* Fall through to report the error */
				1029
				1030	utf16Error:
				1031	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1032	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1033	}
				1034
				1035	if (byteorder)
				1036	*byteorder = bo;
				1037
				1038	/* Adjust length */
				1039	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1040	goto onError;
				1041
				1042	return (PyObject *)unicode;
				1043
				1044	onError:
				1045	Py_DECREF(unicode);
				1046	return NULL;
				1047	}
				1048
				1049	#undef UTF16_ERROR
				1050
				1051	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1052	int size,
				1053	const char *errors,
				1054	int byteorder)
				1055	{
				1056	PyObject *v;
				1057	Py_UNICODE *p;
				1058	char *q;
				1059
				1060	/* We don't create UTF-16 pairs... */
				1061	v = PyString_FromStringAndSize(NULL,
				1062	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1063	if (v == NULL)
				1064	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1065
				1066	q = PyString_AS_STRING(v);
				1067	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1068	if (byteorder == 0)
				1069	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1070	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1071	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1072	if (byteorder == 0 \|\|
				1073	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1074	byteorder == -1
				1075	#else
				1076	byteorder == 1
				1077	#endif
				1078	)
				1079	memcpy(p, s, size * sizeof(Py_UNICODE));
				1080	else
				1081	while (size-- > 0) {
				1082	Py_UNICODE ch = *s++;
				1083	*p++ = (ch >> 8) \| (ch << 8);
				1084	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1085	return v;
				1086	}
				1087
				1088	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1089	{
				1090	if (!PyUnicode_Check(unicode)) {
				1091	PyErr_BadArgument();
				1092	return NULL;
				1093	}
				1094	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1095	PyUnicode_GET_SIZE(unicode),
				1096	NULL,
				1097	0);
				1098	}
				1099
				1100	/* --- Unicode Escape Codec ----------------------------------------------- */
				1101
				1102	static
				1103	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1104	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1105	const char *errors,
				1106	const char *details)
				1107	{
				1108	if ((errors == NULL) \|\|
				1109	(strcmp(errors,"strict") == 0)) {
				1110	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1111	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1112	details);
				1113	return -1;
				1114	}
				1115	else if (strcmp(errors,"ignore") == 0) {
				1116	return 0;
				1117	}
				1118	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1119	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1120	return 0;
				1121	}
				1122	else {
				1123	PyErr_Format(PyExc_ValueError,
				1124	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1125	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1126	errors);
				1127	return -1;
				1128	}
				1129	}
				1130
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1131	static _Py_UCNHashAPI *pucnHash = NULL;
				1132
				1133	static
				1134	int mystrnicmp(const char s1, const char s2, size_t count)
				1135	{
				1136	char c1, c2;
				1137
				1138	if (count)
				1139	{
				1140	do
				1141	{
				1142	c1 = tolower(*(s1++));
				1143	c2 = tolower(*(s2++));
				1144	}
				1145	while(--count && c1 == c2);
				1146
				1147	return c1 - c2;
				1148	}
				1149
				1150	return 0;
				1151	}
				1152
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1153	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1154	int size,
				1155	const char *errors)
				1156	{
				1157	PyUnicodeObject *v;
				1158	Py_UNICODE p = NULL, buf = NULL;
				1159	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1160	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161
				1162	/* Escaped strings will always be longer than the resulting
				1163	Unicode string, so we start with size here and then reduce the
				1164	length after conversion to the true value. */
				1165	v = _PyUnicode_New(size);
				1166	if (v == NULL)
				1167	goto onError;
				1168	if (size == 0)
				1169	return (PyObject *)v;
				1170	p = buf = PyUnicode_AS_UNICODE(v);
				1171	end = s + size;
				1172	while (s < end) {
				1173	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1174	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	int i;
				1176
				1177	/* Non-escape characters are interpreted as Unicode ordinals */
				1178	if (*s != '\\') {
				1179	p++ = (unsigned char)s++;
				1180	continue;
				1181	}
				1182
				1183	/* \ - Escapes */
				1184	s++;
				1185	switch (*s++) {
				1186
				1187	/* \x escapes */
				1188	case '\n': break;
				1189	case '\\': *p++ = '\\'; break;
				1190	case '\'': *p++ = '\''; break;
				1191	case '\"': *p++ = '\"'; break;
				1192	case 'b': *p++ = '\b'; break;
				1193	case 'f': p++ = '\014'; break; / FF */
				1194	case 't': *p++ = '\t'; break;
				1195	case 'n': *p++ = '\n'; break;
				1196	case 'r': *p++ = '\r'; break;
				1197	case 'v': p++ = '\013'; break; / VT */
				1198	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1199
				1200	/* \OOO (octal) escapes */
				1201	case '0': case '1': case '2': case '3':
				1202	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1203	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1204	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1205	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1206	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1207	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1208	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1209	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	break;
				1211
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1212	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1214	for (x = 0, i = 0; i < 2; i++) {
				1215	c = (unsigned char)s[i];
				1216	if (!isxdigit(c)) {
				1217	if (unicodeescape_decoding_error(&s, &x, errors,
				1218	"truncated \\xXX"))
				1219	goto onError;
				1220	i++;
				1221	break;
				1222	}
				1223	x = (x<<4) & ~0xF;
				1224	if (c >= '0' && c <= '9')
				1225	x += c - '0';
				1226	else if (c >= 'a' && c <= 'f')
				1227	x += 10 + c - 'a';
				1228	else
				1229	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1230	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1231	s += i;
				1232	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1233	break;
				1234
				1235	/* \uXXXX with 4 hex digits */
				1236	case 'u':
				1237	for (x = 0, i = 0; i < 4; i++) {
				1238	c = (unsigned char)s[i];
				1239	if (!isxdigit(c)) {
				1240	if (unicodeescape_decoding_error(&s, &x, errors,
				1241	"truncated \\uXXXX"))
				1242	goto onError;
				1243	i++;
				1244	break;
				1245	}
				1246	x = (x<<4) & ~0xF;
				1247	if (c >= '0' && c <= '9')
				1248	x += c - '0';
				1249	else if (c >= 'a' && c <= 'f')
				1250	x += 10 + c - 'a';
				1251	else
				1252	x += 10 + c - 'A';
				1253	}
				1254	s += i;
				1255	*p++ = x;
				1256	break;
				1257
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1258	/* \UXXXXXXXX with 8 hex digits */
				1259	case 'U':
				1260	for (chr = 0, i = 0; i < 8; i++) {
				1261	c = (unsigned char)s[i];
				1262	if (!isxdigit(c)) {
				1263	if (unicodeescape_decoding_error(&s, &x, errors,
				1264	"truncated \\uXXXX"))
				1265	goto onError;
				1266	i++;
				1267	break;
				1268	}
				1269	chr = (chr<<4) & ~0xF;
				1270	if (c >= '0' && c <= '9')
				1271	chr += c - '0';
				1272	else if (c >= 'a' && c <= 'f')
				1273	chr += 10 + c - 'a';
				1274	else
				1275	chr += 10 + c - 'A';
				1276	}
				1277	s += i;
				1278	goto store;
				1279
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1280	case 'N':
				1281	/* Ok, we need to deal with Unicode Character Names now,
				1282	* make sure we've imported the hash table data...
				1283	*/
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1284	if (pucnHash == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1285	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1286	mod = PyImport_ImportModule("ucnhash");
				1287	if (mod == NULL)
				1288	goto onError;
				1289	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1290	Py_DECREF(mod);
				1291	if (v == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1292	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1293	pucnHash = PyCObject_AsVoidPtr(v);
				1294	Py_DECREF(v);
				1295	if (pucnHash == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1296	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1297	}
				1298
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1299	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1300	const char *start = s + 1;
				1301	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1302	unsigned long j;
				1303
				1304	/* look for either the closing brace, or we
				1305	* exceed the maximum length of the unicode character names
				1306	*/
				1307	while (*endBrace != '}' &&
				1308	(unsigned int)(endBrace - start) <=
				1309	pucnHash->cchMax &&
				1310	endBrace < end)
				1311	{
				1312	endBrace++;
				1313	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1314	if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1315	j = pucnHash->hash(start, endBrace - start);
				1316	if (j > pucnHash->cKeys \|\|
				1317	mystrnicmp(
				1318	start,
				1319	((_Py_UnicodeCharacterName *)
				1320	(pucnHash->getValue(j)))->pszUCN,
				1321	(int)(endBrace - start)) != 0)
				1322	{
				1323	if (unicodeescape_decoding_error(
				1324	&s, &x, errors,
				1325	"Invalid Unicode Character Name"))
				1326	{
				1327	goto onError;
				1328	}
				1329	goto ucnFallthrough;
				1330	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1331	chr = ((_Py_UnicodeCharacterName *)
				1332	(pucnHash->getValue(j)))->value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1333	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1334	goto store;
				1335	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1336	if (unicodeescape_decoding_error(
				1337	&s, &x, errors,
				1338	"Unicode name missing closing brace"))
				1339	goto onError;
				1340	goto ucnFallthrough;
				1341	}
				1342	break;
				1343	}
				1344	if (unicodeescape_decoding_error(
				1345	&s, &x, errors,
				1346	"Missing opening brace for Unicode Character Name escape"))
				1347	goto onError;
				1348	ucnFallthrough:
				1349	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1350	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1351	*p++ = '\\';
				1352	*p++ = (unsigned char)s[-1];
				1353	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1354	store:
				1355	/* when we get here, chr is a 32-bit unicode character */
				1356	if (chr <= 0xffff)
				1357	/* UCS-2 character */
				1358	*p++ = (Py_UNICODE) chr;
				1359	else if (chr <= 0x10ffff) {
				1360	/* UCS-4 character. store as two surrogate characters */
				1361	chr -= 0x10000L;
				1362	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1363	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1364	} else {
				1365	if (unicodeescape_decoding_error(
				1366	&s, &x, errors,
				1367	"Illegal Unicode character")
				1368	)
				1369	goto onError;
				1370	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1371	}
				1372	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1373	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1374	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1375	return (PyObject *)v;
				1376
				1377	onError:
				1378	Py_XDECREF(v);
				1379	return NULL;
				1380	}
				1381
				1382	/* Return a Unicode-Escape string version of the Unicode object.
				1383
				1384	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1385	appropriate.
				1386
				1387	*/
				1388
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1389	static const Py_UNICODE findchar(const Py_UNICODE s,
				1390	int size,
				1391	Py_UNICODE ch);
				1392
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1393	static
				1394	PyObject unicodeescape_string(const Py_UNICODE s,
				1395	int size,
				1396	int quotes)
				1397	{
				1398	PyObject *repr;
				1399	char *p;
				1400	char *q;
				1401
				1402	static const char *hexdigit = "0123456789ABCDEF";
				1403
				1404	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1405	if (repr == NULL)
				1406	return NULL;
				1407
				1408	p = q = PyString_AS_STRING(repr);
				1409
				1410	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1411	*p++ = 'u';
				1412	*p++ = (findchar(s, size, '\'') &&
				1413	!findchar(s, size, '"')) ? '"' : '\'';
				1414	}
				1415	while (size-- > 0) {
				1416	Py_UNICODE ch = *s++;
				1417	/* Escape quotes */
				1418	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1419	*p++ = '\\';
				1420	*p++ = (char) ch;
				1421	}
				1422	/* Map 16-bit characters to '\uxxxx' */
				1423	else if (ch >= 256) {
				1424	*p++ = '\\';
				1425	*p++ = 'u';
				1426	*p++ = hexdigit[(ch >> 12) & 0xf];
				1427	*p++ = hexdigit[(ch >> 8) & 0xf];
				1428	*p++ = hexdigit[(ch >> 4) & 0xf];
				1429	*p++ = hexdigit[ch & 15];
				1430	}
				1431	/* Map non-printable US ASCII to '\ooo' */
				1432	else if (ch < ' ' \|\| ch >= 128) {
				1433	*p++ = '\\';
				1434	*p++ = hexdigit[(ch >> 6) & 7];
				1435	*p++ = hexdigit[(ch >> 3) & 7];
				1436	*p++ = hexdigit[ch & 7];
				1437	}
				1438	/* Copy everything else as-is */
				1439	else
				1440	*p++ = (char) ch;
				1441	}
				1442	if (quotes)
				1443	*p++ = q[1];
				1444
				1445	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1446	if (_PyString_Resize(&repr, p - q))
				1447	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1448
				1449	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1450
				1451	onError:
				1452	Py_DECREF(repr);
				1453	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1454	}
				1455
				1456	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1457	int size)
				1458	{
				1459	return unicodeescape_string(s, size, 0);
				1460	}
				1461
				1462	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1463	{
				1464	if (!PyUnicode_Check(unicode)) {
				1465	PyErr_BadArgument();
				1466	return NULL;
				1467	}
				1468	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1469	PyUnicode_GET_SIZE(unicode));
				1470	}
				1471
				1472	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1473
				1474	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1475	int size,
				1476	const char *errors)
				1477	{
				1478	PyUnicodeObject *v;
				1479	Py_UNICODE p, buf;
				1480	const char *end;
				1481	const char *bs;
				1482
				1483	/* Escaped strings will always be longer than the resulting
				1484	Unicode string, so we start with size here and then reduce the
				1485	length after conversion to the true value. */
				1486	v = _PyUnicode_New(size);
				1487	if (v == NULL)
				1488	goto onError;
				1489	if (size == 0)
				1490	return (PyObject *)v;
				1491	p = buf = PyUnicode_AS_UNICODE(v);
				1492	end = s + size;
				1493	while (s < end) {
				1494	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1495	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1496	int i;
				1497
				1498	/* Non-escape characters are interpreted as Unicode ordinals */
				1499	if (*s != '\\') {
				1500	p++ = (unsigned char)s++;
				1501	continue;
				1502	}
				1503
				1504	/* \u-escapes are only interpreted iff the number of leading
				1505	backslashes if odd */
				1506	bs = s;
				1507	for (;s < end;) {
				1508	if (*s != '\\')
				1509	break;
				1510	p++ = (unsigned char)s++;
				1511	}
				1512	if (((s - bs) & 1) == 0 \|\|
				1513	s >= end \|\|
				1514	*s != 'u') {
				1515	continue;
				1516	}
				1517	p--;
				1518	s++;
				1519
				1520	/* \uXXXX with 4 hex digits */
				1521	for (x = 0, i = 0; i < 4; i++) {
				1522	c = (unsigned char)s[i];
				1523	if (!isxdigit(c)) {
				1524	if (unicodeescape_decoding_error(&s, &x, errors,
				1525	"truncated \\uXXXX"))
				1526	goto onError;
				1527	i++;
				1528	break;
				1529	}
				1530	x = (x<<4) & ~0xF;
				1531	if (c >= '0' && c <= '9')
				1532	x += c - '0';
				1533	else if (c >= 'a' && c <= 'f')
				1534	x += 10 + c - 'a';
				1535	else
				1536	x += 10 + c - 'A';
				1537	}
				1538	s += i;
				1539	*p++ = x;
				1540	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1541	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1542	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1543	return (PyObject *)v;
				1544
				1545	onError:
				1546	Py_XDECREF(v);
				1547	return NULL;
				1548	}
				1549
				1550	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1551	int size)
				1552	{
				1553	PyObject *repr;
				1554	char *p;
				1555	char *q;
				1556
				1557	static const char *hexdigit = "0123456789ABCDEF";
				1558
				1559	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1560	if (repr == NULL)
				1561	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1562	if (size == 0)
				1563	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1564
				1565	p = q = PyString_AS_STRING(repr);
				1566	while (size-- > 0) {
				1567	Py_UNICODE ch = *s++;
				1568	/* Map 16-bit characters to '\uxxxx' */
				1569	if (ch >= 256) {
				1570	*p++ = '\\';
				1571	*p++ = 'u';
				1572	*p++ = hexdigit[(ch >> 12) & 0xf];
				1573	*p++ = hexdigit[(ch >> 8) & 0xf];
				1574	*p++ = hexdigit[(ch >> 4) & 0xf];
				1575	*p++ = hexdigit[ch & 15];
				1576	}
				1577	/* Copy everything else as-is */
				1578	else
				1579	*p++ = (char) ch;
				1580	}
				1581	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1582	if (_PyString_Resize(&repr, p - q))
				1583	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1584
				1585	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1586
				1587	onError:
				1588	Py_DECREF(repr);
				1589	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	}
				1591
				1592	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1593	{
				1594	if (!PyUnicode_Check(unicode)) {
				1595	PyErr_BadArgument();
				1596	return NULL;
				1597	}
				1598	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1599	PyUnicode_GET_SIZE(unicode));
				1600	}
				1601
				1602	/* --- Latin-1 Codec ------------------------------------------------------ */
				1603
				1604	PyObject PyUnicode_DecodeLatin1(const char s,
				1605	int size,
				1606	const char *errors)
				1607	{
				1608	PyUnicodeObject *v;
				1609	Py_UNICODE *p;
				1610
				1611	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1612	v = _PyUnicode_New(size);
				1613	if (v == NULL)
				1614	goto onError;
				1615	if (size == 0)
				1616	return (PyObject *)v;
				1617	p = PyUnicode_AS_UNICODE(v);
				1618	while (size-- > 0)
				1619	p++ = (unsigned char)s++;
				1620	return (PyObject *)v;
				1621
				1622	onError:
				1623	Py_XDECREF(v);
				1624	return NULL;
				1625	}
				1626
				1627	static
				1628	int latin1_encoding_error(const Py_UNICODE **source,
				1629	char **dest,
				1630	const char *errors,
				1631	const char *details)
				1632	{
				1633	if ((errors == NULL) \|\|
				1634	(strcmp(errors,"strict") == 0)) {
				1635	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1636	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1637	details);
				1638	return -1;
				1639	}
				1640	else if (strcmp(errors,"ignore") == 0) {
				1641	return 0;
				1642	}
				1643	else if (strcmp(errors,"replace") == 0) {
				1644	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1645	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1646	return 0;
				1647	}
				1648	else {
				1649	PyErr_Format(PyExc_ValueError,
				1650	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1651	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1652	errors);
				1653	return -1;
				1654	}
				1655	}
				1656
				1657	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1658	int size,
				1659	const char *errors)
				1660	{
				1661	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1662	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1663
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1664	repr = PyString_FromStringAndSize(NULL, size);
				1665	if (repr == NULL)
				1666	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1667	if (size == 0)
				1668	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1669
				1670	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1671	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1672	while (size-- > 0) {
				1673	Py_UNICODE ch = *p++;
				1674	if (ch >= 256) {
				1675	if (latin1_encoding_error(&p, &s, errors,
				1676	"ordinal not in range(256)"))
				1677	goto onError;
				1678	}
				1679	else
				1680	*s++ = (char)ch;
				1681	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1682	/* Resize if error handling skipped some characters */
				1683	if (s - start < PyString_GET_SIZE(repr))
				1684	if (_PyString_Resize(&repr, s - start))
				1685	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1686	return repr;
				1687
				1688	onError:
				1689	Py_DECREF(repr);
				1690	return NULL;
				1691	}
				1692
				1693	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1694	{
				1695	if (!PyUnicode_Check(unicode)) {
				1696	PyErr_BadArgument();
				1697	return NULL;
				1698	}
				1699	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1700	PyUnicode_GET_SIZE(unicode),
				1701	NULL);
				1702	}
				1703
				1704	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1705
				1706	static
				1707	int ascii_decoding_error(const char **source,
				1708	Py_UNICODE **dest,
				1709	const char *errors,
				1710	const char *details)
				1711	{
				1712	if ((errors == NULL) \|\|
				1713	(strcmp(errors,"strict") == 0)) {
				1714	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1715	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1716	details);
				1717	return -1;
				1718	}
				1719	else if (strcmp(errors,"ignore") == 0) {
				1720	return 0;
				1721	}
				1722	else if (strcmp(errors,"replace") == 0) {
				1723	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1724	(*dest)++;
				1725	return 0;
				1726	}
				1727	else {
				1728	PyErr_Format(PyExc_ValueError,
				1729	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1730	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	errors);
				1732	return -1;
				1733	}
				1734	}
				1735
				1736	PyObject PyUnicode_DecodeASCII(const char s,
				1737	int size,
				1738	const char *errors)
				1739	{
				1740	PyUnicodeObject *v;
				1741	Py_UNICODE *p;
				1742
				1743	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1744	v = _PyUnicode_New(size);
				1745	if (v == NULL)
				1746	goto onError;
				1747	if (size == 0)
				1748	return (PyObject *)v;
				1749	p = PyUnicode_AS_UNICODE(v);
				1750	while (size-- > 0) {
				1751	register unsigned char c;
				1752
				1753	c = (unsigned char)*s++;
				1754	if (c < 128)
				1755	*p++ = c;
				1756	else if (ascii_decoding_error(&s, &p, errors,
				1757	"ordinal not in range(128)"))
				1758	goto onError;
				1759	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1760	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1761	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1762	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	return (PyObject *)v;
				1764
				1765	onError:
				1766	Py_XDECREF(v);
				1767	return NULL;
				1768	}
				1769
				1770	static
				1771	int ascii_encoding_error(const Py_UNICODE **source,
				1772	char **dest,
				1773	const char *errors,
				1774	const char *details)
				1775	{
				1776	if ((errors == NULL) \|\|
				1777	(strcmp(errors,"strict") == 0)) {
				1778	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1779	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1780	details);
				1781	return -1;
				1782	}
				1783	else if (strcmp(errors,"ignore") == 0) {
				1784	return 0;
				1785	}
				1786	else if (strcmp(errors,"replace") == 0) {
				1787	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1788	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1789	return 0;
				1790	}
				1791	else {
				1792	PyErr_Format(PyExc_ValueError,
				1793	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1794	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	errors);
				1796	return -1;
				1797	}
				1798	}
				1799
				1800	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1801	int size,
				1802	const char *errors)
				1803	{
				1804	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1805	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1806
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1807	repr = PyString_FromStringAndSize(NULL, size);
				1808	if (repr == NULL)
				1809	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1810	if (size == 0)
				1811	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1812
				1813	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1814	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1815	while (size-- > 0) {
				1816	Py_UNICODE ch = *p++;
				1817	if (ch >= 128) {
				1818	if (ascii_encoding_error(&p, &s, errors,
				1819	"ordinal not in range(128)"))
				1820	goto onError;
				1821	}
				1822	else
				1823	*s++ = (char)ch;
				1824	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1825	/* Resize if error handling skipped some characters */
				1826	if (s - start < PyString_GET_SIZE(repr))
				1827	if (_PyString_Resize(&repr, s - start))
				1828	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1829	return repr;
				1830
				1831	onError:
				1832	Py_DECREF(repr);
				1833	return NULL;
				1834	}
				1835
				1836	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1837	{
				1838	if (!PyUnicode_Check(unicode)) {
				1839	PyErr_BadArgument();
				1840	return NULL;
				1841	}
				1842	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1843	PyUnicode_GET_SIZE(unicode),
				1844	NULL);
				1845	}
				1846
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1847	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1848
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1849	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1850
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1851	PyObject PyUnicode_DecodeMBCS(const char s,
				1852	int size,
				1853	const char *errors)
				1854	{
				1855	PyUnicodeObject *v;
				1856	Py_UNICODE *p;
				1857
				1858	/* First get the size of the result */
				1859	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1860	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1861	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1862
				1863	v = _PyUnicode_New(usize);
				1864	if (v == NULL)
				1865	return NULL;
				1866	if (usize == 0)
				1867	return (PyObject *)v;
				1868	p = PyUnicode_AS_UNICODE(v);
				1869	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1870	Py_DECREF(v);
				1871	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1872	}
				1873
				1874	return (PyObject *)v;
				1875	}
				1876
				1877	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1878	int size,
				1879	const char *errors)
				1880	{
				1881	PyObject *repr;
				1882	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1883	DWORD mbcssize;
				1884
				1885	/* If there are no characters, bail now! */
				1886	if (size==0)
				1887	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1888
				1889	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1890	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1891	if (mbcssize==0)
				1892	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1893
				1894	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1895	if (repr == NULL)
				1896	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1897	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1898	return repr;
				1899
				1900	/* Do the conversion */
				1901	s = PyString_AS_STRING(repr);
				1902	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1903	Py_DECREF(repr);
				1904	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1905	}
				1906	return repr;
				1907	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1908
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1909	#endif /* MS_WIN32 */
				1910
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1911	/* --- Character Mapping Codec -------------------------------------------- */
				1912
				1913	static
				1914	int charmap_decoding_error(const char **source,
				1915	Py_UNICODE **dest,
				1916	const char *errors,
				1917	const char *details)
				1918	{
				1919	if ((errors == NULL) \|\|
				1920	(strcmp(errors,"strict") == 0)) {
				1921	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1922	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1923	details);
				1924	return -1;
				1925	}
				1926	else if (strcmp(errors,"ignore") == 0) {
				1927	return 0;
				1928	}
				1929	else if (strcmp(errors,"replace") == 0) {
				1930	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1931	(*dest)++;
				1932	return 0;
				1933	}
				1934	else {
				1935	PyErr_Format(PyExc_ValueError,
				1936	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1937	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1938	errors);
				1939	return -1;
				1940	}
				1941	}
				1942
				1943	PyObject PyUnicode_DecodeCharmap(const char s,
				1944	int size,
				1945	PyObject *mapping,
				1946	const char *errors)
				1947	{
				1948	PyUnicodeObject *v;
				1949	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	1950	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1951
				1952	/* Default to Latin-1 */
				1953	if (mapping == NULL)
				1954	return PyUnicode_DecodeLatin1(s, size, errors);
				1955
				1956	v = _PyUnicode_New(size);
				1957	if (v == NULL)
				1958	goto onError;
				1959	if (size == 0)
				1960	return (PyObject *)v;
				1961	p = PyUnicode_AS_UNICODE(v);
				1962	while (size-- > 0) {
				1963	unsigned char ch = *s++;
				1964	PyObject w, x;
				1965
				1966	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1967	w = PyInt_FromLong((long)ch);
				1968	if (w == NULL)
				1969	goto onError;
				1970	x = PyObject_GetItem(mapping, w);
				1971	Py_DECREF(w);
				1972	if (x == NULL) {
				1973	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1974	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1975	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	1976	x = Py_None;
				1977	Py_INCREF(x);
				1978	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	1979	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1980	}
				1981
				1982	/* Apply mapping */
				1983	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1984	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1985	if (value < 0 \|\| value > 65535) {
				1986	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1987	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1988	Py_DECREF(x);
				1989	goto onError;
				1990	}
				1991	*p++ = (Py_UNICODE)value;
				1992	}
				1993	else if (x == Py_None) {
				1994	/* undefined mapping */
				1995	if (charmap_decoding_error(&s, &p, errors,
				1996	"character maps to <undefined>")) {
				1997	Py_DECREF(x);
				1998	goto onError;
				1999	}
				2000	}
				2001	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2002	int targetsize = PyUnicode_GET_SIZE(x);
				2003
				2004	if (targetsize == 1)
				2005	/* 1-1 mapping */
				2006	p++ = PyUnicode_AS_UNICODE(x);
				2007
				2008	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2009	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2010	if (targetsize > extrachars) {
				2011	/* resize first */
				2012	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2013	int needed = (targetsize - extrachars) + \
				2014	(targetsize << 2);
				2015	extrachars += needed;
				2016	if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2017	Py_DECREF(x);
				2018	goto onError;
				2019	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2020	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2021	}
				2022	Py_UNICODE_COPY(p,
				2023	PyUnicode_AS_UNICODE(x),
				2024	targetsize);
				2025	p += targetsize;
				2026	extrachars -= targetsize;
				2027	}
				2028	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2029	}
				2030	else {
				2031	/* wrong return value */
				2032	PyErr_SetString(PyExc_TypeError,
				2033	"character mapping must return integer, None or unicode");
				2034	Py_DECREF(x);
				2035	goto onError;
				2036	}
				2037	Py_DECREF(x);
				2038	}
				2039	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2040	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2041	goto onError;
				2042	return (PyObject *)v;
				2043
				2044	onError:
				2045	Py_XDECREF(v);
				2046	return NULL;
				2047	}
				2048
				2049	static
				2050	int charmap_encoding_error(const Py_UNICODE **source,
				2051	char **dest,
				2052	const char *errors,
				2053	const char *details)
				2054	{
				2055	if ((errors == NULL) \|\|
				2056	(strcmp(errors,"strict") == 0)) {
				2057	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2058	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2059	details);
				2060	return -1;
				2061	}
				2062	else if (strcmp(errors,"ignore") == 0) {
				2063	return 0;
				2064	}
				2065	else if (strcmp(errors,"replace") == 0) {
				2066	**dest = '?';
				2067	(*dest)++;
				2068	return 0;
				2069	}
				2070	else {
				2071	PyErr_Format(PyExc_ValueError,
				2072	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2073	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2074	errors);
				2075	return -1;
				2076	}
				2077	}
				2078
				2079	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2080	int size,
				2081	PyObject *mapping,
				2082	const char *errors)
				2083	{
				2084	PyObject *v;
				2085	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2086	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2087
				2088	/* Default to Latin-1 */
				2089	if (mapping == NULL)
				2090	return PyUnicode_EncodeLatin1(p, size, errors);
				2091
				2092	v = PyString_FromStringAndSize(NULL, size);
				2093	if (v == NULL)
				2094	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2095	if (size == 0)
				2096	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2097	s = PyString_AS_STRING(v);
				2098	while (size-- > 0) {
				2099	Py_UNICODE ch = *p++;
				2100	PyObject w, x;
				2101
				2102	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2103	w = PyInt_FromLong((long)ch);
				2104	if (w == NULL)
				2105	goto onError;
				2106	x = PyObject_GetItem(mapping, w);
				2107	Py_DECREF(w);
				2108	if (x == NULL) {
				2109	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2110	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2111	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2112	x = Py_None;
				2113	Py_INCREF(x);
				2114	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2115	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2116	}
				2117
				2118	/* Apply mapping */
				2119	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2120	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2121	if (value < 0 \|\| value > 255) {
				2122	PyErr_SetString(PyExc_TypeError,
				2123	"character mapping must be in range(256)");
				2124	Py_DECREF(x);
				2125	goto onError;
				2126	}
				2127	*s++ = (char)value;
				2128	}
				2129	else if (x == Py_None) {
				2130	/* undefined mapping */
				2131	if (charmap_encoding_error(&p, &s, errors,
				2132	"character maps to <undefined>")) {
				2133	Py_DECREF(x);
				2134	goto onError;
				2135	}
				2136	}
				2137	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2138	int targetsize = PyString_GET_SIZE(x);
				2139
				2140	if (targetsize == 1)
				2141	/* 1-1 mapping */
				2142	s++ = PyString_AS_STRING(x);
				2143
				2144	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2145	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2146	if (targetsize > extrachars) {
				2147	/* resize first */
				2148	int oldpos = (int)(s - PyString_AS_STRING(v));
				2149	int needed = (targetsize - extrachars) + \
				2150	(targetsize << 2);
				2151	extrachars += needed;
				2152	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2153	Py_DECREF(x);
				2154	goto onError;
				2155	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2156	s = PyString_AS_STRING(v) + oldpos;
				2157	}
				2158	memcpy(s,
				2159	PyString_AS_STRING(x),
				2160	targetsize);
				2161	s += targetsize;
				2162	extrachars -= targetsize;
				2163	}
				2164	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2165	}
				2166	else {
				2167	/* wrong return value */
				2168	PyErr_SetString(PyExc_TypeError,
				2169	"character mapping must return integer, None or unicode");
				2170	Py_DECREF(x);
				2171	goto onError;
				2172	}
				2173	Py_DECREF(x);
				2174	}
				2175	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2176	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2177	goto onError;
				2178	return v;
				2179
				2180	onError:
				2181	Py_DECREF(v);
				2182	return NULL;
				2183	}
				2184
				2185	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2186	PyObject *mapping)
				2187	{
				2188	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2189	PyErr_BadArgument();
				2190	return NULL;
				2191	}
				2192	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2193	PyUnicode_GET_SIZE(unicode),
				2194	mapping,
				2195	NULL);
				2196	}
				2197
				2198	static
				2199	int translate_error(const Py_UNICODE **source,
				2200	Py_UNICODE **dest,
				2201	const char *errors,
				2202	const char *details)
				2203	{
				2204	if ((errors == NULL) \|\|
				2205	(strcmp(errors,"strict") == 0)) {
				2206	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2207	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2208	details);
				2209	return -1;
				2210	}
				2211	else if (strcmp(errors,"ignore") == 0) {
				2212	return 0;
				2213	}
				2214	else if (strcmp(errors,"replace") == 0) {
				2215	**dest = '?';
				2216	(*dest)++;
				2217	return 0;
				2218	}
				2219	else {
				2220	PyErr_Format(PyExc_ValueError,
				2221	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2222	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2223	errors);
				2224	return -1;
				2225	}
				2226	}
				2227
				2228	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2229	int size,
				2230	PyObject *mapping,
				2231	const char *errors)
				2232	{
				2233	PyUnicodeObject *v;
				2234	Py_UNICODE *p;
				2235
				2236	if (mapping == NULL) {
				2237	PyErr_BadArgument();
				2238	return NULL;
				2239	}
				2240
				2241	/* Output will never be longer than input */
				2242	v = _PyUnicode_New(size);
				2243	if (v == NULL)
				2244	goto onError;
				2245	if (size == 0)
				2246	goto done;
				2247	p = PyUnicode_AS_UNICODE(v);
				2248	while (size-- > 0) {
				2249	Py_UNICODE ch = *s++;
				2250	PyObject w, x;
				2251
				2252	/* Get mapping */
				2253	w = PyInt_FromLong(ch);
				2254	if (w == NULL)
				2255	goto onError;
				2256	x = PyObject_GetItem(mapping, w);
				2257	Py_DECREF(w);
				2258	if (x == NULL) {
				2259	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2260	/* No mapping found: default to 1-1 mapping */
				2261	PyErr_Clear();
				2262	*p++ = ch;
				2263	continue;
				2264	}
				2265	goto onError;
				2266	}
				2267
				2268	/* Apply mapping */
				2269	if (PyInt_Check(x))
				2270	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2271	else if (x == Py_None) {
				2272	/* undefined mapping */
				2273	if (translate_error(&s, &p, errors,
				2274	"character maps to <undefined>")) {
				2275	Py_DECREF(x);
				2276	goto onError;
				2277	}
				2278	}
				2279	else if (PyUnicode_Check(x)) {
				2280	if (PyUnicode_GET_SIZE(x) != 1) {
				2281	/* 1-n mapping */
				2282	PyErr_SetString(PyExc_NotImplementedError,
				2283	"1-n mappings are currently not implemented");
				2284	Py_DECREF(x);
				2285	goto onError;
				2286	}
				2287	p++ = PyUnicode_AS_UNICODE(x);
				2288	}
				2289	else {
				2290	/* wrong return value */
				2291	PyErr_SetString(PyExc_TypeError,
				2292	"translate mapping must return integer, None or unicode");
				2293	Py_DECREF(x);
				2294	goto onError;
				2295	}
				2296	Py_DECREF(x);
				2297	}
				2298	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2299	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2300	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2301
				2302	done:
				2303	return (PyObject *)v;
				2304
				2305	onError:
				2306	Py_XDECREF(v);
				2307	return NULL;
				2308	}
				2309
				2310	PyObject PyUnicode_Translate(PyObject str,
				2311	PyObject *mapping,
				2312	const char *errors)
				2313	{
				2314	PyObject *result;
				2315
				2316	str = PyUnicode_FromObject(str);
				2317	if (str == NULL)
				2318	goto onError;
				2319	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2320	PyUnicode_GET_SIZE(str),
				2321	mapping,
				2322	errors);
				2323	Py_DECREF(str);
				2324	return result;
				2325
				2326	onError:
				2327	Py_XDECREF(str);
				2328	return NULL;
				2329	}
				2330
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2331	/* --- Decimal Encoder ---------------------------------------------------- */
				2332
				2333	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2334	int length,
				2335	char *output,
				2336	const char *errors)
				2337	{
				2338	Py_UNICODE p, end;
				2339
				2340	if (output == NULL) {
				2341	PyErr_BadArgument();
				2342	return -1;
				2343	}
				2344
				2345	p = s;
				2346	end = s + length;
				2347	while (p < end) {
				2348	register Py_UNICODE ch = *p++;
				2349	int decimal;
				2350
				2351	if (Py_UNICODE_ISSPACE(ch)) {
				2352	*output++ = ' ';
				2353	continue;
				2354	}
				2355	decimal = Py_UNICODE_TODECIMAL(ch);
				2356	if (decimal >= 0) {
				2357	*output++ = '0' + decimal;
				2358	continue;
				2359	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2360	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2361	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2362	continue;
				2363	}
				2364	/* All other characters are considered invalid */
				2365	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2366	PyErr_SetString(PyExc_ValueError,
				2367	"invalid decimal Unicode string");
				2368	goto onError;
				2369	}
				2370	else if (strcmp(errors, "ignore") == 0)
				2371	continue;
				2372	else if (strcmp(errors, "replace") == 0) {
				2373	*output++ = '?';
				2374	continue;
				2375	}
				2376	}
				2377	/* 0-terminate the output string */
				2378	*output++ = '\0';
				2379	return 0;
				2380
				2381	onError:
				2382	return -1;
				2383	}
				2384
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2385	/* --- Helpers ------------------------------------------------------------ */
				2386
				2387	static
				2388	int count(PyUnicodeObject *self,
				2389	int start,
				2390	int end,
				2391	PyUnicodeObject *substring)
				2392	{
				2393	int count = 0;
				2394
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2395	if (start < 0)
				2396	start += self->length;
				2397	if (start < 0)
				2398	start = 0;
				2399	if (end > self->length)
				2400	end = self->length;
				2401	if (end < 0)
				2402	end += self->length;
				2403	if (end < 0)
				2404	end = 0;
				2405
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2406	if (substring->length == 0)
				2407	return (end - start + 1);
				2408
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2409	end -= substring->length;
				2410
				2411	while (start <= end)
				2412	if (Py_UNICODE_MATCH(self, start, substring)) {
				2413	count++;
				2414	start += substring->length;
				2415	} else
				2416	start++;
				2417
				2418	return count;
				2419	}
				2420
				2421	int PyUnicode_Count(PyObject *str,
				2422	PyObject *substr,
				2423	int start,
				2424	int end)
				2425	{
				2426	int result;
				2427
				2428	str = PyUnicode_FromObject(str);
				2429	if (str == NULL)
				2430	return -1;
				2431	substr = PyUnicode_FromObject(substr);
				2432	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2433	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2434	return -1;
				2435	}
				2436
				2437	result = count((PyUnicodeObject *)str,
				2438	start, end,
				2439	(PyUnicodeObject *)substr);
				2440
				2441	Py_DECREF(str);
				2442	Py_DECREF(substr);
				2443	return result;
				2444	}
				2445
				2446	static
				2447	int findstring(PyUnicodeObject *self,
				2448	PyUnicodeObject *substring,
				2449	int start,
				2450	int end,
				2451	int direction)
				2452	{
				2453	if (start < 0)
				2454	start += self->length;
				2455	if (start < 0)
				2456	start = 0;
				2457
				2458	if (substring->length == 0)
				2459	return start;
				2460
				2461	if (end > self->length)
				2462	end = self->length;
				2463	if (end < 0)
				2464	end += self->length;
				2465	if (end < 0)
				2466	end = 0;
				2467
				2468	end -= substring->length;
				2469
				2470	if (direction < 0) {
				2471	for (; end >= start; end--)
				2472	if (Py_UNICODE_MATCH(self, end, substring))
				2473	return end;
				2474	} else {
				2475	for (; start <= end; start++)
				2476	if (Py_UNICODE_MATCH(self, start, substring))
				2477	return start;
				2478	}
				2479
				2480	return -1;
				2481	}
				2482
				2483	int PyUnicode_Find(PyObject *str,
				2484	PyObject *substr,
				2485	int start,
				2486	int end,
				2487	int direction)
				2488	{
				2489	int result;
				2490
				2491	str = PyUnicode_FromObject(str);
				2492	if (str == NULL)
				2493	return -1;
				2494	substr = PyUnicode_FromObject(substr);
				2495	if (substr == NULL) {
				2496	Py_DECREF(substr);
				2497	return -1;
				2498	}
				2499
				2500	result = findstring((PyUnicodeObject *)str,
				2501	(PyUnicodeObject *)substr,
				2502	start, end, direction);
				2503	Py_DECREF(str);
				2504	Py_DECREF(substr);
				2505	return result;
				2506	}
				2507
				2508	static
				2509	int tailmatch(PyUnicodeObject *self,
				2510	PyUnicodeObject *substring,
				2511	int start,
				2512	int end,
				2513	int direction)
				2514	{
				2515	if (start < 0)
				2516	start += self->length;
				2517	if (start < 0)
				2518	start = 0;
				2519
				2520	if (substring->length == 0)
				2521	return 1;
				2522
				2523	if (end > self->length)
				2524	end = self->length;
				2525	if (end < 0)
				2526	end += self->length;
				2527	if (end < 0)
				2528	end = 0;
				2529
				2530	end -= substring->length;
				2531	if (end < start)
				2532	return 0;
				2533
				2534	if (direction > 0) {
				2535	if (Py_UNICODE_MATCH(self, end, substring))
				2536	return 1;
				2537	} else {
				2538	if (Py_UNICODE_MATCH(self, start, substring))
				2539	return 1;
				2540	}
				2541
				2542	return 0;
				2543	}
				2544
				2545	int PyUnicode_Tailmatch(PyObject *str,
				2546	PyObject *substr,
				2547	int start,
				2548	int end,
				2549	int direction)
				2550	{
				2551	int result;
				2552
				2553	str = PyUnicode_FromObject(str);
				2554	if (str == NULL)
				2555	return -1;
				2556	substr = PyUnicode_FromObject(substr);
				2557	if (substr == NULL) {
				2558	Py_DECREF(substr);
				2559	return -1;
				2560	}
				2561
				2562	result = tailmatch((PyUnicodeObject *)str,
				2563	(PyUnicodeObject *)substr,
				2564	start, end, direction);
				2565	Py_DECREF(str);
				2566	Py_DECREF(substr);
				2567	return result;
				2568	}
				2569
				2570	static
				2571	const Py_UNICODE findchar(const Py_UNICODE s,
				2572	int size,
				2573	Py_UNICODE ch)
				2574	{
				2575	/* like wcschr, but doesn't stop at NULL characters */
				2576
				2577	while (size-- > 0) {
				2578	if (*s == ch)
				2579	return s;
				2580	s++;
				2581	}
				2582
				2583	return NULL;
				2584	}
				2585
				2586	/* Apply fixfct filter to the Unicode object self and return a
				2587	reference to the modified object */
				2588
				2589	static
				2590	PyObject fixup(PyUnicodeObject self,
				2591	int (fixfct)(PyUnicodeObject s))
				2592	{
				2593
				2594	PyUnicodeObject *u;
				2595
				2596	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2597	self->length);
				2598	if (u == NULL)
				2599	return NULL;
				2600	if (!fixfct(u)) {
				2601	/* fixfct should return TRUE if it modified the buffer. If
				2602	FALSE, return a reference to the original buffer instead
				2603	(to save space, not time) */
				2604	Py_INCREF(self);
				2605	Py_DECREF(u);
				2606	return (PyObject*) self;
				2607	}
				2608	return (PyObject*) u;
				2609	}
				2610
				2611	static
				2612	int fixupper(PyUnicodeObject *self)
				2613	{
				2614	int len = self->length;
				2615	Py_UNICODE *s = self->str;
				2616	int status = 0;
				2617
				2618	while (len-- > 0) {
				2619	register Py_UNICODE ch;
				2620
				2621	ch = Py_UNICODE_TOUPPER(*s);
				2622	if (ch != *s) {
				2623	status = 1;
				2624	*s = ch;
				2625	}
				2626	s++;
				2627	}
				2628
				2629	return status;
				2630	}
				2631
				2632	static
				2633	int fixlower(PyUnicodeObject *self)
				2634	{
				2635	int len = self->length;
				2636	Py_UNICODE *s = self->str;
				2637	int status = 0;
				2638
				2639	while (len-- > 0) {
				2640	register Py_UNICODE ch;
				2641
				2642	ch = Py_UNICODE_TOLOWER(*s);
				2643	if (ch != *s) {
				2644	status = 1;
				2645	*s = ch;
				2646	}
				2647	s++;
				2648	}
				2649
				2650	return status;
				2651	}
				2652
				2653	static
				2654	int fixswapcase(PyUnicodeObject *self)
				2655	{
				2656	int len = self->length;
				2657	Py_UNICODE *s = self->str;
				2658	int status = 0;
				2659
				2660	while (len-- > 0) {
				2661	if (Py_UNICODE_ISUPPER(*s)) {
				2662	s = Py_UNICODE_TOLOWER(s);
				2663	status = 1;
				2664	} else if (Py_UNICODE_ISLOWER(*s)) {
				2665	s = Py_UNICODE_TOUPPER(s);
				2666	status = 1;
				2667	}
				2668	s++;
				2669	}
				2670
				2671	return status;
				2672	}
				2673
				2674	static
				2675	int fixcapitalize(PyUnicodeObject *self)
				2676	{
				2677	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2678	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2679	return 1;
				2680	}
				2681	return 0;
				2682	}
				2683
				2684	static
				2685	int fixtitle(PyUnicodeObject *self)
				2686	{
				2687	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2688	register Py_UNICODE *e;
				2689	int previous_is_cased;
				2690
				2691	/* Shortcut for single character strings */
				2692	if (PyUnicode_GET_SIZE(self) == 1) {
				2693	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2694	if (*p != ch) {
				2695	*p = ch;
				2696	return 1;
				2697	}
				2698	else
				2699	return 0;
				2700	}
				2701
				2702	e = p + PyUnicode_GET_SIZE(self);
				2703	previous_is_cased = 0;
				2704	for (; p < e; p++) {
				2705	register const Py_UNICODE ch = *p;
				2706
				2707	if (previous_is_cased)
				2708	*p = Py_UNICODE_TOLOWER(ch);
				2709	else
				2710	*p = Py_UNICODE_TOTITLE(ch);
				2711
				2712	if (Py_UNICODE_ISLOWER(ch) \|\|
				2713	Py_UNICODE_ISUPPER(ch) \|\|
				2714	Py_UNICODE_ISTITLE(ch))
				2715	previous_is_cased = 1;
				2716	else
				2717	previous_is_cased = 0;
				2718	}
				2719	return 1;
				2720	}
				2721
				2722	PyObject PyUnicode_Join(PyObject separator,
				2723	PyObject *seq)
				2724	{
				2725	Py_UNICODE *sep;
				2726	int seplen;
				2727	PyUnicodeObject *res = NULL;
				2728	int reslen = 0;
				2729	Py_UNICODE *p;
				2730	int seqlen = 0;
				2731	int sz = 100;
				2732	int i;
				2733
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2734	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2735	if (seqlen < 0 && PyErr_Occurred())
				2736	return NULL;
				2737
				2738	if (separator == NULL) {
				2739	Py_UNICODE blank = ' ';
				2740	sep = &blank;
				2741	seplen = 1;
				2742	}
				2743	else {
				2744	separator = PyUnicode_FromObject(separator);
				2745	if (separator == NULL)
				2746	return NULL;
				2747	sep = PyUnicode_AS_UNICODE(separator);
				2748	seplen = PyUnicode_GET_SIZE(separator);
				2749	}
				2750
				2751	res = _PyUnicode_New(sz);
				2752	if (res == NULL)
				2753	goto onError;
				2754	p = PyUnicode_AS_UNICODE(res);
				2755	reslen = 0;
				2756
				2757	for (i = 0; i < seqlen; i++) {
				2758	int itemlen;
				2759	PyObject *item;
				2760
				2761	item = PySequence_GetItem(seq, i);
				2762	if (item == NULL)
				2763	goto onError;
				2764	if (!PyUnicode_Check(item)) {
				2765	PyObject *v;
				2766	v = PyUnicode_FromObject(item);
				2767	Py_DECREF(item);
				2768	item = v;
				2769	if (item == NULL)
				2770	goto onError;
				2771	}
				2772	itemlen = PyUnicode_GET_SIZE(item);
				2773	while (reslen + itemlen + seplen >= sz) {
				2774	if (_PyUnicode_Resize(res, sz*2))
				2775	goto onError;
				2776	sz *= 2;
				2777	p = PyUnicode_AS_UNICODE(res) + reslen;
				2778	}
				2779	if (i > 0) {
				2780	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2781	p += seplen;
				2782	reslen += seplen;
				2783	}
				2784	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2785	p += itemlen;
				2786	reslen += itemlen;
				2787	Py_DECREF(item);
				2788	}
				2789	if (_PyUnicode_Resize(res, reslen))
				2790	goto onError;
				2791
				2792	Py_XDECREF(separator);
				2793	return (PyObject *)res;
				2794
				2795	onError:
				2796	Py_XDECREF(separator);
				2797	Py_DECREF(res);
				2798	return NULL;
				2799	}
				2800
				2801	static
				2802	PyUnicodeObject pad(PyUnicodeObject self,
				2803	int left,
				2804	int right,
				2805	Py_UNICODE fill)
				2806	{
				2807	PyUnicodeObject *u;
				2808
				2809	if (left < 0)
				2810	left = 0;
				2811	if (right < 0)
				2812	right = 0;
				2813
				2814	if (left == 0 && right == 0) {
				2815	Py_INCREF(self);
				2816	return self;
				2817	}
				2818
				2819	u = _PyUnicode_New(left + self->length + right);
				2820	if (u) {
				2821	if (left)
				2822	Py_UNICODE_FILL(u->str, fill, left);
				2823	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2824	if (right)
				2825	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2826	}
				2827
				2828	return u;
				2829	}
				2830
				2831	#define SPLIT_APPEND(data, left, right) \
				2832	str = PyUnicode_FromUnicode(data + left, right - left); \
				2833	if (!str) \
				2834	goto onError; \
				2835	if (PyList_Append(list, str)) { \
				2836	Py_DECREF(str); \
				2837	goto onError; \
				2838	} \
				2839	else \
				2840	Py_DECREF(str);
				2841
				2842	static
				2843	PyObject split_whitespace(PyUnicodeObject self,
				2844	PyObject *list,
				2845	int maxcount)
				2846	{
				2847	register int i;
				2848	register int j;
				2849	int len = self->length;
				2850	PyObject *str;
				2851
				2852	for (i = j = 0; i < len; ) {
				2853	/* find a token */
				2854	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2855	i++;
				2856	j = i;
				2857	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2858	i++;
				2859	if (j < i) {
				2860	if (maxcount-- <= 0)
				2861	break;
				2862	SPLIT_APPEND(self->str, j, i);
				2863	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2864	i++;
				2865	j = i;
				2866	}
				2867	}
				2868	if (j < len) {
				2869	SPLIT_APPEND(self->str, j, len);
				2870	}
				2871	return list;
				2872
				2873	onError:
				2874	Py_DECREF(list);
				2875	return NULL;
				2876	}
				2877
				2878	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2879	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2880	{
				2881	register int i;
				2882	register int j;
				2883	int len;
				2884	PyObject *list;
				2885	PyObject *str;
				2886	Py_UNICODE *data;
				2887
				2888	string = PyUnicode_FromObject(string);
				2889	if (string == NULL)
				2890	return NULL;
				2891	data = PyUnicode_AS_UNICODE(string);
				2892	len = PyUnicode_GET_SIZE(string);
				2893
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2894	list = PyList_New(0);
				2895	if (!list)
				2896	goto onError;
				2897
				2898	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2899	int eol;
				2900
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2901	/* Find a line and append it */
				2902	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2903	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2904
				2905	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2906	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2907	if (i < len) {
				2908	if (data[i] == '\r' && i + 1 < len &&
				2909	data[i+1] == '\n')
				2910	i += 2;
				2911	else
				2912	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2913	if (keepends)
				2914	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2915	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2916	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2917	j = i;
				2918	}
				2919	if (j < len) {
				2920	SPLIT_APPEND(data, j, len);
				2921	}
				2922
				2923	Py_DECREF(string);
				2924	return list;
				2925
				2926	onError:
				2927	Py_DECREF(list);
				2928	Py_DECREF(string);
				2929	return NULL;
				2930	}
				2931
				2932	static
				2933	PyObject split_char(PyUnicodeObject self,
				2934	PyObject *list,
				2935	Py_UNICODE ch,
				2936	int maxcount)
				2937	{
				2938	register int i;
				2939	register int j;
				2940	int len = self->length;
				2941	PyObject *str;
				2942
				2943	for (i = j = 0; i < len; ) {
				2944	if (self->str[i] == ch) {
				2945	if (maxcount-- <= 0)
				2946	break;
				2947	SPLIT_APPEND(self->str, j, i);
				2948	i = j = i + 1;
				2949	} else
				2950	i++;
				2951	}
				2952	if (j <= len) {
				2953	SPLIT_APPEND(self->str, j, len);
				2954	}
				2955	return list;
				2956
				2957	onError:
				2958	Py_DECREF(list);
				2959	return NULL;
				2960	}
				2961
				2962	static
				2963	PyObject split_substring(PyUnicodeObject self,
				2964	PyObject *list,
				2965	PyUnicodeObject *substring,
				2966	int maxcount)
				2967	{
				2968	register int i;
				2969	register int j;
				2970	int len = self->length;
				2971	int sublen = substring->length;
				2972	PyObject *str;
				2973
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2974	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2975	if (Py_UNICODE_MATCH(self, i, substring)) {
				2976	if (maxcount-- <= 0)
				2977	break;
				2978	SPLIT_APPEND(self->str, j, i);
				2979	i = j = i + sublen;
				2980	} else
				2981	i++;
				2982	}
				2983	if (j <= len) {
				2984	SPLIT_APPEND(self->str, j, len);
				2985	}
				2986	return list;
				2987
				2988	onError:
				2989	Py_DECREF(list);
				2990	return NULL;
				2991	}
				2992
				2993	#undef SPLIT_APPEND
				2994
				2995	static
				2996	PyObject split(PyUnicodeObject self,
				2997	PyUnicodeObject *substring,
				2998	int maxcount)
				2999	{
				3000	PyObject *list;
				3001
				3002	if (maxcount < 0)
				3003	maxcount = INT_MAX;
				3004
				3005	list = PyList_New(0);
				3006	if (!list)
				3007	return NULL;
				3008
				3009	if (substring == NULL)
				3010	return split_whitespace(self,list,maxcount);
				3011
				3012	else if (substring->length == 1)
				3013	return split_char(self,list,substring->str[0],maxcount);
				3014
				3015	else if (substring->length == 0) {
				3016	Py_DECREF(list);
				3017	PyErr_SetString(PyExc_ValueError, "empty separator");
				3018	return NULL;
				3019	}
				3020	else
				3021	return split_substring(self,list,substring,maxcount);
				3022	}
				3023
				3024	static
				3025	PyObject strip(PyUnicodeObject self,
				3026	int left,
				3027	int right)
				3028	{
				3029	Py_UNICODE *p = self->str;
				3030	int start = 0;
				3031	int end = self->length;
				3032
				3033	if (left)
				3034	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3035	start++;
				3036
				3037	if (right)
				3038	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3039	end--;
				3040
				3041	if (start == 0 && end == self->length) {
				3042	/* couldn't strip anything off, return original string */
				3043	Py_INCREF(self);
				3044	return (PyObject*) self;
				3045	}
				3046
				3047	return (PyObject*) PyUnicode_FromUnicode(
				3048	self->str + start,
				3049	end - start
				3050	);
				3051	}
				3052
				3053	static
				3054	PyObject replace(PyUnicodeObject self,
				3055	PyUnicodeObject *str1,
				3056	PyUnicodeObject *str2,
				3057	int maxcount)
				3058	{
				3059	PyUnicodeObject *u;
				3060
				3061	if (maxcount < 0)
				3062	maxcount = INT_MAX;
				3063
				3064	if (str1->length == 1 && str2->length == 1) {
				3065	int i;
				3066
				3067	/* replace characters */
				3068	if (!findchar(self->str, self->length, str1->str[0])) {
				3069	/* nothing to replace, return original string */
				3070	Py_INCREF(self);
				3071	u = self;
				3072	} else {
				3073	Py_UNICODE u1 = str1->str[0];
				3074	Py_UNICODE u2 = str2->str[0];
				3075
				3076	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3077	self->str,
				3078	self->length
				3079	);
				3080	if (u)
				3081	for (i = 0; i < u->length; i++)
				3082	if (u->str[i] == u1) {
				3083	if (--maxcount < 0)
				3084	break;
				3085	u->str[i] = u2;
				3086	}
				3087	}
				3088
				3089	} else {
				3090	int n, i;
				3091	Py_UNICODE *p;
				3092
				3093	/* replace strings */
				3094	n = count(self, 0, self->length, str1);
				3095	if (n > maxcount)
				3096	n = maxcount;
				3097	if (n == 0) {
				3098	/* nothing to replace, return original string */
				3099	Py_INCREF(self);
				3100	u = self;
				3101	} else {
				3102	u = _PyUnicode_New(
				3103	self->length + n * (str2->length - str1->length));
				3104	if (u) {
				3105	i = 0;
				3106	p = u->str;
				3107	while (i <= self->length - str1->length)
				3108	if (Py_UNICODE_MATCH(self, i, str1)) {
				3109	/* replace string segment */
				3110	Py_UNICODE_COPY(p, str2->str, str2->length);
				3111	p += str2->length;
				3112	i += str1->length;
				3113	if (--n <= 0) {
				3114	/* copy remaining part */
				3115	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3116	break;
				3117	}
				3118	} else
				3119	*p++ = self->str[i++];
				3120	}
				3121	}
				3122	}
				3123
				3124	return (PyObject *) u;
				3125	}
				3126
				3127	/* --- Unicode Object Methods --------------------------------------------- */
				3128
				3129	static char title__doc__[] =
				3130	"S.title() -> unicode\n\
				3131	\n\
				3132	Return a titlecased version of S, i.e. words start with title case\n\
				3133	characters, all remaining cased characters have lower case.";
				3134
				3135	static PyObject*
				3136	unicode_title(PyUnicodeObject self, PyObject args)
				3137	{
				3138	if (!PyArg_NoArgs(args))
				3139	return NULL;
				3140	return fixup(self, fixtitle);
				3141	}
				3142
				3143	static char capitalize__doc__[] =
				3144	"S.capitalize() -> unicode\n\
				3145	\n\
				3146	Return a capitalized version of S, i.e. make the first character\n\
				3147	have upper case.";
				3148
				3149	static PyObject*
				3150	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3151	{
				3152	if (!PyArg_NoArgs(args))
				3153	return NULL;
				3154	return fixup(self, fixcapitalize);
				3155	}
				3156
				3157	#if 0
				3158	static char capwords__doc__[] =
				3159	"S.capwords() -> unicode\n\
				3160	\n\
				3161	Apply .capitalize() to all words in S and return the result with\n\
				3162	normalized whitespace (all whitespace strings are replaced by ' ').";
				3163
				3164	static PyObject*
				3165	unicode_capwords(PyUnicodeObject self, PyObject args)
				3166	{
				3167	PyObject *list;
				3168	PyObject *item;
				3169	int i;
				3170
				3171	if (!PyArg_NoArgs(args))
				3172	return NULL;
				3173
				3174	/* Split into words */
				3175	list = split(self, NULL, -1);
				3176	if (!list)
				3177	return NULL;
				3178
				3179	/* Capitalize each word */
				3180	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3181	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3182	fixcapitalize);
				3183	if (item == NULL)
				3184	goto onError;
				3185	Py_DECREF(PyList_GET_ITEM(list, i));
				3186	PyList_SET_ITEM(list, i, item);
				3187	}
				3188
				3189	/* Join the words to form a new string */
				3190	item = PyUnicode_Join(NULL, list);
				3191
				3192	onError:
				3193	Py_DECREF(list);
				3194	return (PyObject *)item;
				3195	}
				3196	#endif
				3197
				3198	static char center__doc__[] =
				3199	"S.center(width) -> unicode\n\
				3200	\n\
				3201	Return S centered in a Unicode string of length width. Padding is done\n\
				3202	using spaces.";
				3203
				3204	static PyObject *
				3205	unicode_center(PyUnicodeObject self, PyObject args)
				3206	{
				3207	int marg, left;
				3208	int width;
				3209
				3210	if (!PyArg_ParseTuple(args, "i:center", &width))
				3211	return NULL;
				3212
				3213	if (self->length >= width) {
				3214	Py_INCREF(self);
				3215	return (PyObject*) self;
				3216	}
				3217
				3218	marg = width - self->length;
				3219	left = marg / 2 + (marg & width & 1);
				3220
				3221	return (PyObject*) pad(self, left, marg - left, ' ');
				3222	}
				3223
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3224	#if 0
				3225
				3226	/* This code should go into some future Unicode collation support
				3227	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3228	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3229
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3230	/* speedy UTF-16 code point order comparison */
				3231	/* gleaned from: */
				3232	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3233
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3234	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3235	{
				3236	0, 0, 0, 0, 0, 0, 0, 0,
				3237	0, 0, 0, 0, 0, 0, 0, 0,
				3238	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3239	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3240	};
				3241
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3242	static int
				3243	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3244	{
				3245	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3246
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3247	Py_UNICODE *s1 = str1->str;
				3248	Py_UNICODE *s2 = str2->str;
				3249
				3250	len1 = str1->length;
				3251	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3252
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3253	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3254	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3255	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3256
				3257	c1 = *s1++;
				3258	c2 = *s2++;
				3259	if (c1 > (1<<11) * 26)
				3260	c1 += utf16Fixup[c1>>11];
				3261	if (c2 > (1<<11) * 26)
				3262	c2 += utf16Fixup[c2>>11];
				3263
				3264	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3265	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3266	if (diff)
				3267	return (diff < 0) ? -1 : (diff != 0);
				3268	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3269	}
				3270
				3271	return (len1 < len2) ? -1 : (len1 != len2);
				3272	}
				3273
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3274	#else
				3275
				3276	static int
				3277	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3278	{
				3279	register int len1, len2;
				3280
				3281	Py_UNICODE *s1 = str1->str;
				3282	Py_UNICODE *s2 = str2->str;
				3283
				3284	len1 = str1->length;
				3285	len2 = str2->length;
				3286
				3287	while (len1 > 0 && len2 > 0) {
				3288	register long diff;
				3289
				3290	diff = (long)s1++ - (long)s2++;
				3291	if (diff)
				3292	return (diff < 0) ? -1 : (diff != 0);
				3293	len1--; len2--;
				3294	}
				3295
				3296	return (len1 < len2) ? -1 : (len1 != len2);
				3297	}
				3298
				3299	#endif
				3300
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3301	int PyUnicode_Compare(PyObject *left,
				3302	PyObject *right)
				3303	{
				3304	PyUnicodeObject u = NULL, v = NULL;
				3305	int result;
				3306
				3307	/* Coerce the two arguments */
				3308	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3309	if (u == NULL)
				3310	goto onError;
				3311	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3312	if (v == NULL)
				3313	goto onError;
				3314
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3315	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3316	if (v == u) {
				3317	Py_DECREF(u);
				3318	Py_DECREF(v);
				3319	return 0;
				3320	}
				3321
				3322	result = unicode_compare(u, v);
				3323
				3324	Py_DECREF(u);
				3325	Py_DECREF(v);
				3326	return result;
				3327
				3328	onError:
				3329	Py_XDECREF(u);
				3330	Py_XDECREF(v);
				3331	return -1;
				3332	}
				3333
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3334	int PyUnicode_Contains(PyObject *container,
				3335	PyObject *element)
				3336	{
				3337	PyUnicodeObject u = NULL, v = NULL;
				3338	int result;
				3339	register const Py_UNICODE p, e;
				3340	register Py_UNICODE ch;
				3341
				3342	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3343	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3344	if (v == NULL) {
				3345	PyErr_SetString(PyExc_TypeError,
				3346	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3347	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3348	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3349	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3350	if (u == NULL) {
				3351	Py_DECREF(v);
				3352	goto onError;
				3353	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3354
				3355	/* Check v in u */
				3356	if (PyUnicode_GET_SIZE(v) != 1) {
				3357	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3358	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3359	goto onError;
				3360	}
				3361	ch = *PyUnicode_AS_UNICODE(v);
				3362	p = PyUnicode_AS_UNICODE(u);
				3363	e = p + PyUnicode_GET_SIZE(u);
				3364	result = 0;
				3365	while (p < e) {
				3366	if (*p++ == ch) {
				3367	result = 1;
				3368	break;
				3369	}
				3370	}
				3371
				3372	Py_DECREF(u);
				3373	Py_DECREF(v);
				3374	return result;
				3375
				3376	onError:
				3377	Py_XDECREF(u);
				3378	Py_XDECREF(v);
				3379	return -1;
				3380	}
				3381
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3382	/* Concat to string or Unicode object giving a new Unicode object. */
				3383
				3384	PyObject PyUnicode_Concat(PyObject left,
				3385	PyObject *right)
				3386	{
				3387	PyUnicodeObject u = NULL, v = NULL, *w;
				3388
				3389	/* Coerce the two arguments */
				3390	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3391	if (u == NULL)
				3392	goto onError;
				3393	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3394	if (v == NULL)
				3395	goto onError;
				3396
				3397	/* Shortcuts */
				3398	if (v == unicode_empty) {
				3399	Py_DECREF(v);
				3400	return (PyObject *)u;
				3401	}
				3402	if (u == unicode_empty) {
				3403	Py_DECREF(u);
				3404	return (PyObject *)v;
				3405	}
				3406
				3407	/* Concat the two Unicode strings */
				3408	w = _PyUnicode_New(u->length + v->length);
				3409	if (w == NULL)
				3410	goto onError;
				3411	Py_UNICODE_COPY(w->str, u->str, u->length);
				3412	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3413
				3414	Py_DECREF(u);
				3415	Py_DECREF(v);
				3416	return (PyObject *)w;
				3417
				3418	onError:
				3419	Py_XDECREF(u);
				3420	Py_XDECREF(v);
				3421	return NULL;
				3422	}
				3423
				3424	static char count__doc__[] =
				3425	"S.count(sub[, start[, end]]) -> int\n\
				3426	\n\
				3427	Return the number of occurrences of substring sub in Unicode string\n\
				3428	S[start:end]. Optional arguments start and end are\n\
				3429	interpreted as in slice notation.";
				3430
				3431	static PyObject *
				3432	unicode_count(PyUnicodeObject self, PyObject args)
				3433	{
				3434	PyUnicodeObject *substring;
				3435	int start = 0;
				3436	int end = INT_MAX;
				3437	PyObject *result;
				3438
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3439	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3440	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3441	return NULL;
				3442
				3443	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3444	(PyObject *)substring);
				3445	if (substring == NULL)
				3446	return NULL;
				3447
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3448	if (start < 0)
				3449	start += self->length;
				3450	if (start < 0)
				3451	start = 0;
				3452	if (end > self->length)
				3453	end = self->length;
				3454	if (end < 0)
				3455	end += self->length;
				3456	if (end < 0)
				3457	end = 0;
				3458
				3459	result = PyInt_FromLong((long) count(self, start, end, substring));
				3460
				3461	Py_DECREF(substring);
				3462	return result;
				3463	}
				3464
				3465	static char encode__doc__[] =
				3466	"S.encode([encoding[,errors]]) -> string\n\
				3467	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3468	Return an encoded string version of S. Default encoding is the current\n\
				3469	default string encoding. errors may be given to set a different error\n\
				3470	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3471	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3472
				3473	static PyObject *
				3474	unicode_encode(PyUnicodeObject self, PyObject args)
				3475	{
				3476	char *encoding = NULL;
				3477	char *errors = NULL;
				3478	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3479	return NULL;
				3480	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3481	}
				3482
				3483	static char expandtabs__doc__[] =
				3484	"S.expandtabs([tabsize]) -> unicode\n\
				3485	\n\
				3486	Return a copy of S where all tab characters are expanded using spaces.\n\
				3487	If tabsize is not given, a tab size of 8 characters is assumed.";
				3488
				3489	static PyObject*
				3490	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3491	{
				3492	Py_UNICODE *e;
				3493	Py_UNICODE *p;
				3494	Py_UNICODE *q;
				3495	int i, j;
				3496	PyUnicodeObject *u;
				3497	int tabsize = 8;
				3498
				3499	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3500	return NULL;
				3501
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3502	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3503	i = j = 0;
				3504	e = self->str + self->length;
				3505	for (p = self->str; p < e; p++)
				3506	if (*p == '\t') {
				3507	if (tabsize > 0)
				3508	j += tabsize - (j % tabsize);
				3509	}
				3510	else {
				3511	j++;
				3512	if (p == '\n' \|\| p == '\r') {
				3513	i += j;
				3514	j = 0;
				3515	}
				3516	}
				3517
				3518	/* Second pass: create output string and fill it */
				3519	u = _PyUnicode_New(i + j);
				3520	if (!u)
				3521	return NULL;
				3522
				3523	j = 0;
				3524	q = u->str;
				3525
				3526	for (p = self->str; p < e; p++)
				3527	if (*p == '\t') {
				3528	if (tabsize > 0) {
				3529	i = tabsize - (j % tabsize);
				3530	j += i;
				3531	while (i--)
				3532	*q++ = ' ';
				3533	}
				3534	}
				3535	else {
				3536	j++;
				3537	q++ = p;
				3538	if (p == '\n' \|\| p == '\r')
				3539	j = 0;
				3540	}
				3541
				3542	return (PyObject*) u;
				3543	}
				3544
				3545	static char find__doc__[] =
				3546	"S.find(sub [,start [,end]]) -> int\n\
				3547	\n\
				3548	Return the lowest index in S where substring sub is found,\n\
				3549	such that sub is contained within s[start,end]. Optional\n\
				3550	arguments start and end are interpreted as in slice notation.\n\
				3551	\n\
				3552	Return -1 on failure.";
				3553
				3554	static PyObject *
				3555	unicode_find(PyUnicodeObject self, PyObject args)
				3556	{
				3557	PyUnicodeObject *substring;
				3558	int start = 0;
				3559	int end = INT_MAX;
				3560	PyObject *result;
				3561
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3562	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3563	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3564	return NULL;
				3565	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3566	(PyObject *)substring);
				3567	if (substring == NULL)
				3568	return NULL;
				3569
				3570	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3571
				3572	Py_DECREF(substring);
				3573	return result;
				3574	}
				3575
				3576	static PyObject *
				3577	unicode_getitem(PyUnicodeObject *self, int index)
				3578	{
				3579	if (index < 0 \|\| index >= self->length) {
				3580	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3581	return NULL;
				3582	}
				3583
				3584	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3585	}
				3586
				3587	static long
				3588	unicode_hash(PyUnicodeObject *self)
				3589	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3590	/* Since Unicode objects compare equal to their ASCII string
				3591	counterparts, they should use the individual character values
				3592	as basis for their hash value. This is needed to assure that
				3593	strings and Unicode objects behave in the same way as
				3594	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3595
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3596	register int len;
				3597	register Py_UNICODE *p;
				3598	register long x;
				3599
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3600	if (self->hash != -1)
				3601	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3602	len = PyUnicode_GET_SIZE(self);
				3603	p = PyUnicode_AS_UNICODE(self);
				3604	x = *p << 7;
				3605	while (--len >= 0)
				3606	x = (1000003x) ^ p++;
				3607	x ^= PyUnicode_GET_SIZE(self);
				3608	if (x == -1)
				3609	x = -2;
				3610	self->hash = x;
				3611	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3612	}
				3613
				3614	static char index__doc__[] =
				3615	"S.index(sub [,start [,end]]) -> int\n\
				3616	\n\
				3617	Like S.find() but raise ValueError when the substring is not found.";
				3618
				3619	static PyObject *
				3620	unicode_index(PyUnicodeObject self, PyObject args)
				3621	{
				3622	int result;
				3623	PyUnicodeObject *substring;
				3624	int start = 0;
				3625	int end = INT_MAX;
				3626
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3627	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3628	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3629	return NULL;
				3630
				3631	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3632	(PyObject *)substring);
				3633	if (substring == NULL)
				3634	return NULL;
				3635
				3636	result = findstring(self, substring, start, end, 1);
				3637
				3638	Py_DECREF(substring);
				3639	if (result < 0) {
				3640	PyErr_SetString(PyExc_ValueError, "substring not found");
				3641	return NULL;
				3642	}
				3643	return PyInt_FromLong(result);
				3644	}
				3645
				3646	static char islower__doc__[] =
				3647	"S.islower() -> int\n\
				3648	\n\
				3649	Return 1 if all cased characters in S are lowercase and there is\n\
				3650	at least one cased character in S, 0 otherwise.";
				3651
				3652	static PyObject*
				3653	unicode_islower(PyUnicodeObject self, PyObject args)
				3654	{
				3655	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3656	register const Py_UNICODE *e;
				3657	int cased;
				3658
				3659	if (!PyArg_NoArgs(args))
				3660	return NULL;
				3661
				3662	/* Shortcut for single character strings */
				3663	if (PyUnicode_GET_SIZE(self) == 1)
				3664	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3665
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3666	/* Special case for empty strings */
				3667	if (PyString_GET_SIZE(self) == 0)
				3668	return PyInt_FromLong(0);
				3669
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3670	e = p + PyUnicode_GET_SIZE(self);
				3671	cased = 0;
				3672	for (; p < e; p++) {
				3673	register const Py_UNICODE ch = *p;
				3674
				3675	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3676	return PyInt_FromLong(0);
				3677	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3678	cased = 1;
				3679	}
				3680	return PyInt_FromLong(cased);
				3681	}
				3682
				3683	static char isupper__doc__[] =
				3684	"S.isupper() -> int\n\
				3685	\n\
				3686	Return 1 if all cased characters in S are uppercase and there is\n\
				3687	at least one cased character in S, 0 otherwise.";
				3688
				3689	static PyObject*
				3690	unicode_isupper(PyUnicodeObject self, PyObject args)
				3691	{
				3692	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3693	register const Py_UNICODE *e;
				3694	int cased;
				3695
				3696	if (!PyArg_NoArgs(args))
				3697	return NULL;
				3698
				3699	/* Shortcut for single character strings */
				3700	if (PyUnicode_GET_SIZE(self) == 1)
				3701	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3702
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3703	/* Special case for empty strings */
				3704	if (PyString_GET_SIZE(self) == 0)
				3705	return PyInt_FromLong(0);
				3706
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3707	e = p + PyUnicode_GET_SIZE(self);
				3708	cased = 0;
				3709	for (; p < e; p++) {
				3710	register const Py_UNICODE ch = *p;
				3711
				3712	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3713	return PyInt_FromLong(0);
				3714	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3715	cased = 1;
				3716	}
				3717	return PyInt_FromLong(cased);
				3718	}
				3719
				3720	static char istitle__doc__[] =
				3721	"S.istitle() -> int\n\
				3722	\n\
				3723	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3724	may only follow uncased characters and lowercase characters only cased\n\
				3725	ones. Return 0 otherwise.";
				3726
				3727	static PyObject*
				3728	unicode_istitle(PyUnicodeObject self, PyObject args)
				3729	{
				3730	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3731	register const Py_UNICODE *e;
				3732	int cased, previous_is_cased;
				3733
				3734	if (!PyArg_NoArgs(args))
				3735	return NULL;
				3736
				3737	/* Shortcut for single character strings */
				3738	if (PyUnicode_GET_SIZE(self) == 1)
				3739	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3740	(Py_UNICODE_ISUPPER(*p) != 0));
				3741
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3742	/* Special case for empty strings */
				3743	if (PyString_GET_SIZE(self) == 0)
				3744	return PyInt_FromLong(0);
				3745
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3746	e = p + PyUnicode_GET_SIZE(self);
				3747	cased = 0;
				3748	previous_is_cased = 0;
				3749	for (; p < e; p++) {
				3750	register const Py_UNICODE ch = *p;
				3751
				3752	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3753	if (previous_is_cased)
				3754	return PyInt_FromLong(0);
				3755	previous_is_cased = 1;
				3756	cased = 1;
				3757	}
				3758	else if (Py_UNICODE_ISLOWER(ch)) {
				3759	if (!previous_is_cased)
				3760	return PyInt_FromLong(0);
				3761	previous_is_cased = 1;
				3762	cased = 1;
				3763	}
				3764	else
				3765	previous_is_cased = 0;
				3766	}
				3767	return PyInt_FromLong(cased);
				3768	}
				3769
				3770	static char isspace__doc__[] =
				3771	"S.isspace() -> int\n\
				3772	\n\
				3773	Return 1 if there are only whitespace characters in S,\n\
				3774	0 otherwise.";
				3775
				3776	static PyObject*
				3777	unicode_isspace(PyUnicodeObject self, PyObject args)
				3778	{
				3779	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3780	register const Py_UNICODE *e;
				3781
				3782	if (!PyArg_NoArgs(args))
				3783	return NULL;
				3784
				3785	/* Shortcut for single character strings */
				3786	if (PyUnicode_GET_SIZE(self) == 1 &&
				3787	Py_UNICODE_ISSPACE(*p))
				3788	return PyInt_FromLong(1);
				3789
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3790	/* Special case for empty strings */
				3791	if (PyString_GET_SIZE(self) == 0)
				3792	return PyInt_FromLong(0);
				3793
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3794	e = p + PyUnicode_GET_SIZE(self);
				3795	for (; p < e; p++) {
				3796	if (!Py_UNICODE_ISSPACE(*p))
				3797	return PyInt_FromLong(0);
				3798	}
				3799	return PyInt_FromLong(1);
				3800	}
				3801
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3802	static char isalpha__doc__[] =
				3803	"S.isalpha() -> int\n\
				3804	\n\
				3805	Return 1 if all characters in S are alphabetic\n\
				3806	and there is at least one character in S, 0 otherwise.";
				3807
				3808	static PyObject*
				3809	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3810	{
				3811	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3812	register const Py_UNICODE *e;
				3813
				3814	if (!PyArg_NoArgs(args))
				3815	return NULL;
				3816
				3817	/* Shortcut for single character strings */
				3818	if (PyUnicode_GET_SIZE(self) == 1 &&
				3819	Py_UNICODE_ISALPHA(*p))
				3820	return PyInt_FromLong(1);
				3821
				3822	/* Special case for empty strings */
				3823	if (PyString_GET_SIZE(self) == 0)
				3824	return PyInt_FromLong(0);
				3825
				3826	e = p + PyUnicode_GET_SIZE(self);
				3827	for (; p < e; p++) {
				3828	if (!Py_UNICODE_ISALPHA(*p))
				3829	return PyInt_FromLong(0);
				3830	}
				3831	return PyInt_FromLong(1);
				3832	}
				3833
				3834	static char isalnum__doc__[] =
				3835	"S.isalnum() -> int\n\
				3836	\n\
				3837	Return 1 if all characters in S are alphanumeric\n\
				3838	and there is at least one character in S, 0 otherwise.";
				3839
				3840	static PyObject*
				3841	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3842	{
				3843	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3844	register const Py_UNICODE *e;
				3845
				3846	if (!PyArg_NoArgs(args))
				3847	return NULL;
				3848
				3849	/* Shortcut for single character strings */
				3850	if (PyUnicode_GET_SIZE(self) == 1 &&
				3851	Py_UNICODE_ISALNUM(*p))
				3852	return PyInt_FromLong(1);
				3853
				3854	/* Special case for empty strings */
				3855	if (PyString_GET_SIZE(self) == 0)
				3856	return PyInt_FromLong(0);
				3857
				3858	e = p + PyUnicode_GET_SIZE(self);
				3859	for (; p < e; p++) {
				3860	if (!Py_UNICODE_ISALNUM(*p))
				3861	return PyInt_FromLong(0);
				3862	}
				3863	return PyInt_FromLong(1);
				3864	}
				3865
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3866	static char isdecimal__doc__[] =
				3867	"S.isdecimal() -> int\n\
				3868	\n\
				3869	Return 1 if there are only decimal characters in S,\n\
				3870	0 otherwise.";
				3871
				3872	static PyObject*
				3873	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3874	{
				3875	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3876	register const Py_UNICODE *e;
				3877
				3878	if (!PyArg_NoArgs(args))
				3879	return NULL;
				3880
				3881	/* Shortcut for single character strings */
				3882	if (PyUnicode_GET_SIZE(self) == 1 &&
				3883	Py_UNICODE_ISDECIMAL(*p))
				3884	return PyInt_FromLong(1);
				3885
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3886	/* Special case for empty strings */
				3887	if (PyString_GET_SIZE(self) == 0)
				3888	return PyInt_FromLong(0);
				3889
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3890	e = p + PyUnicode_GET_SIZE(self);
				3891	for (; p < e; p++) {
				3892	if (!Py_UNICODE_ISDECIMAL(*p))
				3893	return PyInt_FromLong(0);
				3894	}
				3895	return PyInt_FromLong(1);
				3896	}
				3897
				3898	static char isdigit__doc__[] =
				3899	"S.isdigit() -> int\n\
				3900	\n\
				3901	Return 1 if there are only digit characters in S,\n\
				3902	0 otherwise.";
				3903
				3904	static PyObject*
				3905	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3906	{
				3907	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3908	register const Py_UNICODE *e;
				3909
				3910	if (!PyArg_NoArgs(args))
				3911	return NULL;
				3912
				3913	/* Shortcut for single character strings */
				3914	if (PyUnicode_GET_SIZE(self) == 1 &&
				3915	Py_UNICODE_ISDIGIT(*p))
				3916	return PyInt_FromLong(1);
				3917
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3918	/* Special case for empty strings */
				3919	if (PyString_GET_SIZE(self) == 0)
				3920	return PyInt_FromLong(0);
				3921
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3922	e = p + PyUnicode_GET_SIZE(self);
				3923	for (; p < e; p++) {
				3924	if (!Py_UNICODE_ISDIGIT(*p))
				3925	return PyInt_FromLong(0);
				3926	}
				3927	return PyInt_FromLong(1);
				3928	}
				3929
				3930	static char isnumeric__doc__[] =
				3931	"S.isnumeric() -> int\n\
				3932	\n\
				3933	Return 1 if there are only numeric characters in S,\n\
				3934	0 otherwise.";
				3935
				3936	static PyObject*
				3937	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3938	{
				3939	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3940	register const Py_UNICODE *e;
				3941
				3942	if (!PyArg_NoArgs(args))
				3943	return NULL;
				3944
				3945	/* Shortcut for single character strings */
				3946	if (PyUnicode_GET_SIZE(self) == 1 &&
				3947	Py_UNICODE_ISNUMERIC(*p))
				3948	return PyInt_FromLong(1);
				3949
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3950	/* Special case for empty strings */
				3951	if (PyString_GET_SIZE(self) == 0)
				3952	return PyInt_FromLong(0);
				3953
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3954	e = p + PyUnicode_GET_SIZE(self);
				3955	for (; p < e; p++) {
				3956	if (!Py_UNICODE_ISNUMERIC(*p))
				3957	return PyInt_FromLong(0);
				3958	}
				3959	return PyInt_FromLong(1);
				3960	}
				3961
				3962	static char join__doc__[] =
				3963	"S.join(sequence) -> unicode\n\
				3964	\n\
				3965	Return a string which is the concatenation of the strings in the\n\
				3966	sequence. The separator between elements is S.";
				3967
				3968	static PyObject*
				3969	unicode_join(PyUnicodeObject self, PyObject args)
				3970	{
				3971	PyObject *data;
				3972	if (!PyArg_ParseTuple(args, "O:join", &data))
				3973	return NULL;
				3974
				3975	return PyUnicode_Join((PyObject *)self, data);
				3976	}
				3977
				3978	static int
				3979	unicode_length(PyUnicodeObject *self)
				3980	{
				3981	return self->length;
				3982	}
				3983
				3984	static char ljust__doc__[] =
				3985	"S.ljust(width) -> unicode\n\
				3986	\n\
				3987	Return S left justified in a Unicode string of length width. Padding is\n\
				3988	done using spaces.";
				3989
				3990	static PyObject *
				3991	unicode_ljust(PyUnicodeObject self, PyObject args)
				3992	{
				3993	int width;
				3994	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3995	return NULL;
				3996
				3997	if (self->length >= width) {
				3998	Py_INCREF(self);
				3999	return (PyObject*) self;
				4000	}
				4001
				4002	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4003	}
				4004
				4005	static char lower__doc__[] =
				4006	"S.lower() -> unicode\n\
				4007	\n\
				4008	Return a copy of the string S converted to lowercase.";
				4009
				4010	static PyObject*
				4011	unicode_lower(PyUnicodeObject self, PyObject args)
				4012	{
				4013	if (!PyArg_NoArgs(args))
				4014	return NULL;
				4015	return fixup(self, fixlower);
				4016	}
				4017
				4018	static char lstrip__doc__[] =
				4019	"S.lstrip() -> unicode\n\
				4020	\n\
				4021	Return a copy of the string S with leading whitespace removed.";
				4022
				4023	static PyObject *
				4024	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4025	{
				4026	if (!PyArg_NoArgs(args))
				4027	return NULL;
				4028	return strip(self, 1, 0);
				4029	}
				4030
				4031	static PyObject*
				4032	unicode_repeat(PyUnicodeObject *str, int len)
				4033	{
				4034	PyUnicodeObject *u;
				4035	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4036	int nchars;
				4037	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4038
				4039	if (len < 0)
				4040	len = 0;
				4041
				4042	if (len == 1) {
				4043	/* no repeat, return original string */
				4044	Py_INCREF(str);
				4045	return (PyObject*) str;
				4046	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4047
				4048	/* ensure # of chars needed doesn't overflow int and # of bytes
				4049	* needed doesn't overflow size_t
				4050	*/
				4051	nchars = len * str->length;
				4052	if (len && nchars / len != str->length) {
				4053	PyErr_SetString(PyExc_OverflowError,
				4054	"repeated string is too long");
				4055	return NULL;
				4056	}
				4057	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4058	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4059	PyErr_SetString(PyExc_OverflowError,
				4060	"repeated string is too long");
				4061	return NULL;
				4062	}
				4063	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4064	if (!u)
				4065	return NULL;
				4066
				4067	p = u->str;
				4068
				4069	while (len-- > 0) {
				4070	Py_UNICODE_COPY(p, str->str, str->length);
				4071	p += str->length;
				4072	}
				4073
				4074	return (PyObject*) u;
				4075	}
				4076
				4077	PyObject PyUnicode_Replace(PyObject obj,
				4078	PyObject *subobj,
				4079	PyObject *replobj,
				4080	int maxcount)
				4081	{
				4082	PyObject *self;
				4083	PyObject *str1;
				4084	PyObject *str2;
				4085	PyObject *result;
				4086
				4087	self = PyUnicode_FromObject(obj);
				4088	if (self == NULL)
				4089	return NULL;
				4090	str1 = PyUnicode_FromObject(subobj);
				4091	if (str1 == NULL) {
				4092	Py_DECREF(self);
				4093	return NULL;
				4094	}
				4095	str2 = PyUnicode_FromObject(replobj);
				4096	if (str2 == NULL) {
				4097	Py_DECREF(self);
				4098	Py_DECREF(str1);
				4099	return NULL;
				4100	}
				4101	result = replace((PyUnicodeObject *)self,
				4102	(PyUnicodeObject *)str1,
				4103	(PyUnicodeObject *)str2,
				4104	maxcount);
				4105	Py_DECREF(self);
				4106	Py_DECREF(str1);
				4107	Py_DECREF(str2);
				4108	return result;
				4109	}
				4110
				4111	static char replace__doc__[] =
				4112	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4113	\n\
				4114	Return a copy of S with all occurrences of substring\n\
				4115	old replaced by new. If the optional argument maxsplit is\n\
				4116	given, only the first maxsplit occurrences are replaced.";
				4117
				4118	static PyObject*
				4119	unicode_replace(PyUnicodeObject self, PyObject args)
				4120	{
				4121	PyUnicodeObject *str1;
				4122	PyUnicodeObject *str2;
				4123	int maxcount = -1;
				4124	PyObject *result;
				4125
				4126	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4127	return NULL;
				4128	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4129	if (str1 == NULL)
				4130	return NULL;
				4131	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4132	if (str2 == NULL)
				4133	return NULL;
				4134
				4135	result = replace(self, str1, str2, maxcount);
				4136
				4137	Py_DECREF(str1);
				4138	Py_DECREF(str2);
				4139	return result;
				4140	}
				4141
				4142	static
				4143	PyObject unicode_repr(PyObject unicode)
				4144	{
				4145	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4146	PyUnicode_GET_SIZE(unicode),
				4147	1);
				4148	}
				4149
				4150	static char rfind__doc__[] =
				4151	"S.rfind(sub [,start [,end]]) -> int\n\
				4152	\n\
				4153	Return the highest index in S where substring sub is found,\n\
				4154	such that sub is contained within s[start,end]. Optional\n\
				4155	arguments start and end are interpreted as in slice notation.\n\
				4156	\n\
				4157	Return -1 on failure.";
				4158
				4159	static PyObject *
				4160	unicode_rfind(PyUnicodeObject self, PyObject args)
				4161	{
				4162	PyUnicodeObject *substring;
				4163	int start = 0;
				4164	int end = INT_MAX;
				4165	PyObject *result;
				4166
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4167	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4168	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4169	return NULL;
				4170	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4171	(PyObject *)substring);
				4172	if (substring == NULL)
				4173	return NULL;
				4174
				4175	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4176
				4177	Py_DECREF(substring);
				4178	return result;
				4179	}
				4180
				4181	static char rindex__doc__[] =
				4182	"S.rindex(sub [,start [,end]]) -> int\n\
				4183	\n\
				4184	Like S.rfind() but raise ValueError when the substring is not found.";
				4185
				4186	static PyObject *
				4187	unicode_rindex(PyUnicodeObject self, PyObject args)
				4188	{
				4189	int result;
				4190	PyUnicodeObject *substring;
				4191	int start = 0;
				4192	int end = INT_MAX;
				4193
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4194	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4195	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4196	return NULL;
				4197	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4198	(PyObject *)substring);
				4199	if (substring == NULL)
				4200	return NULL;
				4201
				4202	result = findstring(self, substring, start, end, -1);
				4203
				4204	Py_DECREF(substring);
				4205	if (result < 0) {
				4206	PyErr_SetString(PyExc_ValueError, "substring not found");
				4207	return NULL;
				4208	}
				4209	return PyInt_FromLong(result);
				4210	}
				4211
				4212	static char rjust__doc__[] =
				4213	"S.rjust(width) -> unicode\n\
				4214	\n\
				4215	Return S right justified in a Unicode string of length width. Padding is\n\
				4216	done using spaces.";
				4217
				4218	static PyObject *
				4219	unicode_rjust(PyUnicodeObject self, PyObject args)
				4220	{
				4221	int width;
				4222	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4223	return NULL;
				4224
				4225	if (self->length >= width) {
				4226	Py_INCREF(self);
				4227	return (PyObject*) self;
				4228	}
				4229
				4230	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4231	}
				4232
				4233	static char rstrip__doc__[] =
				4234	"S.rstrip() -> unicode\n\
				4235	\n\
				4236	Return a copy of the string S with trailing whitespace removed.";
				4237
				4238	static PyObject *
				4239	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4240	{
				4241	if (!PyArg_NoArgs(args))
				4242	return NULL;
				4243	return strip(self, 0, 1);
				4244	}
				4245
				4246	static PyObject*
				4247	unicode_slice(PyUnicodeObject *self, int start, int end)
				4248	{
				4249	/* standard clamping */
				4250	if (start < 0)
				4251	start = 0;
				4252	if (end < 0)
				4253	end = 0;
				4254	if (end > self->length)
				4255	end = self->length;
				4256	if (start == 0 && end == self->length) {
				4257	/* full slice, return original string */
				4258	Py_INCREF(self);
				4259	return (PyObject*) self;
				4260	}
				4261	if (start > end)
				4262	start = end;
				4263	/* copy slice */
				4264	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4265	end - start);
				4266	}
				4267
				4268	PyObject PyUnicode_Split(PyObject s,
				4269	PyObject *sep,
				4270	int maxsplit)
				4271	{
				4272	PyObject *result;
				4273
				4274	s = PyUnicode_FromObject(s);
				4275	if (s == NULL)
				4276	return NULL;
				4277	if (sep != NULL) {
				4278	sep = PyUnicode_FromObject(sep);
				4279	if (sep == NULL) {
				4280	Py_DECREF(s);
				4281	return NULL;
				4282	}
				4283	}
				4284
				4285	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4286
				4287	Py_DECREF(s);
				4288	Py_XDECREF(sep);
				4289	return result;
				4290	}
				4291
				4292	static char split__doc__[] =
				4293	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4294	\n\
				4295	Return a list of the words in S, using sep as the\n\
				4296	delimiter string. If maxsplit is given, at most maxsplit\n\
				4297	splits are done. If sep is not specified, any whitespace string\n\
				4298	is a separator.";
				4299
				4300	static PyObject*
				4301	unicode_split(PyUnicodeObject self, PyObject args)
				4302	{
				4303	PyObject *substring = Py_None;
				4304	int maxcount = -1;
				4305
				4306	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4307	return NULL;
				4308
				4309	if (substring == Py_None)
				4310	return split(self, NULL, maxcount);
				4311	else if (PyUnicode_Check(substring))
				4312	return split(self, (PyUnicodeObject *)substring, maxcount);
				4313	else
				4314	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4315	}
				4316
				4317	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4318	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4319	\n\
				4320	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4321	Line breaks are not included in the resulting list unless keepends\n\
				4322	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4323
				4324	static PyObject*
				4325	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4326	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4327	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4328
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4329	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4330	return NULL;
				4331
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4332	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4333	}
				4334
				4335	static
				4336	PyObject unicode_str(PyUnicodeObject self)
				4337	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4338	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4339	}
				4340
				4341	static char strip__doc__[] =
				4342	"S.strip() -> unicode\n\
				4343	\n\
				4344	Return a copy of S with leading and trailing whitespace removed.";
				4345
				4346	static PyObject *
				4347	unicode_strip(PyUnicodeObject self, PyObject args)
				4348	{
				4349	if (!PyArg_NoArgs(args))
				4350	return NULL;
				4351	return strip(self, 1, 1);
				4352	}
				4353
				4354	static char swapcase__doc__[] =
				4355	"S.swapcase() -> unicode\n\
				4356	\n\
				4357	Return a copy of S with uppercase characters converted to lowercase\n\
				4358	and vice versa.";
				4359
				4360	static PyObject*
				4361	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4362	{
				4363	if (!PyArg_NoArgs(args))
				4364	return NULL;
				4365	return fixup(self, fixswapcase);
				4366	}
				4367
				4368	static char translate__doc__[] =
				4369	"S.translate(table) -> unicode\n\
				4370	\n\
				4371	Return a copy of the string S, where all characters have been mapped\n\
				4372	through the given translation table, which must be a mapping of\n\
				4373	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4374	are left untouched. Characters mapped to None are deleted.";
				4375
				4376	static PyObject*
				4377	unicode_translate(PyUnicodeObject self, PyObject args)
				4378	{
				4379	PyObject *table;
				4380
				4381	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4382	return NULL;
				4383	return PyUnicode_TranslateCharmap(self->str,
				4384	self->length,
				4385	table,
				4386	"ignore");
				4387	}
				4388
				4389	static char upper__doc__[] =
				4390	"S.upper() -> unicode\n\
				4391	\n\
				4392	Return a copy of S converted to uppercase.";
				4393
				4394	static PyObject*
				4395	unicode_upper(PyUnicodeObject self, PyObject args)
				4396	{
				4397	if (!PyArg_NoArgs(args))
				4398	return NULL;
				4399	return fixup(self, fixupper);
				4400	}
				4401
				4402	#if 0
				4403	static char zfill__doc__[] =
				4404	"S.zfill(width) -> unicode\n\
				4405	\n\
				4406	Pad a numeric string x with zeros on the left, to fill a field\n\
				4407	of the specified width. The string x is never truncated.";
				4408
				4409	static PyObject *
				4410	unicode_zfill(PyUnicodeObject self, PyObject args)
				4411	{
				4412	int fill;
				4413	PyUnicodeObject *u;
				4414
				4415	int width;
				4416	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4417	return NULL;
				4418
				4419	if (self->length >= width) {
				4420	Py_INCREF(self);
				4421	return (PyObject*) self;
				4422	}
				4423
				4424	fill = width - self->length;
				4425
				4426	u = pad(self, fill, 0, '0');
				4427
				4428	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4429	/* move sign to beginning of string */
				4430	u->str[0] = u->str[fill];
				4431	u->str[fill] = '0';
				4432	}
				4433
				4434	return (PyObject*) u;
				4435	}
				4436	#endif
				4437
				4438	#if 0
				4439	static PyObject*
				4440	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4441	{
				4442	if (!PyArg_NoArgs(args))
				4443	return NULL;
				4444	return PyInt_FromLong(unicode_freelist_size);
				4445	}
				4446	#endif
				4447
				4448	static char startswith__doc__[] =
				4449	"S.startswith(prefix[, start[, end]]) -> int\n\
				4450	\n\
				4451	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4452	optional start, test S beginning at that position. With optional end, stop\n\
				4453	comparing S at that position.";
				4454
				4455	static PyObject *
				4456	unicode_startswith(PyUnicodeObject *self,
				4457	PyObject *args)
				4458	{
				4459	PyUnicodeObject *substring;
				4460	int start = 0;
				4461	int end = INT_MAX;
				4462	PyObject *result;
				4463
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4464	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4465	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4466	return NULL;
				4467	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4468	(PyObject *)substring);
				4469	if (substring == NULL)
				4470	return NULL;
				4471
				4472	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4473
				4474	Py_DECREF(substring);
				4475	return result;
				4476	}
				4477
				4478
				4479	static char endswith__doc__[] =
				4480	"S.endswith(suffix[, start[, end]]) -> int\n\
				4481	\n\
				4482	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4483	optional start, test S beginning at that position. With optional end, stop\n\
				4484	comparing S at that position.";
				4485
				4486	static PyObject *
				4487	unicode_endswith(PyUnicodeObject *self,
				4488	PyObject *args)
				4489	{
				4490	PyUnicodeObject *substring;
				4491	int start = 0;
				4492	int end = INT_MAX;
				4493	PyObject *result;
				4494
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4495	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4496	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4497	return NULL;
				4498	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4499	(PyObject *)substring);
				4500	if (substring == NULL)
				4501	return NULL;
				4502
				4503	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4504
				4505	Py_DECREF(substring);
				4506	return result;
				4507	}
				4508
				4509
				4510	static PyMethodDef unicode_methods[] = {
				4511
				4512	/* Order is according to common usage: often used methods should
				4513	appear first, since lookup is done sequentially. */
				4514
				4515	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4516	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4517	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4518	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4519	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4520	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4521	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4522	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4523	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4524	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4525	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4526	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4527	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4528	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4529	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4530	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4531	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4532	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4533	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4534	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4535	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4536	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4537	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4538	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4539	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4540	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4541	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4542	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4543	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4544	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4545	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4546	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4547	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4548	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4549	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4550	#if 0
				4551	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4552	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4553	#endif
				4554
				4555	#if 0
				4556	/* This one is just used for debugging the implementation. */
				4557	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4558	#endif
				4559
				4560	{NULL, NULL}
				4561	};
				4562
				4563	static PyObject *
				4564	unicode_getattr(PyUnicodeObject self, char name)
				4565	{
				4566	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4567	}
				4568
				4569	static PySequenceMethods unicode_as_sequence = {
				4570	(inquiry) unicode_length, /* sq_length */
				4571	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4572	(intargfunc) unicode_repeat, /* sq_repeat */
				4573	(intargfunc) unicode_getitem, /* sq_item */
				4574	(intintargfunc) unicode_slice, /* sq_slice */
				4575	0, /* sq_ass_item */
				4576	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4577	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4578	};
				4579
				4580	static int
				4581	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4582	int index,
				4583	const void **ptr)
				4584	{
				4585	if (index != 0) {
				4586	PyErr_SetString(PyExc_SystemError,
				4587	"accessing non-existent unicode segment");
				4588	return -1;
				4589	}
				4590	ptr = (void ) self->str;
				4591	return PyUnicode_GET_DATA_SIZE(self);
				4592	}
				4593
				4594	static int
				4595	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4596	const void **ptr)
				4597	{
				4598	PyErr_SetString(PyExc_TypeError,
				4599	"cannot use unicode as modifyable buffer");
				4600	return -1;
				4601	}
				4602
				4603	static int
				4604	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4605	int *lenp)
				4606	{
				4607	if (lenp)
				4608	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4609	return 1;
				4610	}
				4611
				4612	static int
				4613	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4614	int index,
				4615	const void **ptr)
				4616	{
				4617	PyObject *str;
				4618
				4619	if (index != 0) {
				4620	PyErr_SetString(PyExc_SystemError,
				4621	"accessing non-existent unicode segment");
				4622	return -1;
				4623	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4624	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4625	if (str == NULL)
				4626	return -1;
				4627	ptr = (void ) PyString_AS_STRING(str);
				4628	return PyString_GET_SIZE(str);
				4629	}
				4630
				4631	/* Helpers for PyUnicode_Format() */
				4632
				4633	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4634	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4635	{
				4636	int argidx = *p_argidx;
				4637	if (argidx < arglen) {
				4638	(*p_argidx)++;
				4639	if (arglen < 0)
				4640	return args;
				4641	else
				4642	return PyTuple_GetItem(args, argidx);
				4643	}
				4644	PyErr_SetString(PyExc_TypeError,
				4645	"not enough arguments for format string");
				4646	return NULL;
				4647	}
				4648
				4649	#define F_LJUST (1<<0)
				4650	#define F_SIGN (1<<1)
				4651	#define F_BLANK (1<<2)
				4652	#define F_ALT (1<<3)
				4653	#define F_ZERO (1<<4)
				4654
				4655	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4656	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4657	{
				4658	register int i;
				4659	int len;
				4660	va_list va;
				4661	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4662	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4663
				4664	/* First, format the string as char array, then expand to Py_UNICODE
				4665	array. */
				4666	charbuffer = (char *)buffer;
				4667	len = vsprintf(charbuffer, format, va);
				4668	for (i = len - 1; i >= 0; i--)
				4669	buffer[i] = (Py_UNICODE) charbuffer[i];
				4670
				4671	va_end(va);
				4672	return len;
				4673	}
				4674
				4675	static int
				4676	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4677	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4678	int flags,
				4679	int prec,
				4680	int type,
				4681	PyObject *v)
				4682	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4683	/* fmt = '%#.' + `prec` + `type`
				4684	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4685	char fmt[20];
				4686	double x;
				4687
				4688	x = PyFloat_AsDouble(v);
				4689	if (x == -1.0 && PyErr_Occurred())
				4690	return -1;
				4691	if (prec < 0)
				4692	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4693	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4694	type = 'g';
				4695	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4696	/* worst case length calc to ensure no buffer overrun:
				4697	fmt = %#.<prec>g
				4698	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4699	for any double rep.)
				4700	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4701	If prec=0 the effective precision is 1 (the leading digit is
				4702	always given), therefore increase by one to 10+prec. */
				4703	if (buflen <= (size_t)10 + (size_t)prec) {
				4704	PyErr_SetString(PyExc_OverflowError,
				4705	"formatted float is too long (precision too long?)");
				4706	return -1;
				4707	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4708	return usprintf(buf, fmt, x);
				4709	}
				4710
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4711	static PyObject*
				4712	formatlong(PyObject *val, int flags, int prec, int type)
				4713	{
				4714	char *buf;
				4715	int i, len;
				4716	PyObject str; / temporary string object. */
				4717	PyUnicodeObject *result;
				4718
				4719	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4720	if (!str)
				4721	return NULL;
				4722	result = _PyUnicode_New(len);
				4723	for (i = 0; i < len; i++)
				4724	result->str[i] = buf[i];
				4725	result->str[len] = 0;
				4726	Py_DECREF(str);
				4727	return (PyObject*)result;
				4728	}
				4729
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4730	static int
				4731	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4732	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733	int flags,
				4734	int prec,
				4735	int type,
				4736	PyObject *v)
				4737	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4738	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4739	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4740	+ 1 + 1 = 24*/
				4741	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4742	long x;
				4743
				4744	x = PyInt_AsLong(v);
				4745	if (x == -1 && PyErr_Occurred())
				4746	return -1;
				4747	if (prec < 0)
				4748	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4749	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4750	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4751	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4752	PyErr_SetString(PyExc_OverflowError,
				4753	"formatted integer is too long (precision too long?)");
				4754	return -1;
				4755	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4756	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4757	return usprintf(buf, fmt, x);
				4758	}
				4759
				4760	static int
				4761	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4762	size_t buflen,
				4763	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4764	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4765	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4766	if (PyUnicode_Check(v)) {
				4767	if (PyUnicode_GET_SIZE(v) != 1)
				4768	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4769	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4770	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4771
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4772	else if (PyString_Check(v)) {
				4773	if (PyString_GET_SIZE(v) != 1)
				4774	goto onError;
				4775	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4776	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4777
				4778	else {
				4779	/* Integer input truncated to a character */
				4780	long x;
				4781	x = PyInt_AsLong(v);
				4782	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4783	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4784	buf[0] = (char) x;
				4785	}
				4786	buf[1] = '\0';
				4787	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4788
				4789	onError:
				4790	PyErr_SetString(PyExc_TypeError,
				4791	"%c requires int or char");
				4792	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4793	}
				4794
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4795	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4796
				4797	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4798	chars are formatted. XXX This is a magic number. Each formatting
				4799	routine does bounds checking to ensure no overflow, but a better
				4800	solution may be to malloc a buffer of appropriate size for each
				4801	format. For now, the current solution is sufficient.
				4802	*/
				4803	#define FORMATBUFLEN (size_t)120
				4804
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4805	PyObject PyUnicode_Format(PyObject format,
				4806	PyObject *args)
				4807	{
				4808	Py_UNICODE fmt, res;
				4809	int fmtcnt, rescnt, reslen, arglen, argidx;
				4810	int args_owned = 0;
				4811	PyUnicodeObject *result = NULL;
				4812	PyObject *dict = NULL;
				4813	PyObject *uformat;
				4814
				4815	if (format == NULL \|\| args == NULL) {
				4816	PyErr_BadInternalCall();
				4817	return NULL;
				4818	}
				4819	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4820	if (uformat == NULL)
				4821	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4822	fmt = PyUnicode_AS_UNICODE(uformat);
				4823	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4824
				4825	reslen = rescnt = fmtcnt + 100;
				4826	result = _PyUnicode_New(reslen);
				4827	if (result == NULL)
				4828	goto onError;
				4829	res = PyUnicode_AS_UNICODE(result);
				4830
				4831	if (PyTuple_Check(args)) {
				4832	arglen = PyTuple_Size(args);
				4833	argidx = 0;
				4834	}
				4835	else {
				4836	arglen = -1;
				4837	argidx = -2;
				4838	}
				4839	if (args->ob_type->tp_as_mapping)
				4840	dict = args;
				4841
				4842	while (--fmtcnt >= 0) {
				4843	if (*fmt != '%') {
				4844	if (--rescnt < 0) {
				4845	rescnt = fmtcnt + 100;
				4846	reslen += rescnt;
				4847	if (_PyUnicode_Resize(result, reslen) < 0)
				4848	return NULL;
				4849	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4850	--rescnt;
				4851	}
				4852	res++ = fmt++;
				4853	}
				4854	else {
				4855	/* Got a format specifier */
				4856	int flags = 0;
				4857	int width = -1;
				4858	int prec = -1;
				4859	int size = 0;
				4860	Py_UNICODE c = '\0';
				4861	Py_UNICODE fill;
				4862	PyObject *v = NULL;
				4863	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4864	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4865	Py_UNICODE sign;
				4866	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4867	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4868
				4869	fmt++;
				4870	if (*fmt == '(') {
				4871	Py_UNICODE *keystart;
				4872	int keylen;
				4873	PyObject *key;
				4874	int pcount = 1;
				4875
				4876	if (dict == NULL) {
				4877	PyErr_SetString(PyExc_TypeError,
				4878	"format requires a mapping");
				4879	goto onError;
				4880	}
				4881	++fmt;
				4882	--fmtcnt;
				4883	keystart = fmt;
				4884	/* Skip over balanced parentheses */
				4885	while (pcount > 0 && --fmtcnt >= 0) {
				4886	if (*fmt == ')')
				4887	--pcount;
				4888	else if (*fmt == '(')
				4889	++pcount;
				4890	fmt++;
				4891	}
				4892	keylen = fmt - keystart - 1;
				4893	if (fmtcnt < 0 \|\| pcount > 0) {
				4894	PyErr_SetString(PyExc_ValueError,
				4895	"incomplete format key");
				4896	goto onError;
				4897	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4898	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4899	then looked up since Python uses strings to hold
				4900	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4901	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4902	key = PyUnicode_EncodeUTF8(keystart,
				4903	keylen,
				4904	NULL);
				4905	if (key == NULL)
				4906	goto onError;
				4907	if (args_owned) {
				4908	Py_DECREF(args);
				4909	args_owned = 0;
				4910	}
				4911	args = PyObject_GetItem(dict, key);
				4912	Py_DECREF(key);
				4913	if (args == NULL) {
				4914	goto onError;
				4915	}
				4916	args_owned = 1;
				4917	arglen = -1;
				4918	argidx = -2;
				4919	}
				4920	while (--fmtcnt >= 0) {
				4921	switch (c = *fmt++) {
				4922	case '-': flags \|= F_LJUST; continue;
				4923	case '+': flags \|= F_SIGN; continue;
				4924	case ' ': flags \|= F_BLANK; continue;
				4925	case '#': flags \|= F_ALT; continue;
				4926	case '0': flags \|= F_ZERO; continue;
				4927	}
				4928	break;
				4929	}
				4930	if (c == '*') {
				4931	v = getnextarg(args, arglen, &argidx);
				4932	if (v == NULL)
				4933	goto onError;
				4934	if (!PyInt_Check(v)) {
				4935	PyErr_SetString(PyExc_TypeError,
				4936	"* wants int");
				4937	goto onError;
				4938	}
				4939	width = PyInt_AsLong(v);
				4940	if (width < 0) {
				4941	flags \|= F_LJUST;
				4942	width = -width;
				4943	}
				4944	if (--fmtcnt >= 0)
				4945	c = *fmt++;
				4946	}
				4947	else if (c >= '0' && c <= '9') {
				4948	width = c - '0';
				4949	while (--fmtcnt >= 0) {
				4950	c = *fmt++;
				4951	if (c < '0' \|\| c > '9')
				4952	break;
				4953	if ((width*10) / 10 != width) {
				4954	PyErr_SetString(PyExc_ValueError,
				4955	"width too big");
				4956	goto onError;
				4957	}
				4958	width = width*10 + (c - '0');
				4959	}
				4960	}
				4961	if (c == '.') {
				4962	prec = 0;
				4963	if (--fmtcnt >= 0)
				4964	c = *fmt++;
				4965	if (c == '*') {
				4966	v = getnextarg(args, arglen, &argidx);
				4967	if (v == NULL)
				4968	goto onError;
				4969	if (!PyInt_Check(v)) {
				4970	PyErr_SetString(PyExc_TypeError,
				4971	"* wants int");
				4972	goto onError;
				4973	}
				4974	prec = PyInt_AsLong(v);
				4975	if (prec < 0)
				4976	prec = 0;
				4977	if (--fmtcnt >= 0)
				4978	c = *fmt++;
				4979	}
				4980	else if (c >= '0' && c <= '9') {
				4981	prec = c - '0';
				4982	while (--fmtcnt >= 0) {
				4983	c = Py_CHARMASK(*fmt++);
				4984	if (c < '0' \|\| c > '9')
				4985	break;
				4986	if ((prec*10) / 10 != prec) {
				4987	PyErr_SetString(PyExc_ValueError,
				4988	"prec too big");
				4989	goto onError;
				4990	}
				4991	prec = prec*10 + (c - '0');
				4992	}
				4993	}
				4994	} /* prec */
				4995	if (fmtcnt >= 0) {
				4996	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4997	size = c;
				4998	if (--fmtcnt >= 0)
				4999	c = *fmt++;
				5000	}
				5001	}
				5002	if (fmtcnt < 0) {
				5003	PyErr_SetString(PyExc_ValueError,
				5004	"incomplete format");
				5005	goto onError;
				5006	}
				5007	if (c != '%') {
				5008	v = getnextarg(args, arglen, &argidx);
				5009	if (v == NULL)
				5010	goto onError;
				5011	}
				5012	sign = 0;
				5013	fill = ' ';
				5014	switch (c) {
				5015
				5016	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5017	pbuf = formatbuf;
				5018	/* presume that buffer length is at least 1 */
				5019	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5020	len = 1;
				5021	break;
				5022
				5023	case 's':
				5024	case 'r':
				5025	if (PyUnicode_Check(v) && c == 's') {
				5026	temp = v;
				5027	Py_INCREF(temp);
				5028	}
				5029	else {
				5030	PyObject *unicode;
				5031	if (c == 's')
				5032	temp = PyObject_Str(v);
				5033	else
				5034	temp = PyObject_Repr(v);
				5035	if (temp == NULL)
				5036	goto onError;
				5037	if (!PyString_Check(temp)) {
				5038	/* XXX Note: this should never happen, since
				5039	PyObject_Repr() and PyObject_Str() assure
				5040	this */
				5041	Py_DECREF(temp);
				5042	PyErr_SetString(PyExc_TypeError,
				5043	"%s argument has non-string str()");
				5044	goto onError;
				5045	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5046	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5047	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5048	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5049	"strict");
				5050	Py_DECREF(temp);
				5051	temp = unicode;
				5052	if (temp == NULL)
				5053	goto onError;
				5054	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5055	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5056	len = PyUnicode_GET_SIZE(temp);
				5057	if (prec >= 0 && len > prec)
				5058	len = prec;
				5059	break;
				5060
				5061	case 'i':
				5062	case 'd':
				5063	case 'u':
				5064	case 'o':
				5065	case 'x':
				5066	case 'X':
				5067	if (c == 'i')
				5068	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5069	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5070	temp = formatlong(v, flags, prec, c);
				5071	if (!temp)
				5072	goto onError;
				5073	pbuf = PyUnicode_AS_UNICODE(temp);
				5074	len = PyUnicode_GET_SIZE(temp);
				5075	/* unbounded ints can always produce
				5076	a sign character! */
				5077	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5078	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5079	else {
				5080	pbuf = formatbuf;
				5081	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5082	flags, prec, c, v);
				5083	if (len < 0)
				5084	goto onError;
				5085	/* only d conversion is signed */
				5086	sign = c == 'd';
				5087	}
				5088	if (flags & F_ZERO)
				5089	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5090	break;
				5091
				5092	case 'e':
				5093	case 'E':
				5094	case 'f':
				5095	case 'g':
				5096	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5097	pbuf = formatbuf;
				5098	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5099	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5100	if (len < 0)
				5101	goto onError;
				5102	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5103	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5104	fill = '0';
				5105	break;
				5106
				5107	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5108	pbuf = formatbuf;
				5109	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5110	if (len < 0)
				5111	goto onError;
				5112	break;
				5113
				5114	default:
				5115	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5116	"unsupported format character '%c' (0x%x) "
				5117	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5118	(31<=c && c<=126) ? c : '?',
				5119	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5120	goto onError;
				5121	}
				5122	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5123	if (pbuf == '-' \|\| pbuf == '+') {
				5124	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5125	len--;
				5126	}
				5127	else if (flags & F_SIGN)
				5128	sign = '+';
				5129	else if (flags & F_BLANK)
				5130	sign = ' ';
				5131	else
				5132	sign = 0;
				5133	}
				5134	if (width < len)
				5135	width = len;
				5136	if (rescnt < width + (sign != 0)) {
				5137	reslen -= rescnt;
				5138	rescnt = width + fmtcnt + 100;
				5139	reslen += rescnt;
				5140	if (_PyUnicode_Resize(result, reslen) < 0)
				5141	return NULL;
				5142	res = PyUnicode_AS_UNICODE(result)
				5143	+ reslen - rescnt;
				5144	}
				5145	if (sign) {
				5146	if (fill != ' ')
				5147	*res++ = sign;
				5148	rescnt--;
				5149	if (width > len)
				5150	width--;
				5151	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5152	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5153	assert(pbuf[0] == '0');
				5154	assert(pbuf[1] == c);
				5155	if (fill != ' ') {
				5156	res++ = pbuf++;
				5157	res++ = pbuf++;
				5158	}
				5159	rescnt -= 2;
				5160	width -= 2;
				5161	if (width < 0)
				5162	width = 0;
				5163	len -= 2;
				5164	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5165	if (width > len && !(flags & F_LJUST)) {
				5166	do {
				5167	--rescnt;
				5168	*res++ = fill;
				5169	} while (--width > len);
				5170	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5171	if (fill == ' ') {
				5172	if (sign)
				5173	*res++ = sign;
				5174	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5175	assert(pbuf[0] == '0');
				5176	assert(pbuf[1] == c);
				5177	res++ = pbuf++;
				5178	res++ = pbuf++;
				5179	}
				5180	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5181	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5182	res += len;
				5183	rescnt -= len;
				5184	while (--width >= len) {
				5185	--rescnt;
				5186	*res++ = ' ';
				5187	}
				5188	if (dict && (argidx < arglen) && c != '%') {
				5189	PyErr_SetString(PyExc_TypeError,
				5190	"not all arguments converted");
				5191	goto onError;
				5192	}
				5193	Py_XDECREF(temp);
				5194	} /* '%' */
				5195	} /* until end */
				5196	if (argidx < arglen && !dict) {
				5197	PyErr_SetString(PyExc_TypeError,
				5198	"not all arguments converted");
				5199	goto onError;
				5200	}
				5201
				5202	if (args_owned) {
				5203	Py_DECREF(args);
				5204	}
				5205	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5206	if (_PyUnicode_Resize(result, reslen - rescnt))
				5207	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5208	return (PyObject *)result;
				5209
				5210	onError:
				5211	Py_XDECREF(result);
				5212	Py_DECREF(uformat);
				5213	if (args_owned) {
				5214	Py_DECREF(args);
				5215	}
				5216	return NULL;
				5217	}
				5218
				5219	static PyBufferProcs unicode_as_buffer = {
				5220	(getreadbufferproc) unicode_buffer_getreadbuf,
				5221	(getwritebufferproc) unicode_buffer_getwritebuf,
				5222	(getsegcountproc) unicode_buffer_getsegcount,
				5223	(getcharbufferproc) unicode_buffer_getcharbuf,
				5224	};
				5225
				5226	PyTypeObject PyUnicode_Type = {
				5227	PyObject_HEAD_INIT(&PyType_Type)
				5228	0, /* ob_size */
				5229	"unicode", /* tp_name */
				5230	sizeof(PyUnicodeObject), /* tp_size */
				5231	0, /* tp_itemsize */
				5232	/* Slots */
				5233	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5234	0, /* tp_print */
				5235	(getattrfunc)unicode_getattr, /* tp_getattr */
				5236	0, /* tp_setattr */
				5237	(cmpfunc) unicode_compare, /* tp_compare */
				5238	(reprfunc) unicode_repr, /* tp_repr */
				5239	0, /* tp_as_number */
				5240	&unicode_as_sequence, /* tp_as_sequence */
				5241	0, /* tp_as_mapping */
				5242	(hashfunc) unicode_hash, /* tp_hash*/
				5243	0, /* tp_call*/
				5244	(reprfunc) unicode_str, /* tp_str */
				5245	(getattrofunc) NULL, /* tp_getattro */
				5246	(setattrofunc) NULL, /* tp_setattro */
				5247	&unicode_as_buffer, /* tp_as_buffer */
				5248	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5249	};
				5250
				5251	/* Initialize the Unicode implementation */
				5252
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5253	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5254	{
				5255	/* Doublecheck the configuration... */
				5256	if (sizeof(Py_UNICODE) != 2)
				5257	Py_FatalError("Unicode configuration error: "
				5258	"sizeof(Py_UNICODE) != 2 bytes");
				5259
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5260	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5261	unicode_freelist = NULL;
				5262	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5263	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5264	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5265	}
				5266
				5267	/* Finalize the Unicode implementation */
				5268
				5269	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5270	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5271	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5272	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5273
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5274	Py_XDECREF(unicode_empty);
				5275	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5276
				5277	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5278	PyUnicodeObject *v = u;
				5279	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5280	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5281	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5282	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5283	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5284	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5285	unicode_freelist = NULL;
				5286	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5287	}