Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: b9e457d6a7beb553e60d36ccaef75f0b01b15876 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	67	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	68	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	69
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	70	#ifdef MS_WIN32
				71	#include <windows.h>
				72	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	73
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	74	/* Limit for the Unicode object free list */
				75
				76	#define MAX_UNICODE_FREELIST_SIZE 1024
				77
				78	/* Limit for the Unicode object free list stay alive optimization.
				79
				80	The implementation will keep allocated Unicode memory intact for
				81	all objects on the free list having a size less than this
				82	limit. This reduces malloc() overhead for small Unicode objects.
				83
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	84	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	85	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	malloc()-overhead) bytes of unused garbage.
				87
				88	Setting the limit to 0 effectively turns the feature off.
				89
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	90	Note: This is an experimental feature ! If you get core dumps when
				91	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92
				93	*/
				94
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	95	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	96
				97	/* Endianness switches; defaults to little endian */
				98
				99	#ifdef WORDS_BIGENDIAN
				100	# define BYTEORDER_IS_BIG_ENDIAN
				101	#else
				102	# define BYTEORDER_IS_LITTLE_ENDIAN
				103	#endif
				104
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	105	/* --- Globals ------------------------------------------------------------
				106
				107	The globals are initialized by the _PyUnicode_Init() API and should
				108	not be used before calling that API.
				109
				110	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	111
				112	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	113	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	114
				115	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	116	static PyUnicodeObject *unicode_freelist;
				117	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	119	/* Default encoding to use and assume when NULL is passed as encoding
				120	parameter; it is initialized by _PyUnicode_Init().
				121
				122	Always use the PyUnicode_SetDefaultEncoding() and
				123	PyUnicode_GetDefaultEncoding() APIs to access this global.
				124
				125	*/
				126
				127	static char unicode_default_encoding[100];
				128
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	/* --- Unicode Object ----------------------------------------------------- */
				130
				131	static
				132	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				133	int length)
				134	{
				135	void *oldstr;
				136
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	137	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	138	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	139	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	140
				141	/* Resizing unicode_empty is not allowed. */
				142	if (unicode == unicode_empty) {
				143	PyErr_SetString(PyExc_SystemError,
				144	"can't resize empty unicode object");
				145	return -1;
				146	}
				147
				148	/* We allocate one more byte to make sure the string is
				149	Ux0000 terminated -- XXX is this needed ? */
				150	oldstr = unicode->str;
				151	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				152	if (!unicode->str) {
				153	unicode->str = oldstr;
				154	PyErr_NoMemory();
				155	return -1;
				156	}
				157	unicode->str[length] = 0;
				158	unicode->length = length;
				159
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	160	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	161	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	162	if (unicode->defenc) {
				163	Py_DECREF(unicode->defenc);
				164	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	165	}
				166	unicode->hash = -1;
				167
				168	return 0;
				169	}
				170
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	171	int PyUnicode_Resize(PyObject **unicode,
				172	int length)
				173	{
				174	PyUnicodeObject *v;
				175
				176	if (unicode == NULL) {
				177	PyErr_BadInternalCall();
				178	return -1;
				179	}
				180	v = (PyUnicodeObject )unicode;
				181	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				182	PyErr_BadInternalCall();
				183	return -1;
				184	}
				185	return _PyUnicode_Resize(v, length);
				186	}
				187
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	/* We allocate one more byte to make sure the string is
				189	Ux0000 terminated -- XXX is this needed ?
				190
				191	XXX This allocator could further be enhanced by assuring that the
				192	free list never reduces its size below 1.
				193
				194	*/
				195
				196	static
				197	PyUnicodeObject *_PyUnicode_New(int length)
				198	{
				199	register PyUnicodeObject *unicode;
				200
				201	/* Optimization for empty strings */
				202	if (length == 0 && unicode_empty != NULL) {
				203	Py_INCREF(unicode_empty);
				204	return unicode_empty;
				205	}
				206
				207	/* Unicode freelist & memory allocation */
				208	if (unicode_freelist) {
				209	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	210	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	212	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	213	/* Keep-Alive optimization: we only upsize the buffer,
				214	never downsize it. */
				215	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	216	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	217	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	218	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	}
				220	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	221	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	223	}
				224	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	else {
				227	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				228	if (unicode == NULL)
				229	return NULL;
				230	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				231	}
				232
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	233	if (!unicode->str) {
				234	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	235	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	236	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str[length] = 0;
				238	unicode->length = length;
				239	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	240	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242
				243	onError:
				244	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	246	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	}
				248
				249	static
				250	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				251	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	253	/* Keep-Alive optimization */
				254	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	255	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	256	unicode->str = NULL;
				257	unicode->length = 0;
				258	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	259	if (unicode->defenc) {
				260	Py_DECREF(unicode->defenc);
				261	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	262	}
				263	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	264	(PyUnicodeObject *)unicode = unicode_freelist;
				265	unicode_freelist = unicode;
				266	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	267	}
				268	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	269	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	270	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	271	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	}
				274
				275	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				276	int size)
				277	{
				278	PyUnicodeObject *unicode;
				279
				280	unicode = _PyUnicode_New(size);
				281	if (!unicode)
				282	return NULL;
				283
				284	/* Copy the Unicode data into the new object */
				285	if (u != NULL)
				286	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				287
				288	return (PyObject *)unicode;
				289	}
				290
				291	#ifdef HAVE_WCHAR_H
				292
				293	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				294	int size)
				295	{
				296	PyUnicodeObject *unicode;
				297
				298	if (w == NULL) {
				299	PyErr_BadInternalCall();
				300	return NULL;
				301	}
				302
				303	unicode = _PyUnicode_New(size);
				304	if (!unicode)
				305	return NULL;
				306
				307	/* Copy the wchar_t data into the new object */
				308	#ifdef HAVE_USABLE_WCHAR_T
				309	memcpy(unicode->str, w, size * sizeof(wchar_t));
				310	#else
				311	{
				312	register Py_UNICODE *u;
				313	register int i;
				314	u = PyUnicode_AS_UNICODE(unicode);
				315	for (i = size; i >= 0; i--)
				316	u++ = w++;
				317	}
				318	#endif
				319
				320	return (PyObject *)unicode;
				321	}
				322
				323	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				324	register wchar_t *w,
				325	int size)
				326	{
				327	if (unicode == NULL) {
				328	PyErr_BadInternalCall();
				329	return -1;
				330	}
				331	if (size > PyUnicode_GET_SIZE(unicode))
				332	size = PyUnicode_GET_SIZE(unicode);
				333	#ifdef HAVE_USABLE_WCHAR_T
				334	memcpy(w, unicode->str, size * sizeof(wchar_t));
				335	#else
				336	{
				337	register Py_UNICODE *u;
				338	register int i;
				339	u = PyUnicode_AS_UNICODE(unicode);
				340	for (i = size; i >= 0; i--)
				341	w++ = u++;
				342	}
				343	#endif
				344
				345	return size;
				346	}
				347
				348	#endif
				349
				350	PyObject PyUnicode_FromObject(register PyObject obj)
				351	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	352	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				353	}
				354
				355	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				356	const char *encoding,
				357	const char *errors)
				358	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	359	const char *s;
				360	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	361	int owned = 0;
				362	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	363
				364	if (obj == NULL) {
				365	PyErr_BadInternalCall();
				366	return NULL;
				367	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368
				369	/* Coerce object */
				370	if (PyInstance_Check(obj)) {
				371	PyObject *func;
				372	func = PyObject_GetAttrString(obj, "__str__");
				373	if (func == NULL) {
				374	PyErr_SetString(PyExc_TypeError,
				375	"coercing to Unicode: instance doesn't define __str__");
				376	return NULL;
				377	}
				378	obj = PyEval_CallObject(func, NULL);
				379	Py_DECREF(func);
				380	if (obj == NULL)
				381	return NULL;
				382	owned = 1;
				383	}
				384	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	385	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	386	v = obj;
				387	if (encoding) {
				388	PyErr_SetString(PyExc_TypeError,
				389	"decoding Unicode is not supported");
				390	return NULL;
				391	}
				392	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	393	}
				394	else if (PyString_Check(obj)) {
				395	s = PyString_AS_STRING(obj);
				396	len = PyString_GET_SIZE(obj);
				397	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	398	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				399	/* Overwrite the error message with something more useful in
				400	case of a TypeError. */
				401	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	402	PyErr_Format(PyExc_TypeError,
				403	"coercing to Unicode: need string or buffer, "
				404	"%.80s found",
				405	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	406	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	407	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	408
				409	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	410	if (len == 0) {
				411	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	412	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	413	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	414	else
				415	v = PyUnicode_Decode(s, len, encoding, errors);
				416	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	417	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	418	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	419	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	420	return v;
				421
				422	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	423	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	424	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	}
				428
				429	PyObject PyUnicode_Decode(const char s,
				430	int size,
				431	const char *encoding,
				432	const char *errors)
				433	{
				434	PyObject buffer = NULL, unicode;
				435
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	436	if (encoding == NULL)
				437	encoding = PyUnicode_GetDefaultEncoding();
				438
				439	/* Shortcuts for common default encodings */
				440	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	441	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	442	else if (strcmp(encoding, "latin-1") == 0)
				443	return PyUnicode_DecodeLatin1(s, size, errors);
				444	else if (strcmp(encoding, "ascii") == 0)
				445	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	446
				447	/* Decode via the codec registry */
				448	buffer = PyBuffer_FromMemory((void *)s, size);
				449	if (buffer == NULL)
				450	goto onError;
				451	unicode = PyCodec_Decode(buffer, encoding, errors);
				452	if (unicode == NULL)
				453	goto onError;
				454	if (!PyUnicode_Check(unicode)) {
				455	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	456	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	457	unicode->ob_type->tp_name);
				458	Py_DECREF(unicode);
				459	goto onError;
				460	}
				461	Py_DECREF(buffer);
				462	return unicode;
				463
				464	onError:
				465	Py_XDECREF(buffer);
				466	return NULL;
				467	}
				468
				469	PyObject PyUnicode_Encode(const Py_UNICODE s,
				470	int size,
				471	const char *encoding,
				472	const char *errors)
				473	{
				474	PyObject v, unicode;
				475
				476	unicode = PyUnicode_FromUnicode(s, size);
				477	if (unicode == NULL)
				478	return NULL;
				479	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				480	Py_DECREF(unicode);
				481	return v;
				482	}
				483
				484	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				485	const char *encoding,
				486	const char *errors)
				487	{
				488	PyObject *v;
				489
				490	if (!PyUnicode_Check(unicode)) {
				491	PyErr_BadArgument();
				492	goto onError;
				493	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	494
				495	if (encoding == NULL)
				496	encoding = PyUnicode_GetDefaultEncoding();
				497
				498	/* Shortcuts for common default encodings */
				499	if (errors == NULL) {
				500	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	501	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	502	else if (strcmp(encoding, "latin-1") == 0)
				503	return PyUnicode_AsLatin1String(unicode);
				504	else if (strcmp(encoding, "ascii") == 0)
				505	return PyUnicode_AsASCIIString(unicode);
				506	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	507
				508	/* Encode via the codec registry */
				509	v = PyCodec_Encode(unicode, encoding, errors);
				510	if (v == NULL)
				511	goto onError;
				512	/* XXX Should we really enforce this ? */
				513	if (!PyString_Check(v)) {
				514	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	515	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	516	v->ob_type->tp_name);
				517	Py_DECREF(v);
				518	goto onError;
				519	}
				520	return v;
				521
				522	onError:
				523	return NULL;
				524	}
				525
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	526	/* Return a Python string holding the default encoded value of the
				527	Unicode object.
				528
				529	The resulting string is cached in the Unicode object for subsequent
				530	usage by this function. The cached version is needed to implement
				531	the character buffer interface and will live (at least) as long as
				532	the Unicode object itself.
				533
				534	The refcount of the string is not incremented.
				535
				536	* Exported for internal use by the interpreter only !!! *
				537
				538	*/
				539
				540	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				541	const char *errors)
				542	{
				543	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				544
				545	if (v)
				546	return v;
				547	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				548	if (v && errors == NULL)
				549	((PyUnicodeObject *)unicode)->defenc = v;
				550	return v;
				551	}
				552
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	553	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				554	{
				555	if (!PyUnicode_Check(unicode)) {
				556	PyErr_BadArgument();
				557	goto onError;
				558	}
				559	return PyUnicode_AS_UNICODE(unicode);
				560
				561	onError:
				562	return NULL;
				563	}
				564
				565	int PyUnicode_GetSize(PyObject *unicode)
				566	{
				567	if (!PyUnicode_Check(unicode)) {
				568	PyErr_BadArgument();
				569	goto onError;
				570	}
				571	return PyUnicode_GET_SIZE(unicode);
				572
				573	onError:
				574	return -1;
				575	}
				576
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	577	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	578	{
				579	return unicode_default_encoding;
				580	}
				581
				582	int PyUnicode_SetDefaultEncoding(const char *encoding)
				583	{
				584	PyObject *v;
				585
				586	/* Make sure the encoding is valid. As side effect, this also
				587	loads the encoding into the codec registry cache. */
				588	v = _PyCodec_Lookup(encoding);
				589	if (v == NULL)
				590	goto onError;
				591	Py_DECREF(v);
				592	strncpy(unicode_default_encoding,
				593	encoding,
				594	sizeof(unicode_default_encoding));
				595	return 0;
				596
				597	onError:
				598	return -1;
				599	}
				600
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	601	/* --- UTF-8 Codec -------------------------------------------------------- */
				602
				603	static
				604	char utf8_code_length[256] = {
				605	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				606	illegal prefix. see RFC 2279 for details */
				607	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				608	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				609	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				610	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				611	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				612	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				613	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				614	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				615	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				616	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				617	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				618	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				619	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				620	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				621	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				622	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				623	};
				624
				625	static
				626	int utf8_decoding_error(const char **source,
				627	Py_UNICODE **dest,
				628	const char *errors,
				629	const char *details)
				630	{
				631	if ((errors == NULL) \|\|
				632	(strcmp(errors,"strict") == 0)) {
				633	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	634	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	635	details);
				636	return -1;
				637	}
				638	else if (strcmp(errors,"ignore") == 0) {
				639	(*source)++;
				640	return 0;
				641	}
				642	else if (strcmp(errors,"replace") == 0) {
				643	(*source)++;
				644	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				645	(*dest)++;
				646	return 0;
				647	}
				648	else {
				649	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	650	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	651	errors);
				652	return -1;
				653	}
				654	}
				655
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	656	PyObject PyUnicode_DecodeUTF8(const char s,
				657	int size,
				658	const char *errors)
				659	{
				660	int n;
				661	const char *e;
				662	PyUnicodeObject *unicode;
				663	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	664	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	665
				666	/* Note: size will always be longer than the resulting Unicode
				667	character count */
				668	unicode = _PyUnicode_New(size);
				669	if (!unicode)
				670	return NULL;
				671	if (size == 0)
				672	return (PyObject *)unicode;
				673
				674	/* Unpack UTF-8 encoded data */
				675	p = unicode->str;
				676	e = s + size;
				677
				678	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	679	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	680
				681	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	682	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	683	s++;
				684	continue;
				685	}
				686
				687	n = utf8_code_length[ch];
				688
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	689	if (s + n > e) {
				690	errmsg = "unexpected end of data";
				691	goto utf8Error;
				692	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693
				694	switch (n) {
				695
				696	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	errmsg = "unexpected code byte";
				698	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	break;
				700
				701	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	errmsg = "internal error";
				703	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	704	break;
				705
				706	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	if ((s[1] & 0xc0) != 0x80) {
				708	errmsg = "invalid data";
				709	goto utf8Error;
				710	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	711	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	712	if (ch < 0x80) {
				713	errmsg = "illegal encoding";
				714	goto utf8Error;
				715	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	716	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	717	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	718	break;
				719
				720	case 3:
				721	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	722	(s[2] & 0xc0) != 0x80) {
				723	errmsg = "invalid data";
				724	goto utf8Error;
				725	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	727	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				728	errmsg = "illegal encoding";
				729	goto utf8Error;
				730	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	731	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	732	*p++ = (Py_UNICODE)ch;
				733	break;
				734
				735	case 4:
				736	if ((s[1] & 0xc0) != 0x80 \|\|
				737	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	738	(s[3] & 0xc0) != 0x80) {
				739	errmsg = "invalid data";
				740	goto utf8Error;
				741	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	742	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				743	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				744	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	745	if ((ch < 0x10000) \|\| /* minimum value allowed for 4
				746	byte encoding */
				747	(ch > 0x10ffff)) { /* maximum value allowed for
				748	UTF-16 */
				749	errmsg = "illegal encoding";
				750	goto utf8Error;
				751	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	752	/* compute and append the two surrogates: */
				753
				754	/* translate from 10000..10FFFF to 0..FFFF */
				755	ch -= 0x10000;
				756
				757	/* high surrogate = top 10 bits added to D800 */
				758	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				759
				760	/* low surrogate = bottom 10 bits added to DC00 */
				761	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	762	break;
				763
				764	default:
				765	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	766	errmsg = "unsupported Unicode code range";
				767	goto utf8Error;
				768	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	769	}
				770	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	771	continue;
				772
				773	utf8Error:
				774	if (utf8_decoding_error(&s, &p, errors, errmsg))
				775	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	776	}
				777
				778	/* Adjust length */
				779	if (_PyUnicode_Resize(unicode, p - unicode->str))
				780	goto onError;
				781
				782	return (PyObject *)unicode;
				783
				784	onError:
				785	Py_DECREF(unicode);
				786	return NULL;
				787	}
				788
/* Disabled: the UTF-8 encoder now handles UTF-16 surrogates itself and
   no longer reports encoding errors through this helper. */
#if 0
/* Apply the error policy named by `errors` to a UTF-8 encoding problem.

   NULL or "strict": set UnicodeError (message includes `details`),
                     return -1.
   "ignore":         return 0 without writing anything.
   "replace":        store '?' at **dest, advance *dest, return 0.
   anything else:    set ValueError, return -1.

   `source` is accepted but not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        char *out = *dest;

        *out = '?';
        *dest = out + 1;
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	822
				823	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				824	int size,
				825	const char *errors)
				826	{
				827	PyObject *v;
				828	char *p;
				829	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	830	Py_UCS4 ch2;
				831	unsigned int cbAllocated = 3 * size;
				832	unsigned int cbWritten = 0;
				833	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	834
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	835	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	836	if (v == NULL)
				837	return NULL;
				838	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	839	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840
				841	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	842	while (i < size) {
				843	Py_UCS4 ch = s[i++];
				844	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	845	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	846	cbWritten++;
				847	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	848	else if (ch < 0x0800) {
				849	*p++ = 0xc0 \| (ch >> 6);
				850	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	851	cbWritten += 2;
				852	}
				853	else {
				854	/* Check for high surrogate */
				855	if (0xD800 <= ch && ch <= 0xDBFF) {
				856	if (i != size) {
				857	ch2 = s[i];
				858	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				859
				860	if (cbWritten >= (cbAllocated - 4)) {
				861	/* Provide enough room for some more
				862	surrogates */
				863	cbAllocated += 4*10;
				864	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	865	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	866	}
				867
				868	/* combine the two values */
				869	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				870
				871	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	872	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	873	i++;
				874	cbWritten += 4;
				875	}
				876	}
				877	}
				878	else {
				879	*p++ = (char)(0xe0 \| (ch >> 12));
				880	cbWritten += 3;
				881	}
				882	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				883	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	884	}
				885	}
				886	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	887	if (_PyString_Resize(&v, p - q))
				888	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	889	return v;
				890
				891	onError:
				892	Py_DECREF(v);
				893	return NULL;
				894	}
				895
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	896	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				897	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	898	if (!PyUnicode_Check(unicode)) {
				899	PyErr_BadArgument();
				900	return NULL;
				901	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	902	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				903	PyUnicode_GET_SIZE(unicode),
				904	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	905	}
				906
				907	/* --- UTF-16 Codec ------------------------------------------------------- */
				908
				909	static
				910	int utf16_decoding_error(const Py_UNICODE **source,
				911	Py_UNICODE **dest,
				912	const char *errors,
				913	const char *details)
				914	{
				915	if ((errors == NULL) \|\|
				916	(strcmp(errors,"strict") == 0)) {
				917	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	918	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	919	details);
				920	return -1;
				921	}
				922	else if (strcmp(errors,"ignore") == 0) {
				923	return 0;
				924	}
				925	else if (strcmp(errors,"replace") == 0) {
				926	if (dest) {
				927	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				928	(*dest)++;
				929	}
				930	return 0;
				931	}
				932	else {
				933	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	934	"UTF-16 decoding error; "
				935	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	errors);
				937	return -1;
				938	}
				939	}
				940
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	941	PyObject PyUnicode_DecodeUTF16(const char s,
				942	int size,
				943	const char *errors,
				944	int *byteorder)
				945	{
				946	PyUnicodeObject *unicode;
				947	Py_UNICODE *p;
				948	const Py_UNICODE q, e;
				949	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	950	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951
				952	/* size should be an even number */
				953	if (size % sizeof(Py_UNICODE) != 0) {
				954	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				955	return NULL;
				956	/* The remaining input chars are ignored if we fall through
				957	here... */
				958	}
				959
				960	/* Note: size will always be longer than the resulting Unicode
				961	character count */
				962	unicode = _PyUnicode_New(size);
				963	if (!unicode)
				964	return NULL;
				965	if (size == 0)
				966	return (PyObject *)unicode;
				967
				968	/* Unpack UTF-16 encoded data */
				969	p = unicode->str;
				970	q = (Py_UNICODE *)s;
				971	e = q + (size / sizeof(Py_UNICODE));
				972
				973	if (byteorder)
				974	bo = *byteorder;
				975
				976	while (q < e) {
				977	register Py_UNICODE ch = *q++;
				978
				979	/* Check for BOM marks (U+FEFF) in the input and adjust
				980	current byte order setting accordingly. Swap input
				981	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				982	!) */
				983	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				984	if (ch == 0xFEFF) {
				985	bo = -1;
				986	continue;
				987	} else if (ch == 0xFFFE) {
				988	bo = 1;
				989	continue;
				990	}
				991	if (bo == 1)
				992	ch = (ch >> 8) \| (ch << 8);
				993	#else
				994	if (ch == 0xFEFF) {
				995	bo = 1;
				996	continue;
				997	} else if (ch == 0xFFFE) {
				998	bo = -1;
				999	continue;
				1000	}
				1001	if (bo == -1)
				1002	ch = (ch >> 8) \| (ch << 8);
				1003	#endif
				1004	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1005	*p++ = ch;
				1006	continue;
				1007	}
				1008
				1009	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1010	if (q >= e) {
				1011	errmsg = "unexpected end of data";
				1012	goto utf16Error;
				1013	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1014	if (0xDC00 <= q && q <= 0xDFFF) {
				1015	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1016	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1017	/* This is valid data (a UTF-16 surrogate pair), but
				1018	we are not able to store this information since our
				1019	Py_UNICODE type only has 16 bits... this might
				1020	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1021	errmsg = "code pairs are not supported";
				1022	goto utf16Error;
				1023	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1024	else
				1025	continue;
				1026	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1027	errmsg = "illegal encoding";
				1028	/* Fall through to report the error */
				1029
				1030	utf16Error:
				1031	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1032	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1033	}
				1034
				1035	if (byteorder)
				1036	*byteorder = bo;
				1037
				1038	/* Adjust length */
				1039	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1040	goto onError;
				1041
				1042	return (PyObject *)unicode;
				1043
				1044	onError:
				1045	Py_DECREF(unicode);
				1046	return NULL;
				1047	}
				1048
				1049	#undef UTF16_ERROR
				1050
				1051	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1052	int size,
				1053	const char *errors,
				1054	int byteorder)
				1055	{
				1056	PyObject *v;
				1057	Py_UNICODE *p;
				1058	char *q;
				1059
				1060	/* We don't create UTF-16 pairs... */
				1061	v = PyString_FromStringAndSize(NULL,
				1062	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1063	if (v == NULL)
				1064	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1065
				1066	q = PyString_AS_STRING(v);
				1067	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1068	if (byteorder == 0)
				1069	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1070	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1071	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1072	if (byteorder == 0 \|\|
				1073	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1074	byteorder == -1
				1075	#else
				1076	byteorder == 1
				1077	#endif
				1078	)
				1079	memcpy(p, s, size * sizeof(Py_UNICODE));
				1080	else
				1081	while (size-- > 0) {
				1082	Py_UNICODE ch = *s++;
				1083	*p++ = (ch >> 8) \| (ch << 8);
				1084	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1085	return v;
				1086	}
				1087
				1088	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1089	{
				1090	if (!PyUnicode_Check(unicode)) {
				1091	PyErr_BadArgument();
				1092	return NULL;
				1093	}
				1094	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1095	PyUnicode_GET_SIZE(unicode),
				1096	NULL,
				1097	0);
				1098	}
				1099
				1100	/* --- Unicode Escape Codec ----------------------------------------------- */
				1101
				1102	static
				1103	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1104	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1105	const char *errors,
				1106	const char *details)
				1107	{
				1108	if ((errors == NULL) \|\|
				1109	(strcmp(errors,"strict") == 0)) {
				1110	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1111	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1112	details);
				1113	return -1;
				1114	}
				1115	else if (strcmp(errors,"ignore") == 0) {
				1116	return 0;
				1117	}
				1118	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1119	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1120	return 0;
				1121	}
				1122	else {
				1123	PyErr_Format(PyExc_ValueError,
				1124	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1125	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1126	errors);
				1127	return -1;
				1128	}
				1129	}
				1130
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1131	static _Py_UCNHashAPI *pucnHash = NULL;
				1132
				1133	static
				1134	int mystrnicmp(const char s1, const char s2, size_t count)
				1135	{
				1136	char c1, c2;
				1137
				1138	if (count)
				1139	{
				1140	do
				1141	{
				1142	c1 = tolower(*(s1++));
				1143	c2 = tolower(*(s2++));
				1144	}
				1145	while(--count && c1 == c2);
				1146
				1147	return c1 - c2;
				1148	}
				1149
				1150	return 0;
				1151	}
				1152
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1153	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1154	int size,
				1155	const char *errors)
				1156	{
				1157	PyUnicodeObject *v;
				1158	Py_UNICODE p = NULL, buf = NULL;
				1159	const char *end;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1160	Py_UCS4 chr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161
				1162	/* Escaped strings will always be longer than the resulting
				1163	Unicode string, so we start with size here and then reduce the
				1164	length after conversion to the true value. */
				1165	v = _PyUnicode_New(size);
				1166	if (v == NULL)
				1167	goto onError;
				1168	if (size == 0)
				1169	return (PyObject *)v;
				1170	p = buf = PyUnicode_AS_UNICODE(v);
				1171	end = s + size;
				1172	while (s < end) {
				1173	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1174	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	int i;
				1176
				1177	/* Non-escape characters are interpreted as Unicode ordinals */
				1178	if (*s != '\\') {
				1179	p++ = (unsigned char)s++;
				1180	continue;
				1181	}
				1182
				1183	/* \ - Escapes */
				1184	s++;
				1185	switch (*s++) {
				1186
				1187	/* \x escapes */
				1188	case '\n': break;
				1189	case '\\': *p++ = '\\'; break;
				1190	case '\'': *p++ = '\''; break;
				1191	case '\"': *p++ = '\"'; break;
				1192	case 'b': *p++ = '\b'; break;
				1193	case 'f': p++ = '\014'; break; / FF */
				1194	case 't': *p++ = '\t'; break;
				1195	case 'n': *p++ = '\n'; break;
				1196	case 'r': *p++ = '\r'; break;
				1197	case 'v': p++ = '\013'; break; / VT */
				1198	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1199
				1200	/* \OOO (octal) escapes */
				1201	case '0': case '1': case '2': case '3':
				1202	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1203	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1204	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1205	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1206	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1207	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1208	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1209	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	break;
				1211
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1212	/* \xXX with two hex digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	case 'x':
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1214	for (x = 0, i = 0; i < 2; i++) {
				1215	c = (unsigned char)s[i];
				1216	if (!isxdigit(c)) {
				1217	if (unicodeescape_decoding_error(&s, &x, errors,
				1218	"truncated \\xXX"))
				1219	goto onError;
				1220	i++;
				1221	break;
				1222	}
				1223	x = (x<<4) & ~0xF;
				1224	if (c >= '0' && c <= '9')
				1225	x += c - '0';
				1226	else if (c >= 'a' && c <= 'f')
				1227	x += 10 + c - 'a';
				1228	else
				1229	x += 10 + c - 'A';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1230	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1231	s += i;
				1232	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1233	break;
				1234
				1235	/* \uXXXX with 4 hex digits */
				1236	case 'u':
				1237	for (x = 0, i = 0; i < 4; i++) {
				1238	c = (unsigned char)s[i];
				1239	if (!isxdigit(c)) {
				1240	if (unicodeescape_decoding_error(&s, &x, errors,
				1241	"truncated \\uXXXX"))
				1242	goto onError;
				1243	i++;
				1244	break;
				1245	}
				1246	x = (x<<4) & ~0xF;
				1247	if (c >= '0' && c <= '9')
				1248	x += c - '0';
				1249	else if (c >= 'a' && c <= 'f')
				1250	x += 10 + c - 'a';
				1251	else
				1252	x += 10 + c - 'A';
				1253	}
				1254	s += i;
				1255	*p++ = x;
				1256	break;
				1257
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1258	/* \UXXXXXXXX with 8 hex digits */
				1259	case 'U':
				1260	for (chr = 0, i = 0; i < 8; i++) {
				1261	c = (unsigned char)s[i];
				1262	if (!isxdigit(c)) {
				1263	if (unicodeescape_decoding_error(&s, &x, errors,
				1264	"truncated \\uXXXX"))
				1265	goto onError;
				1266	i++;
				1267	break;
				1268	}
				1269	chr = (chr<<4) & ~0xF;
				1270	if (c >= '0' && c <= '9')
				1271	chr += c - '0';
				1272	else if (c >= 'a' && c <= 'f')
				1273	chr += 10 + c - 'a';
				1274	else
				1275	chr += 10 + c - 'A';
				1276	}
				1277	s += i;
				1278	goto store;
				1279
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1280	case 'N':
				1281	/* Ok, we need to deal with Unicode Character Names now,
				1282	* make sure we've imported the hash table data...
				1283	*/
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1284	if (pucnHash == NULL) {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1285	PyObject mod = 0, v = 0;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1286	mod = PyImport_ImportModule("ucnhash");
				1287	if (mod == NULL)
				1288	goto onError;
				1289	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1290	Py_DECREF(mod);
				1291	if (v == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1292	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1293	pucnHash = PyCObject_AsVoidPtr(v);
				1294	Py_DECREF(v);
				1295	if (pucnHash == NULL)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1296	goto onError;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1297	}
				1298
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1299	if (*s == '{') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1300	const char *start = s + 1;
				1301	const char *endBrace = start;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1302	unsigned long j;
				1303
				1304	/* look for either the closing brace, or we
				1305	* exceed the maximum length of the unicode character names
				1306	*/
				1307	while (*endBrace != '}' &&
				1308	(unsigned int)(endBrace - start) <=
				1309	pucnHash->cchMax &&
				1310	endBrace < end)
				1311	{
				1312	endBrace++;
				1313	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1314	if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1315	j = pucnHash->hash(start, endBrace - start);
				1316	if (j > pucnHash->cKeys \|\|
				1317	mystrnicmp(
				1318	start,
				1319	((_Py_UnicodeCharacterName *)
				1320	(pucnHash->getValue(j)))->pszUCN,
				1321	(int)(endBrace - start)) != 0)
				1322	{
				1323	if (unicodeescape_decoding_error(
				1324	&s, &x, errors,
				1325	"Invalid Unicode Character Name"))
				1326	{
				1327	goto onError;
				1328	}
				1329	goto ucnFallthrough;
				1330	}
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1331	chr = ((_Py_UnicodeCharacterName *)
				1332	(pucnHash->getValue(j)))->value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1333	s = endBrace + 1;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1334	goto store;
				1335	} else {
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1336	if (unicodeescape_decoding_error(
				1337	&s, &x, errors,
				1338	"Unicode name missing closing brace"))
				1339	goto onError;
				1340	goto ucnFallthrough;
				1341	}
				1342	break;
				1343	}
				1344	if (unicodeescape_decoding_error(
				1345	&s, &x, errors,
				1346	"Missing opening brace for Unicode Character Name escape"))
				1347	goto onError;
				1348	ucnFallthrough:
				1349	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1350	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1351	*p++ = '\\';
				1352	*p++ = (unsigned char)s[-1];
				1353	break;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1354	store:
				1355	/* when we get here, chr is a 32-bit unicode character */
				1356	if (chr <= 0xffff)
				1357	/* UCS-2 character */
				1358	*p++ = (Py_UNICODE) chr;
				1359	else if (chr <= 0x10ffff) {
				1360	/* UCS-4 character. store as two surrogate characters */
				1361	chr -= 0x10000L;
				1362	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
				1363	*p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
				1364	} else {
				1365	if (unicodeescape_decoding_error(
				1366	&s, &x, errors,
				1367	"Illegal Unicode character")
				1368	)
				1369	goto onError;
				1370	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1371	}
				1372	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1373	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1374	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1375	return (PyObject *)v;
				1376
				1377	onError:
				1378	Py_XDECREF(v);
				1379	return NULL;
				1380	}
				1381
				1382	/* Return a Unicode-Escape string version of the Unicode object.
				1383
				1384	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1385	appropriate.
				1386
				1387	*/
				1388
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1389	static const Py_UNICODE findchar(const Py_UNICODE s,
				1390	int size,
				1391	Py_UNICODE ch);
				1392
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1393	static
				1394	PyObject unicodeescape_string(const Py_UNICODE s,
				1395	int size,
				1396	int quotes)
				1397	{
				1398	PyObject *repr;
				1399	char *p;
				1400	char *q;
				1401
				1402	static const char *hexdigit = "0123456789ABCDEF";
				1403
				1404	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1405	if (repr == NULL)
				1406	return NULL;
				1407
				1408	p = q = PyString_AS_STRING(repr);
				1409
				1410	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1411	*p++ = 'u';
				1412	*p++ = (findchar(s, size, '\'') &&
				1413	!findchar(s, size, '"')) ? '"' : '\'';
				1414	}
				1415	while (size-- > 0) {
				1416	Py_UNICODE ch = *s++;
				1417	/* Escape quotes */
				1418	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1419	*p++ = '\\';
				1420	*p++ = (char) ch;
				1421	}
				1422	/* Map 16-bit characters to '\uxxxx' */
				1423	else if (ch >= 256) {
				1424	*p++ = '\\';
				1425	*p++ = 'u';
				1426	*p++ = hexdigit[(ch >> 12) & 0xf];
				1427	*p++ = hexdigit[(ch >> 8) & 0xf];
				1428	*p++ = hexdigit[(ch >> 4) & 0xf];
				1429	*p++ = hexdigit[ch & 15];
				1430	}
				1431	/* Map non-printable US ASCII to '\ooo' */
				1432	else if (ch < ' ' \|\| ch >= 128) {
				1433	*p++ = '\\';
				1434	*p++ = hexdigit[(ch >> 6) & 7];
				1435	*p++ = hexdigit[(ch >> 3) & 7];
				1436	*p++ = hexdigit[ch & 7];
				1437	}
				1438	/* Copy everything else as-is */
				1439	else
				1440	*p++ = (char) ch;
				1441	}
				1442	if (quotes)
				1443	*p++ = q[1];
				1444
				1445	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1446	if (_PyString_Resize(&repr, p - q))
				1447	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1448
				1449	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1450
				1451	onError:
				1452	Py_DECREF(repr);
				1453	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1454	}
				1455
				1456	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1457	int size)
				1458	{
				1459	return unicodeescape_string(s, size, 0);
				1460	}
				1461
				1462	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1463	{
				1464	if (!PyUnicode_Check(unicode)) {
				1465	PyErr_BadArgument();
				1466	return NULL;
				1467	}
				1468	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1469	PyUnicode_GET_SIZE(unicode));
				1470	}
				1471
				1472	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1473
				1474	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1475	int size,
				1476	const char *errors)
				1477	{
				1478	PyUnicodeObject *v;
				1479	Py_UNICODE p, buf;
				1480	const char *end;
				1481	const char *bs;
				1482
				1483	/* Escaped strings will always be longer than the resulting
				1484	Unicode string, so we start with size here and then reduce the
				1485	length after conversion to the true value. */
				1486	v = _PyUnicode_New(size);
				1487	if (v == NULL)
				1488	goto onError;
				1489	if (size == 0)
				1490	return (PyObject *)v;
				1491	p = buf = PyUnicode_AS_UNICODE(v);
				1492	end = s + size;
				1493	while (s < end) {
				1494	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1495	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1496	int i;
				1497
				1498	/* Non-escape characters are interpreted as Unicode ordinals */
				1499	if (*s != '\\') {
				1500	p++ = (unsigned char)s++;
				1501	continue;
				1502	}
				1503
				1504	/* \u-escapes are only interpreted iff the number of leading
				1505	backslashes if odd */
				1506	bs = s;
				1507	for (;s < end;) {
				1508	if (*s != '\\')
				1509	break;
				1510	p++ = (unsigned char)s++;
				1511	}
				1512	if (((s - bs) & 1) == 0 \|\|
				1513	s >= end \|\|
				1514	*s != 'u') {
				1515	continue;
				1516	}
				1517	p--;
				1518	s++;
				1519
				1520	/* \uXXXX with 4 hex digits */
				1521	for (x = 0, i = 0; i < 4; i++) {
				1522	c = (unsigned char)s[i];
				1523	if (!isxdigit(c)) {
				1524	if (unicodeescape_decoding_error(&s, &x, errors,
				1525	"truncated \\uXXXX"))
				1526	goto onError;
				1527	i++;
				1528	break;
				1529	}
				1530	x = (x<<4) & ~0xF;
				1531	if (c >= '0' && c <= '9')
				1532	x += c - '0';
				1533	else if (c >= 'a' && c <= 'f')
				1534	x += 10 + c - 'a';
				1535	else
				1536	x += 10 + c - 'A';
				1537	}
				1538	s += i;
				1539	*p++ = x;
				1540	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1541	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1542	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1543	return (PyObject *)v;
				1544
				1545	onError:
				1546	Py_XDECREF(v);
				1547	return NULL;
				1548	}
				1549
				1550	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1551	int size)
				1552	{
				1553	PyObject *repr;
				1554	char *p;
				1555	char *q;
				1556
				1557	static const char *hexdigit = "0123456789ABCDEF";
				1558
				1559	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1560	if (repr == NULL)
				1561	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1562	if (size == 0)
				1563	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1564
				1565	p = q = PyString_AS_STRING(repr);
				1566	while (size-- > 0) {
				1567	Py_UNICODE ch = *s++;
				1568	/* Map 16-bit characters to '\uxxxx' */
				1569	if (ch >= 256) {
				1570	*p++ = '\\';
				1571	*p++ = 'u';
				1572	*p++ = hexdigit[(ch >> 12) & 0xf];
				1573	*p++ = hexdigit[(ch >> 8) & 0xf];
				1574	*p++ = hexdigit[(ch >> 4) & 0xf];
				1575	*p++ = hexdigit[ch & 15];
				1576	}
				1577	/* Copy everything else as-is */
				1578	else
				1579	*p++ = (char) ch;
				1580	}
				1581	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1582	if (_PyString_Resize(&repr, p - q))
				1583	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1584
				1585	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1586
				1587	onError:
				1588	Py_DECREF(repr);
				1589	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	}
				1591
				1592	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1593	{
				1594	if (!PyUnicode_Check(unicode)) {
				1595	PyErr_BadArgument();
				1596	return NULL;
				1597	}
				1598	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1599	PyUnicode_GET_SIZE(unicode));
				1600	}
				1601
				1602	/* --- Latin-1 Codec ------------------------------------------------------ */
				1603
				1604	PyObject PyUnicode_DecodeLatin1(const char s,
				1605	int size,
				1606	const char *errors)
				1607	{
				1608	PyUnicodeObject *v;
				1609	Py_UNICODE *p;
				1610
				1611	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1612	v = _PyUnicode_New(size);
				1613	if (v == NULL)
				1614	goto onError;
				1615	if (size == 0)
				1616	return (PyObject *)v;
				1617	p = PyUnicode_AS_UNICODE(v);
				1618	while (size-- > 0)
				1619	p++ = (unsigned char)s++;
				1620	return (PyObject *)v;
				1621
				1622	onError:
				1623	Py_XDECREF(v);
				1624	return NULL;
				1625	}
				1626
				1627	static
				1628	int latin1_encoding_error(const Py_UNICODE **source,
				1629	char **dest,
				1630	const char *errors,
				1631	const char *details)
				1632	{
				1633	if ((errors == NULL) \|\|
				1634	(strcmp(errors,"strict") == 0)) {
				1635	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1636	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1637	details);
				1638	return -1;
				1639	}
				1640	else if (strcmp(errors,"ignore") == 0) {
				1641	return 0;
				1642	}
				1643	else if (strcmp(errors,"replace") == 0) {
				1644	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1645	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1646	return 0;
				1647	}
				1648	else {
				1649	PyErr_Format(PyExc_ValueError,
				1650	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1651	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1652	errors);
				1653	return -1;
				1654	}
				1655	}
				1656
				1657	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1658	int size,
				1659	const char *errors)
				1660	{
				1661	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1662	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1663
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1664	repr = PyString_FromStringAndSize(NULL, size);
				1665	if (repr == NULL)
				1666	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1667	if (size == 0)
				1668	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1669
				1670	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1671	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1672	while (size-- > 0) {
				1673	Py_UNICODE ch = *p++;
				1674	if (ch >= 256) {
				1675	if (latin1_encoding_error(&p, &s, errors,
				1676	"ordinal not in range(256)"))
				1677	goto onError;
				1678	}
				1679	else
				1680	*s++ = (char)ch;
				1681	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1682	/* Resize if error handling skipped some characters */
				1683	if (s - start < PyString_GET_SIZE(repr))
				1684	if (_PyString_Resize(&repr, s - start))
				1685	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1686	return repr;
				1687
				1688	onError:
				1689	Py_DECREF(repr);
				1690	return NULL;
				1691	}
				1692
				1693	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1694	{
				1695	if (!PyUnicode_Check(unicode)) {
				1696	PyErr_BadArgument();
				1697	return NULL;
				1698	}
				1699	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1700	PyUnicode_GET_SIZE(unicode),
				1701	NULL);
				1702	}
				1703
				1704	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1705
				1706	static
				1707	int ascii_decoding_error(const char **source,
				1708	Py_UNICODE **dest,
				1709	const char *errors,
				1710	const char *details)
				1711	{
				1712	if ((errors == NULL) \|\|
				1713	(strcmp(errors,"strict") == 0)) {
				1714	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1715	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1716	details);
				1717	return -1;
				1718	}
				1719	else if (strcmp(errors,"ignore") == 0) {
				1720	return 0;
				1721	}
				1722	else if (strcmp(errors,"replace") == 0) {
				1723	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1724	(*dest)++;
				1725	return 0;
				1726	}
				1727	else {
				1728	PyErr_Format(PyExc_ValueError,
				1729	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1730	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	errors);
				1732	return -1;
				1733	}
				1734	}
				1735
				1736	PyObject PyUnicode_DecodeASCII(const char s,
				1737	int size,
				1738	const char *errors)
				1739	{
				1740	PyUnicodeObject *v;
				1741	Py_UNICODE *p;
				1742
				1743	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1744	v = _PyUnicode_New(size);
				1745	if (v == NULL)
				1746	goto onError;
				1747	if (size == 0)
				1748	return (PyObject *)v;
				1749	p = PyUnicode_AS_UNICODE(v);
				1750	while (size-- > 0) {
				1751	register unsigned char c;
				1752
				1753	c = (unsigned char)*s++;
				1754	if (c < 128)
				1755	*p++ = c;
				1756	else if (ascii_decoding_error(&s, &p, errors,
				1757	"ordinal not in range(128)"))
				1758	goto onError;
				1759	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1760	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1761	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1762	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	return (PyObject *)v;
				1764
				1765	onError:
				1766	Py_XDECREF(v);
				1767	return NULL;
				1768	}
				1769
				1770	static
				1771	int ascii_encoding_error(const Py_UNICODE **source,
				1772	char **dest,
				1773	const char *errors,
				1774	const char *details)
				1775	{
				1776	if ((errors == NULL) \|\|
				1777	(strcmp(errors,"strict") == 0)) {
				1778	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1779	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1780	details);
				1781	return -1;
				1782	}
				1783	else if (strcmp(errors,"ignore") == 0) {
				1784	return 0;
				1785	}
				1786	else if (strcmp(errors,"replace") == 0) {
				1787	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1788	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1789	return 0;
				1790	}
				1791	else {
				1792	PyErr_Format(PyExc_ValueError,
				1793	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1794	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	errors);
				1796	return -1;
				1797	}
				1798	}
				1799
				1800	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1801	int size,
				1802	const char *errors)
				1803	{
				1804	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1805	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1806
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1807	repr = PyString_FromStringAndSize(NULL, size);
				1808	if (repr == NULL)
				1809	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1810	if (size == 0)
				1811	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1812
				1813	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1814	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1815	while (size-- > 0) {
				1816	Py_UNICODE ch = *p++;
				1817	if (ch >= 128) {
				1818	if (ascii_encoding_error(&p, &s, errors,
				1819	"ordinal not in range(128)"))
				1820	goto onError;
				1821	}
				1822	else
				1823	*s++ = (char)ch;
				1824	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1825	/* Resize if error handling skipped some characters */
				1826	if (s - start < PyString_GET_SIZE(repr))
				1827	if (_PyString_Resize(&repr, s - start))
				1828	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1829	return repr;
				1830
				1831	onError:
				1832	Py_DECREF(repr);
				1833	return NULL;
				1834	}
				1835
				1836	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1837	{
				1838	if (!PyUnicode_Check(unicode)) {
				1839	PyErr_BadArgument();
				1840	return NULL;
				1841	}
				1842	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1843	PyUnicode_GET_SIZE(unicode),
				1844	NULL);
				1845	}
				1846
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1847	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1848
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1849	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1850
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1851	PyObject PyUnicode_DecodeMBCS(const char s,
				1852	int size,
				1853	const char *errors)
				1854	{
				1855	PyUnicodeObject *v;
				1856	Py_UNICODE *p;
				1857
				1858	/* First get the size of the result */
				1859	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1860	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1861	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1862
				1863	v = _PyUnicode_New(usize);
				1864	if (v == NULL)
				1865	return NULL;
				1866	if (usize == 0)
				1867	return (PyObject *)v;
				1868	p = PyUnicode_AS_UNICODE(v);
				1869	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1870	Py_DECREF(v);
				1871	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1872	}
				1873
				1874	return (PyObject *)v;
				1875	}
				1876
				1877	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1878	int size,
				1879	const char *errors)
				1880	{
				1881	PyObject *repr;
				1882	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1883	DWORD mbcssize;
				1884
				1885	/* If there are no characters, bail now! */
				1886	if (size==0)
				1887	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1888
				1889	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1890	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1891	if (mbcssize==0)
				1892	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1893
				1894	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1895	if (repr == NULL)
				1896	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1897	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1898	return repr;
				1899
				1900	/* Do the conversion */
				1901	s = PyString_AS_STRING(repr);
				1902	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1903	Py_DECREF(repr);
				1904	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1905	}
				1906	return repr;
				1907	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1908
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1909	#endif /* MS_WIN32 */
				1910
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1911	/* --- Character Mapping Codec -------------------------------------------- */
				1912
				1913	static
				1914	int charmap_decoding_error(const char **source,
				1915	Py_UNICODE **dest,
				1916	const char *errors,
				1917	const char *details)
				1918	{
				1919	if ((errors == NULL) \|\|
				1920	(strcmp(errors,"strict") == 0)) {
				1921	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1922	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1923	details);
				1924	return -1;
				1925	}
				1926	else if (strcmp(errors,"ignore") == 0) {
				1927	return 0;
				1928	}
				1929	else if (strcmp(errors,"replace") == 0) {
				1930	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1931	(*dest)++;
				1932	return 0;
				1933	}
				1934	else {
				1935	PyErr_Format(PyExc_ValueError,
				1936	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1937	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1938	errors);
				1939	return -1;
				1940	}
				1941	}
				1942
				1943	PyObject PyUnicode_DecodeCharmap(const char s,
				1944	int size,
				1945	PyObject *mapping,
				1946	const char *errors)
				1947	{
				1948	PyUnicodeObject *v;
				1949	Py_UNICODE *p;
				1950
				1951	/* Default to Latin-1 */
				1952	if (mapping == NULL)
				1953	return PyUnicode_DecodeLatin1(s, size, errors);
				1954
				1955	v = _PyUnicode_New(size);
				1956	if (v == NULL)
				1957	goto onError;
				1958	if (size == 0)
				1959	return (PyObject *)v;
				1960	p = PyUnicode_AS_UNICODE(v);
				1961	while (size-- > 0) {
				1962	unsigned char ch = *s++;
				1963	PyObject w, x;
				1964
				1965	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1966	w = PyInt_FromLong((long)ch);
				1967	if (w == NULL)
				1968	goto onError;
				1969	x = PyObject_GetItem(mapping, w);
				1970	Py_DECREF(w);
				1971	if (x == NULL) {
				1972	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame^]	1973	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1974	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame^]	1975	x = Py_None;
				1976	Py_INCREF(x);
				1977	} else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1978	goto onError;
				1979	}
				1980
				1981	/* Apply mapping */
				1982	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1983	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1984	if (value < 0 \|\| value > 65535) {
				1985	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1986	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1987	Py_DECREF(x);
				1988	goto onError;
				1989	}
				1990	*p++ = (Py_UNICODE)value;
				1991	}
				1992	else if (x == Py_None) {
				1993	/* undefined mapping */
				1994	if (charmap_decoding_error(&s, &p, errors,
				1995	"character maps to <undefined>")) {
				1996	Py_DECREF(x);
				1997	goto onError;
				1998	}
				1999	}
				2000	else if (PyUnicode_Check(x)) {
				2001	if (PyUnicode_GET_SIZE(x) != 1) {
				2002	/* 1-n mapping */
				2003	PyErr_SetString(PyExc_NotImplementedError,
				2004	"1-n mappings are currently not implemented");
				2005	Py_DECREF(x);
				2006	goto onError;
				2007	}
				2008	p++ = PyUnicode_AS_UNICODE(x);
				2009	}
				2010	else {
				2011	/* wrong return value */
				2012	PyErr_SetString(PyExc_TypeError,
				2013	"character mapping must return integer, None or unicode");
				2014	Py_DECREF(x);
				2015	goto onError;
				2016	}
				2017	Py_DECREF(x);
				2018	}
				2019	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2020	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2021	goto onError;
				2022	return (PyObject *)v;
				2023
				2024	onError:
				2025	Py_XDECREF(v);
				2026	return NULL;
				2027	}
				2028
				2029	static
				2030	int charmap_encoding_error(const Py_UNICODE **source,
				2031	char **dest,
				2032	const char *errors,
				2033	const char *details)
				2034	{
				2035	if ((errors == NULL) \|\|
				2036	(strcmp(errors,"strict") == 0)) {
				2037	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2038	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2039	details);
				2040	return -1;
				2041	}
				2042	else if (strcmp(errors,"ignore") == 0) {
				2043	return 0;
				2044	}
				2045	else if (strcmp(errors,"replace") == 0) {
				2046	**dest = '?';
				2047	(*dest)++;
				2048	return 0;
				2049	}
				2050	else {
				2051	PyErr_Format(PyExc_ValueError,
				2052	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2053	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2054	errors);
				2055	return -1;
				2056	}
				2057	}
				2058
				2059	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2060	int size,
				2061	PyObject *mapping,
				2062	const char *errors)
				2063	{
				2064	PyObject *v;
				2065	char *s;
				2066
				2067	/* Default to Latin-1 */
				2068	if (mapping == NULL)
				2069	return PyUnicode_EncodeLatin1(p, size, errors);
				2070
				2071	v = PyString_FromStringAndSize(NULL, size);
				2072	if (v == NULL)
				2073	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2074	if (size == 0)
				2075	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2076	s = PyString_AS_STRING(v);
				2077	while (size-- > 0) {
				2078	Py_UNICODE ch = *p++;
				2079	PyObject w, x;
				2080
				2081	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2082	w = PyInt_FromLong((long)ch);
				2083	if (w == NULL)
				2084	goto onError;
				2085	x = PyObject_GetItem(mapping, w);
				2086	Py_DECREF(w);
				2087	if (x == NULL) {
				2088	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame^]	2089	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2090	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame^]	2091	x = Py_None;
				2092	Py_INCREF(x);
				2093	} else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2094	goto onError;
				2095	}
				2096
				2097	/* Apply mapping */
				2098	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2099	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2100	if (value < 0 \|\| value > 255) {
				2101	PyErr_SetString(PyExc_TypeError,
				2102	"character mapping must be in range(256)");
				2103	Py_DECREF(x);
				2104	goto onError;
				2105	}
				2106	*s++ = (char)value;
				2107	}
				2108	else if (x == Py_None) {
				2109	/* undefined mapping */
				2110	if (charmap_encoding_error(&p, &s, errors,
				2111	"character maps to <undefined>")) {
				2112	Py_DECREF(x);
				2113	goto onError;
				2114	}
				2115	}
				2116	else if (PyString_Check(x)) {
				2117	if (PyString_GET_SIZE(x) != 1) {
				2118	/* 1-n mapping */
				2119	PyErr_SetString(PyExc_NotImplementedError,
				2120	"1-n mappings are currently not implemented");
				2121	Py_DECREF(x);
				2122	goto onError;
				2123	}
				2124	s++ = PyString_AS_STRING(x);
				2125	}
				2126	else {
				2127	/* wrong return value */
				2128	PyErr_SetString(PyExc_TypeError,
				2129	"character mapping must return integer, None or unicode");
				2130	Py_DECREF(x);
				2131	goto onError;
				2132	}
				2133	Py_DECREF(x);
				2134	}
				2135	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2136	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2137	goto onError;
				2138	return v;
				2139
				2140	onError:
				2141	Py_DECREF(v);
				2142	return NULL;
				2143	}
				2144
				2145	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2146	PyObject *mapping)
				2147	{
				2148	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2149	PyErr_BadArgument();
				2150	return NULL;
				2151	}
				2152	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2153	PyUnicode_GET_SIZE(unicode),
				2154	mapping,
				2155	NULL);
				2156	}
				2157
				2158	static
				2159	int translate_error(const Py_UNICODE **source,
				2160	Py_UNICODE **dest,
				2161	const char *errors,
				2162	const char *details)
				2163	{
				2164	if ((errors == NULL) \|\|
				2165	(strcmp(errors,"strict") == 0)) {
				2166	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2167	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2168	details);
				2169	return -1;
				2170	}
				2171	else if (strcmp(errors,"ignore") == 0) {
				2172	return 0;
				2173	}
				2174	else if (strcmp(errors,"replace") == 0) {
				2175	**dest = '?';
				2176	(*dest)++;
				2177	return 0;
				2178	}
				2179	else {
				2180	PyErr_Format(PyExc_ValueError,
				2181	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2182	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2183	errors);
				2184	return -1;
				2185	}
				2186	}
				2187
				2188	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2189	int size,
				2190	PyObject *mapping,
				2191	const char *errors)
				2192	{
				2193	PyUnicodeObject *v;
				2194	Py_UNICODE *p;
				2195
				2196	if (mapping == NULL) {
				2197	PyErr_BadArgument();
				2198	return NULL;
				2199	}
				2200
				2201	/* Output will never be longer than input */
				2202	v = _PyUnicode_New(size);
				2203	if (v == NULL)
				2204	goto onError;
				2205	if (size == 0)
				2206	goto done;
				2207	p = PyUnicode_AS_UNICODE(v);
				2208	while (size-- > 0) {
				2209	Py_UNICODE ch = *s++;
				2210	PyObject w, x;
				2211
				2212	/* Get mapping */
				2213	w = PyInt_FromLong(ch);
				2214	if (w == NULL)
				2215	goto onError;
				2216	x = PyObject_GetItem(mapping, w);
				2217	Py_DECREF(w);
				2218	if (x == NULL) {
				2219	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2220	/* No mapping found: default to 1-1 mapping */
				2221	PyErr_Clear();
				2222	*p++ = ch;
				2223	continue;
				2224	}
				2225	goto onError;
				2226	}
				2227
				2228	/* Apply mapping */
				2229	if (PyInt_Check(x))
				2230	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2231	else if (x == Py_None) {
				2232	/* undefined mapping */
				2233	if (translate_error(&s, &p, errors,
				2234	"character maps to <undefined>")) {
				2235	Py_DECREF(x);
				2236	goto onError;
				2237	}
				2238	}
				2239	else if (PyUnicode_Check(x)) {
				2240	if (PyUnicode_GET_SIZE(x) != 1) {
				2241	/* 1-n mapping */
				2242	PyErr_SetString(PyExc_NotImplementedError,
				2243	"1-n mappings are currently not implemented");
				2244	Py_DECREF(x);
				2245	goto onError;
				2246	}
				2247	p++ = PyUnicode_AS_UNICODE(x);
				2248	}
				2249	else {
				2250	/* wrong return value */
				2251	PyErr_SetString(PyExc_TypeError,
				2252	"translate mapping must return integer, None or unicode");
				2253	Py_DECREF(x);
				2254	goto onError;
				2255	}
				2256	Py_DECREF(x);
				2257	}
				2258	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2259	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2260	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2261
				2262	done:
				2263	return (PyObject *)v;
				2264
				2265	onError:
				2266	Py_XDECREF(v);
				2267	return NULL;
				2268	}
				2269
				2270	PyObject PyUnicode_Translate(PyObject str,
				2271	PyObject *mapping,
				2272	const char *errors)
				2273	{
				2274	PyObject *result;
				2275
				2276	str = PyUnicode_FromObject(str);
				2277	if (str == NULL)
				2278	goto onError;
				2279	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2280	PyUnicode_GET_SIZE(str),
				2281	mapping,
				2282	errors);
				2283	Py_DECREF(str);
				2284	return result;
				2285
				2286	onError:
				2287	Py_XDECREF(str);
				2288	return NULL;
				2289	}
				2290
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2291	/* --- Decimal Encoder ---------------------------------------------------- */
				2292
				2293	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2294	int length,
				2295	char *output,
				2296	const char *errors)
				2297	{
				2298	Py_UNICODE p, end;
				2299
				2300	if (output == NULL) {
				2301	PyErr_BadArgument();
				2302	return -1;
				2303	}
				2304
				2305	p = s;
				2306	end = s + length;
				2307	while (p < end) {
				2308	register Py_UNICODE ch = *p++;
				2309	int decimal;
				2310
				2311	if (Py_UNICODE_ISSPACE(ch)) {
				2312	*output++ = ' ';
				2313	continue;
				2314	}
				2315	decimal = Py_UNICODE_TODECIMAL(ch);
				2316	if (decimal >= 0) {
				2317	*output++ = '0' + decimal;
				2318	continue;
				2319	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2320	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2321	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2322	continue;
				2323	}
				2324	/* All other characters are considered invalid */
				2325	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2326	PyErr_SetString(PyExc_ValueError,
				2327	"invalid decimal Unicode string");
				2328	goto onError;
				2329	}
				2330	else if (strcmp(errors, "ignore") == 0)
				2331	continue;
				2332	else if (strcmp(errors, "replace") == 0) {
				2333	*output++ = '?';
				2334	continue;
				2335	}
				2336	}
				2337	/* 0-terminate the output string */
				2338	*output++ = '\0';
				2339	return 0;
				2340
				2341	onError:
				2342	return -1;
				2343	}
				2344
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2345	/* --- Helpers ------------------------------------------------------------ */
				2346
				2347	static
				2348	int count(PyUnicodeObject *self,
				2349	int start,
				2350	int end,
				2351	PyUnicodeObject *substring)
				2352	{
				2353	int count = 0;
				2354
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2355	if (substring->length == 0)
				2356	return (end - start + 1);
				2357
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2358	end -= substring->length;
				2359
				2360	while (start <= end)
				2361	if (Py_UNICODE_MATCH(self, start, substring)) {
				2362	count++;
				2363	start += substring->length;
				2364	} else
				2365	start++;
				2366
				2367	return count;
				2368	}
				2369
				2370	int PyUnicode_Count(PyObject *str,
				2371	PyObject *substr,
				2372	int start,
				2373	int end)
				2374	{
				2375	int result;
				2376
				2377	str = PyUnicode_FromObject(str);
				2378	if (str == NULL)
				2379	return -1;
				2380	substr = PyUnicode_FromObject(substr);
				2381	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2382	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2383	return -1;
				2384	}
				2385
				2386	result = count((PyUnicodeObject *)str,
				2387	start, end,
				2388	(PyUnicodeObject *)substr);
				2389
				2390	Py_DECREF(str);
				2391	Py_DECREF(substr);
				2392	return result;
				2393	}
				2394
				2395	static
				2396	int findstring(PyUnicodeObject *self,
				2397	PyUnicodeObject *substring,
				2398	int start,
				2399	int end,
				2400	int direction)
				2401	{
				2402	if (start < 0)
				2403	start += self->length;
				2404	if (start < 0)
				2405	start = 0;
				2406
				2407	if (substring->length == 0)
				2408	return start;
				2409
				2410	if (end > self->length)
				2411	end = self->length;
				2412	if (end < 0)
				2413	end += self->length;
				2414	if (end < 0)
				2415	end = 0;
				2416
				2417	end -= substring->length;
				2418
				2419	if (direction < 0) {
				2420	for (; end >= start; end--)
				2421	if (Py_UNICODE_MATCH(self, end, substring))
				2422	return end;
				2423	} else {
				2424	for (; start <= end; start++)
				2425	if (Py_UNICODE_MATCH(self, start, substring))
				2426	return start;
				2427	}
				2428
				2429	return -1;
				2430	}
				2431
				2432	int PyUnicode_Find(PyObject *str,
				2433	PyObject *substr,
				2434	int start,
				2435	int end,
				2436	int direction)
				2437	{
				2438	int result;
				2439
				2440	str = PyUnicode_FromObject(str);
				2441	if (str == NULL)
				2442	return -1;
				2443	substr = PyUnicode_FromObject(substr);
				2444	if (substr == NULL) {
				2445	Py_DECREF(substr);
				2446	return -1;
				2447	}
				2448
				2449	result = findstring((PyUnicodeObject *)str,
				2450	(PyUnicodeObject *)substr,
				2451	start, end, direction);
				2452	Py_DECREF(str);
				2453	Py_DECREF(substr);
				2454	return result;
				2455	}
				2456
				2457	static
				2458	int tailmatch(PyUnicodeObject *self,
				2459	PyUnicodeObject *substring,
				2460	int start,
				2461	int end,
				2462	int direction)
				2463	{
				2464	if (start < 0)
				2465	start += self->length;
				2466	if (start < 0)
				2467	start = 0;
				2468
				2469	if (substring->length == 0)
				2470	return 1;
				2471
				2472	if (end > self->length)
				2473	end = self->length;
				2474	if (end < 0)
				2475	end += self->length;
				2476	if (end < 0)
				2477	end = 0;
				2478
				2479	end -= substring->length;
				2480	if (end < start)
				2481	return 0;
				2482
				2483	if (direction > 0) {
				2484	if (Py_UNICODE_MATCH(self, end, substring))
				2485	return 1;
				2486	} else {
				2487	if (Py_UNICODE_MATCH(self, start, substring))
				2488	return 1;
				2489	}
				2490
				2491	return 0;
				2492	}
				2493
				2494	int PyUnicode_Tailmatch(PyObject *str,
				2495	PyObject *substr,
				2496	int start,
				2497	int end,
				2498	int direction)
				2499	{
				2500	int result;
				2501
				2502	str = PyUnicode_FromObject(str);
				2503	if (str == NULL)
				2504	return -1;
				2505	substr = PyUnicode_FromObject(substr);
				2506	if (substr == NULL) {
				2507	Py_DECREF(substr);
				2508	return -1;
				2509	}
				2510
				2511	result = tailmatch((PyUnicodeObject *)str,
				2512	(PyUnicodeObject *)substr,
				2513	start, end, direction);
				2514	Py_DECREF(str);
				2515	Py_DECREF(substr);
				2516	return result;
				2517	}
				2518
				2519	static
				2520	const Py_UNICODE findchar(const Py_UNICODE s,
				2521	int size,
				2522	Py_UNICODE ch)
				2523	{
				2524	/* like wcschr, but doesn't stop at NULL characters */
				2525
				2526	while (size-- > 0) {
				2527	if (*s == ch)
				2528	return s;
				2529	s++;
				2530	}
				2531
				2532	return NULL;
				2533	}
				2534
				2535	/* Apply fixfct filter to the Unicode object self and return a
				2536	reference to the modified object */
				2537
				2538	static
				2539	PyObject fixup(PyUnicodeObject self,
				2540	int (fixfct)(PyUnicodeObject s))
				2541	{
				2542
				2543	PyUnicodeObject *u;
				2544
				2545	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2546	self->length);
				2547	if (u == NULL)
				2548	return NULL;
				2549	if (!fixfct(u)) {
				2550	/* fixfct should return TRUE if it modified the buffer. If
				2551	FALSE, return a reference to the original buffer instead
				2552	(to save space, not time) */
				2553	Py_INCREF(self);
				2554	Py_DECREF(u);
				2555	return (PyObject*) self;
				2556	}
				2557	return (PyObject*) u;
				2558	}
				2559
				2560	static
				2561	int fixupper(PyUnicodeObject *self)
				2562	{
				2563	int len = self->length;
				2564	Py_UNICODE *s = self->str;
				2565	int status = 0;
				2566
				2567	while (len-- > 0) {
				2568	register Py_UNICODE ch;
				2569
				2570	ch = Py_UNICODE_TOUPPER(*s);
				2571	if (ch != *s) {
				2572	status = 1;
				2573	*s = ch;
				2574	}
				2575	s++;
				2576	}
				2577
				2578	return status;
				2579	}
				2580
				2581	static
				2582	int fixlower(PyUnicodeObject *self)
				2583	{
				2584	int len = self->length;
				2585	Py_UNICODE *s = self->str;
				2586	int status = 0;
				2587
				2588	while (len-- > 0) {
				2589	register Py_UNICODE ch;
				2590
				2591	ch = Py_UNICODE_TOLOWER(*s);
				2592	if (ch != *s) {
				2593	status = 1;
				2594	*s = ch;
				2595	}
				2596	s++;
				2597	}
				2598
				2599	return status;
				2600	}
				2601
				2602	static
				2603	int fixswapcase(PyUnicodeObject *self)
				2604	{
				2605	int len = self->length;
				2606	Py_UNICODE *s = self->str;
				2607	int status = 0;
				2608
				2609	while (len-- > 0) {
				2610	if (Py_UNICODE_ISUPPER(*s)) {
				2611	s = Py_UNICODE_TOLOWER(s);
				2612	status = 1;
				2613	} else if (Py_UNICODE_ISLOWER(*s)) {
				2614	s = Py_UNICODE_TOUPPER(s);
				2615	status = 1;
				2616	}
				2617	s++;
				2618	}
				2619
				2620	return status;
				2621	}
				2622
				2623	static
				2624	int fixcapitalize(PyUnicodeObject *self)
				2625	{
				2626	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2627	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2628	return 1;
				2629	}
				2630	return 0;
				2631	}
				2632
				2633	static
				2634	int fixtitle(PyUnicodeObject *self)
				2635	{
				2636	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2637	register Py_UNICODE *e;
				2638	int previous_is_cased;
				2639
				2640	/* Shortcut for single character strings */
				2641	if (PyUnicode_GET_SIZE(self) == 1) {
				2642	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2643	if (*p != ch) {
				2644	*p = ch;
				2645	return 1;
				2646	}
				2647	else
				2648	return 0;
				2649	}
				2650
				2651	e = p + PyUnicode_GET_SIZE(self);
				2652	previous_is_cased = 0;
				2653	for (; p < e; p++) {
				2654	register const Py_UNICODE ch = *p;
				2655
				2656	if (previous_is_cased)
				2657	*p = Py_UNICODE_TOLOWER(ch);
				2658	else
				2659	*p = Py_UNICODE_TOTITLE(ch);
				2660
				2661	if (Py_UNICODE_ISLOWER(ch) \|\|
				2662	Py_UNICODE_ISUPPER(ch) \|\|
				2663	Py_UNICODE_ISTITLE(ch))
				2664	previous_is_cased = 1;
				2665	else
				2666	previous_is_cased = 0;
				2667	}
				2668	return 1;
				2669	}
				2670
				2671	PyObject PyUnicode_Join(PyObject separator,
				2672	PyObject *seq)
				2673	{
				2674	Py_UNICODE *sep;
				2675	int seplen;
				2676	PyUnicodeObject *res = NULL;
				2677	int reslen = 0;
				2678	Py_UNICODE *p;
				2679	int seqlen = 0;
				2680	int sz = 100;
				2681	int i;
				2682
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2683	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2684	if (seqlen < 0 && PyErr_Occurred())
				2685	return NULL;
				2686
				2687	if (separator == NULL) {
				2688	Py_UNICODE blank = ' ';
				2689	sep = &blank;
				2690	seplen = 1;
				2691	}
				2692	else {
				2693	separator = PyUnicode_FromObject(separator);
				2694	if (separator == NULL)
				2695	return NULL;
				2696	sep = PyUnicode_AS_UNICODE(separator);
				2697	seplen = PyUnicode_GET_SIZE(separator);
				2698	}
				2699
				2700	res = _PyUnicode_New(sz);
				2701	if (res == NULL)
				2702	goto onError;
				2703	p = PyUnicode_AS_UNICODE(res);
				2704	reslen = 0;
				2705
				2706	for (i = 0; i < seqlen; i++) {
				2707	int itemlen;
				2708	PyObject *item;
				2709
				2710	item = PySequence_GetItem(seq, i);
				2711	if (item == NULL)
				2712	goto onError;
				2713	if (!PyUnicode_Check(item)) {
				2714	PyObject *v;
				2715	v = PyUnicode_FromObject(item);
				2716	Py_DECREF(item);
				2717	item = v;
				2718	if (item == NULL)
				2719	goto onError;
				2720	}
				2721	itemlen = PyUnicode_GET_SIZE(item);
				2722	while (reslen + itemlen + seplen >= sz) {
				2723	if (_PyUnicode_Resize(res, sz*2))
				2724	goto onError;
				2725	sz *= 2;
				2726	p = PyUnicode_AS_UNICODE(res) + reslen;
				2727	}
				2728	if (i > 0) {
				2729	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2730	p += seplen;
				2731	reslen += seplen;
				2732	}
				2733	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2734	p += itemlen;
				2735	reslen += itemlen;
				2736	Py_DECREF(item);
				2737	}
				2738	if (_PyUnicode_Resize(res, reslen))
				2739	goto onError;
				2740
				2741	Py_XDECREF(separator);
				2742	return (PyObject *)res;
				2743
				2744	onError:
				2745	Py_XDECREF(separator);
				2746	Py_DECREF(res);
				2747	return NULL;
				2748	}
				2749
				2750	static
				2751	PyUnicodeObject pad(PyUnicodeObject self,
				2752	int left,
				2753	int right,
				2754	Py_UNICODE fill)
				2755	{
				2756	PyUnicodeObject *u;
				2757
				2758	if (left < 0)
				2759	left = 0;
				2760	if (right < 0)
				2761	right = 0;
				2762
				2763	if (left == 0 && right == 0) {
				2764	Py_INCREF(self);
				2765	return self;
				2766	}
				2767
				2768	u = _PyUnicode_New(left + self->length + right);
				2769	if (u) {
				2770	if (left)
				2771	Py_UNICODE_FILL(u->str, fill, left);
				2772	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2773	if (right)
				2774	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2775	}
				2776
				2777	return u;
				2778	}
				2779
				2780	#define SPLIT_APPEND(data, left, right) \
				2781	str = PyUnicode_FromUnicode(data + left, right - left); \
				2782	if (!str) \
				2783	goto onError; \
				2784	if (PyList_Append(list, str)) { \
				2785	Py_DECREF(str); \
				2786	goto onError; \
				2787	} \
				2788	else \
				2789	Py_DECREF(str);
				2790
				2791	static
				2792	PyObject split_whitespace(PyUnicodeObject self,
				2793	PyObject *list,
				2794	int maxcount)
				2795	{
				2796	register int i;
				2797	register int j;
				2798	int len = self->length;
				2799	PyObject *str;
				2800
				2801	for (i = j = 0; i < len; ) {
				2802	/* find a token */
				2803	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2804	i++;
				2805	j = i;
				2806	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2807	i++;
				2808	if (j < i) {
				2809	if (maxcount-- <= 0)
				2810	break;
				2811	SPLIT_APPEND(self->str, j, i);
				2812	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2813	i++;
				2814	j = i;
				2815	}
				2816	}
				2817	if (j < len) {
				2818	SPLIT_APPEND(self->str, j, len);
				2819	}
				2820	return list;
				2821
				2822	onError:
				2823	Py_DECREF(list);
				2824	return NULL;
				2825	}
				2826
				2827	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2828	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2829	{
				2830	register int i;
				2831	register int j;
				2832	int len;
				2833	PyObject *list;
				2834	PyObject *str;
				2835	Py_UNICODE *data;
				2836
				2837	string = PyUnicode_FromObject(string);
				2838	if (string == NULL)
				2839	return NULL;
				2840	data = PyUnicode_AS_UNICODE(string);
				2841	len = PyUnicode_GET_SIZE(string);
				2842
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2843	list = PyList_New(0);
				2844	if (!list)
				2845	goto onError;
				2846
				2847	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2848	int eol;
				2849
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2850	/* Find a line and append it */
				2851	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2852	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2853
				2854	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2855	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2856	if (i < len) {
				2857	if (data[i] == '\r' && i + 1 < len &&
				2858	data[i+1] == '\n')
				2859	i += 2;
				2860	else
				2861	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2862	if (keepends)
				2863	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2864	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2865	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2866	j = i;
				2867	}
				2868	if (j < len) {
				2869	SPLIT_APPEND(data, j, len);
				2870	}
				2871
				2872	Py_DECREF(string);
				2873	return list;
				2874
				2875	onError:
				2876	Py_DECREF(list);
				2877	Py_DECREF(string);
				2878	return NULL;
				2879	}
				2880
				2881	static
				2882	PyObject split_char(PyUnicodeObject self,
				2883	PyObject *list,
				2884	Py_UNICODE ch,
				2885	int maxcount)
				2886	{
				2887	register int i;
				2888	register int j;
				2889	int len = self->length;
				2890	PyObject *str;
				2891
				2892	for (i = j = 0; i < len; ) {
				2893	if (self->str[i] == ch) {
				2894	if (maxcount-- <= 0)
				2895	break;
				2896	SPLIT_APPEND(self->str, j, i);
				2897	i = j = i + 1;
				2898	} else
				2899	i++;
				2900	}
				2901	if (j <= len) {
				2902	SPLIT_APPEND(self->str, j, len);
				2903	}
				2904	return list;
				2905
				2906	onError:
				2907	Py_DECREF(list);
				2908	return NULL;
				2909	}
				2910
				2911	static
				2912	PyObject split_substring(PyUnicodeObject self,
				2913	PyObject *list,
				2914	PyUnicodeObject *substring,
				2915	int maxcount)
				2916	{
				2917	register int i;
				2918	register int j;
				2919	int len = self->length;
				2920	int sublen = substring->length;
				2921	PyObject *str;
				2922
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	2923	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2924	if (Py_UNICODE_MATCH(self, i, substring)) {
				2925	if (maxcount-- <= 0)
				2926	break;
				2927	SPLIT_APPEND(self->str, j, i);
				2928	i = j = i + sublen;
				2929	} else
				2930	i++;
				2931	}
				2932	if (j <= len) {
				2933	SPLIT_APPEND(self->str, j, len);
				2934	}
				2935	return list;
				2936
				2937	onError:
				2938	Py_DECREF(list);
				2939	return NULL;
				2940	}
				2941
				2942	#undef SPLIT_APPEND
				2943
				2944	static
				2945	PyObject split(PyUnicodeObject self,
				2946	PyUnicodeObject *substring,
				2947	int maxcount)
				2948	{
				2949	PyObject *list;
				2950
				2951	if (maxcount < 0)
				2952	maxcount = INT_MAX;
				2953
				2954	list = PyList_New(0);
				2955	if (!list)
				2956	return NULL;
				2957
				2958	if (substring == NULL)
				2959	return split_whitespace(self,list,maxcount);
				2960
				2961	else if (substring->length == 1)
				2962	return split_char(self,list,substring->str[0],maxcount);
				2963
				2964	else if (substring->length == 0) {
				2965	Py_DECREF(list);
				2966	PyErr_SetString(PyExc_ValueError, "empty separator");
				2967	return NULL;
				2968	}
				2969	else
				2970	return split_substring(self,list,substring,maxcount);
				2971	}
				2972
				2973	static
				2974	PyObject strip(PyUnicodeObject self,
				2975	int left,
				2976	int right)
				2977	{
				2978	Py_UNICODE *p = self->str;
				2979	int start = 0;
				2980	int end = self->length;
				2981
				2982	if (left)
				2983	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2984	start++;
				2985
				2986	if (right)
				2987	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2988	end--;
				2989
				2990	if (start == 0 && end == self->length) {
				2991	/* couldn't strip anything off, return original string */
				2992	Py_INCREF(self);
				2993	return (PyObject*) self;
				2994	}
				2995
				2996	return (PyObject*) PyUnicode_FromUnicode(
				2997	self->str + start,
				2998	end - start
				2999	);
				3000	}
				3001
				3002	static
				3003	PyObject replace(PyUnicodeObject self,
				3004	PyUnicodeObject *str1,
				3005	PyUnicodeObject *str2,
				3006	int maxcount)
				3007	{
				3008	PyUnicodeObject *u;
				3009
				3010	if (maxcount < 0)
				3011	maxcount = INT_MAX;
				3012
				3013	if (str1->length == 1 && str2->length == 1) {
				3014	int i;
				3015
				3016	/* replace characters */
				3017	if (!findchar(self->str, self->length, str1->str[0])) {
				3018	/* nothing to replace, return original string */
				3019	Py_INCREF(self);
				3020	u = self;
				3021	} else {
				3022	Py_UNICODE u1 = str1->str[0];
				3023	Py_UNICODE u2 = str2->str[0];
				3024
				3025	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3026	self->str,
				3027	self->length
				3028	);
				3029	if (u)
				3030	for (i = 0; i < u->length; i++)
				3031	if (u->str[i] == u1) {
				3032	if (--maxcount < 0)
				3033	break;
				3034	u->str[i] = u2;
				3035	}
				3036	}
				3037
				3038	} else {
				3039	int n, i;
				3040	Py_UNICODE *p;
				3041
				3042	/* replace strings */
				3043	n = count(self, 0, self->length, str1);
				3044	if (n > maxcount)
				3045	n = maxcount;
				3046	if (n == 0) {
				3047	/* nothing to replace, return original string */
				3048	Py_INCREF(self);
				3049	u = self;
				3050	} else {
				3051	u = _PyUnicode_New(
				3052	self->length + n * (str2->length - str1->length));
				3053	if (u) {
				3054	i = 0;
				3055	p = u->str;
				3056	while (i <= self->length - str1->length)
				3057	if (Py_UNICODE_MATCH(self, i, str1)) {
				3058	/* replace string segment */
				3059	Py_UNICODE_COPY(p, str2->str, str2->length);
				3060	p += str2->length;
				3061	i += str1->length;
				3062	if (--n <= 0) {
				3063	/* copy remaining part */
				3064	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3065	break;
				3066	}
				3067	} else
				3068	*p++ = self->str[i++];
				3069	}
				3070	}
				3071	}
				3072
				3073	return (PyObject *) u;
				3074	}
				3075
				3076	/* --- Unicode Object Methods --------------------------------------------- */
				3077
				3078	static char title__doc__[] =
				3079	"S.title() -> unicode\n\
				3080	\n\
				3081	Return a titlecased version of S, i.e. words start with title case\n\
				3082	characters, all remaining cased characters have lower case.";
				3083
				3084	static PyObject*
				3085	unicode_title(PyUnicodeObject self, PyObject args)
				3086	{
				3087	if (!PyArg_NoArgs(args))
				3088	return NULL;
				3089	return fixup(self, fixtitle);
				3090	}
				3091
				3092	static char capitalize__doc__[] =
				3093	"S.capitalize() -> unicode\n\
				3094	\n\
				3095	Return a capitalized version of S, i.e. make the first character\n\
				3096	have upper case.";
				3097
				3098	static PyObject*
				3099	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3100	{
				3101	if (!PyArg_NoArgs(args))
				3102	return NULL;
				3103	return fixup(self, fixcapitalize);
				3104	}
				3105
				3106	#if 0
				3107	static char capwords__doc__[] =
				3108	"S.capwords() -> unicode\n\
				3109	\n\
				3110	Apply .capitalize() to all words in S and return the result with\n\
				3111	normalized whitespace (all whitespace strings are replaced by ' ').";
				3112
				3113	static PyObject*
				3114	unicode_capwords(PyUnicodeObject self, PyObject args)
				3115	{
				3116	PyObject *list;
				3117	PyObject *item;
				3118	int i;
				3119
				3120	if (!PyArg_NoArgs(args))
				3121	return NULL;
				3122
				3123	/* Split into words */
				3124	list = split(self, NULL, -1);
				3125	if (!list)
				3126	return NULL;
				3127
				3128	/* Capitalize each word */
				3129	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3130	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3131	fixcapitalize);
				3132	if (item == NULL)
				3133	goto onError;
				3134	Py_DECREF(PyList_GET_ITEM(list, i));
				3135	PyList_SET_ITEM(list, i, item);
				3136	}
				3137
				3138	/* Join the words to form a new string */
				3139	item = PyUnicode_Join(NULL, list);
				3140
				3141	onError:
				3142	Py_DECREF(list);
				3143	return (PyObject *)item;
				3144	}
				3145	#endif
				3146
				3147	static char center__doc__[] =
				3148	"S.center(width) -> unicode\n\
				3149	\n\
				3150	Return S centered in a Unicode string of length width. Padding is done\n\
				3151	using spaces.";
				3152
				3153	static PyObject *
				3154	unicode_center(PyUnicodeObject self, PyObject args)
				3155	{
				3156	int marg, left;
				3157	int width;
				3158
				3159	if (!PyArg_ParseTuple(args, "i:center", &width))
				3160	return NULL;
				3161
				3162	if (self->length >= width) {
				3163	Py_INCREF(self);
				3164	return (PyObject*) self;
				3165	}
				3166
				3167	marg = width - self->length;
				3168	left = marg / 2 + (marg & width & 1);
				3169
				3170	return (PyObject*) pad(self, left, marg - left, ' ');
				3171	}
				3172
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3173	#if 0
				3174
				3175	/* This code should go into some future Unicode collation support
				3176	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3177	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3178
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3179	/* speedy UTF-16 code point order comparison */
				3180	/* gleaned from: */
				3181	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3182
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3183	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3184	{
				3185	0, 0, 0, 0, 0, 0, 0, 0,
				3186	0, 0, 0, 0, 0, 0, 0, 0,
				3187	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3188	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3189	};
				3190
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3191	static int
				3192	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3193	{
				3194	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3195
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3196	Py_UNICODE *s1 = str1->str;
				3197	Py_UNICODE *s2 = str2->str;
				3198
				3199	len1 = str1->length;
				3200	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3201
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3202	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3203	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3204	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3205
				3206	c1 = *s1++;
				3207	c2 = *s2++;
				3208	if (c1 > (1<<11) * 26)
				3209	c1 += utf16Fixup[c1>>11];
				3210	if (c2 > (1<<11) * 26)
				3211	c2 += utf16Fixup[c2>>11];
				3212
				3213	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3214	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3215	if (diff)
				3216	return (diff < 0) ? -1 : (diff != 0);
				3217	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3218	}
				3219
				3220	return (len1 < len2) ? -1 : (len1 != len2);
				3221	}
				3222
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3223	#else
				3224
				3225	static int
				3226	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3227	{
				3228	register int len1, len2;
				3229
				3230	Py_UNICODE *s1 = str1->str;
				3231	Py_UNICODE *s2 = str2->str;
				3232
				3233	len1 = str1->length;
				3234	len2 = str2->length;
				3235
				3236	while (len1 > 0 && len2 > 0) {
				3237	register long diff;
				3238
				3239	diff = (long)s1++ - (long)s2++;
				3240	if (diff)
				3241	return (diff < 0) ? -1 : (diff != 0);
				3242	len1--; len2--;
				3243	}
				3244
				3245	return (len1 < len2) ? -1 : (len1 != len2);
				3246	}
				3247
				3248	#endif
				3249
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3250	int PyUnicode_Compare(PyObject *left,
				3251	PyObject *right)
				3252	{
				3253	PyUnicodeObject u = NULL, v = NULL;
				3254	int result;
				3255
				3256	/* Coerce the two arguments */
				3257	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3258	if (u == NULL)
				3259	goto onError;
				3260	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3261	if (v == NULL)
				3262	goto onError;
				3263
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3264	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3265	if (v == u) {
				3266	Py_DECREF(u);
				3267	Py_DECREF(v);
				3268	return 0;
				3269	}
				3270
				3271	result = unicode_compare(u, v);
				3272
				3273	Py_DECREF(u);
				3274	Py_DECREF(v);
				3275	return result;
				3276
				3277	onError:
				3278	Py_XDECREF(u);
				3279	Py_XDECREF(v);
				3280	return -1;
				3281	}
				3282
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3283	int PyUnicode_Contains(PyObject *container,
				3284	PyObject *element)
				3285	{
				3286	PyUnicodeObject u = NULL, v = NULL;
				3287	int result;
				3288	register const Py_UNICODE p, e;
				3289	register Py_UNICODE ch;
				3290
				3291	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3292	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3293	if (v == NULL) {
				3294	PyErr_SetString(PyExc_TypeError,
				3295	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3296	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3297	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3298	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3299	if (u == NULL) {
				3300	Py_DECREF(v);
				3301	goto onError;
				3302	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3303
				3304	/* Check v in u */
				3305	if (PyUnicode_GET_SIZE(v) != 1) {
				3306	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3307	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3308	goto onError;
				3309	}
				3310	ch = *PyUnicode_AS_UNICODE(v);
				3311	p = PyUnicode_AS_UNICODE(u);
				3312	e = p + PyUnicode_GET_SIZE(u);
				3313	result = 0;
				3314	while (p < e) {
				3315	if (*p++ == ch) {
				3316	result = 1;
				3317	break;
				3318	}
				3319	}
				3320
				3321	Py_DECREF(u);
				3322	Py_DECREF(v);
				3323	return result;
				3324
				3325	onError:
				3326	Py_XDECREF(u);
				3327	Py_XDECREF(v);
				3328	return -1;
				3329	}
				3330
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3331	/* Concat to string or Unicode object giving a new Unicode object. */
				3332
				3333	PyObject PyUnicode_Concat(PyObject left,
				3334	PyObject *right)
				3335	{
				3336	PyUnicodeObject u = NULL, v = NULL, *w;
				3337
				3338	/* Coerce the two arguments */
				3339	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3340	if (u == NULL)
				3341	goto onError;
				3342	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3343	if (v == NULL)
				3344	goto onError;
				3345
				3346	/* Shortcuts */
				3347	if (v == unicode_empty) {
				3348	Py_DECREF(v);
				3349	return (PyObject *)u;
				3350	}
				3351	if (u == unicode_empty) {
				3352	Py_DECREF(u);
				3353	return (PyObject *)v;
				3354	}
				3355
				3356	/* Concat the two Unicode strings */
				3357	w = _PyUnicode_New(u->length + v->length);
				3358	if (w == NULL)
				3359	goto onError;
				3360	Py_UNICODE_COPY(w->str, u->str, u->length);
				3361	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3362
				3363	Py_DECREF(u);
				3364	Py_DECREF(v);
				3365	return (PyObject *)w;
				3366
				3367	onError:
				3368	Py_XDECREF(u);
				3369	Py_XDECREF(v);
				3370	return NULL;
				3371	}
				3372
				3373	static char count__doc__[] =
				3374	"S.count(sub[, start[, end]]) -> int\n\
				3375	\n\
				3376	Return the number of occurrences of substring sub in Unicode string\n\
				3377	S[start:end]. Optional arguments start and end are\n\
				3378	interpreted as in slice notation.";
				3379
				3380	static PyObject *
				3381	unicode_count(PyUnicodeObject self, PyObject args)
				3382	{
				3383	PyUnicodeObject *substring;
				3384	int start = 0;
				3385	int end = INT_MAX;
				3386	PyObject *result;
				3387
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3388	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3389	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3390	return NULL;
				3391
				3392	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3393	(PyObject *)substring);
				3394	if (substring == NULL)
				3395	return NULL;
				3396
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3397	if (start < 0)
				3398	start += self->length;
				3399	if (start < 0)
				3400	start = 0;
				3401	if (end > self->length)
				3402	end = self->length;
				3403	if (end < 0)
				3404	end += self->length;
				3405	if (end < 0)
				3406	end = 0;
				3407
				3408	result = PyInt_FromLong((long) count(self, start, end, substring));
				3409
				3410	Py_DECREF(substring);
				3411	return result;
				3412	}
				3413
				3414	static char encode__doc__[] =
				3415	"S.encode([encoding[,errors]]) -> string\n\
				3416	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3417	Return an encoded string version of S. Default encoding is the current\n\
				3418	default string encoding. errors may be given to set a different error\n\
				3419	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3420	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3421
				3422	static PyObject *
				3423	unicode_encode(PyUnicodeObject self, PyObject args)
				3424	{
				3425	char *encoding = NULL;
				3426	char *errors = NULL;
				3427	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3428	return NULL;
				3429	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3430	}
				3431
				3432	static char expandtabs__doc__[] =
				3433	"S.expandtabs([tabsize]) -> unicode\n\
				3434	\n\
				3435	Return a copy of S where all tab characters are expanded using spaces.\n\
				3436	If tabsize is not given, a tab size of 8 characters is assumed.";
				3437
				3438	static PyObject*
				3439	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3440	{
				3441	Py_UNICODE *e;
				3442	Py_UNICODE *p;
				3443	Py_UNICODE *q;
				3444	int i, j;
				3445	PyUnicodeObject *u;
				3446	int tabsize = 8;
				3447
				3448	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3449	return NULL;
				3450
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3451	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3452	i = j = 0;
				3453	e = self->str + self->length;
				3454	for (p = self->str; p < e; p++)
				3455	if (*p == '\t') {
				3456	if (tabsize > 0)
				3457	j += tabsize - (j % tabsize);
				3458	}
				3459	else {
				3460	j++;
				3461	if (p == '\n' \|\| p == '\r') {
				3462	i += j;
				3463	j = 0;
				3464	}
				3465	}
				3466
				3467	/* Second pass: create output string and fill it */
				3468	u = _PyUnicode_New(i + j);
				3469	if (!u)
				3470	return NULL;
				3471
				3472	j = 0;
				3473	q = u->str;
				3474
				3475	for (p = self->str; p < e; p++)
				3476	if (*p == '\t') {
				3477	if (tabsize > 0) {
				3478	i = tabsize - (j % tabsize);
				3479	j += i;
				3480	while (i--)
				3481	*q++ = ' ';
				3482	}
				3483	}
				3484	else {
				3485	j++;
				3486	q++ = p;
				3487	if (p == '\n' \|\| p == '\r')
				3488	j = 0;
				3489	}
				3490
				3491	return (PyObject*) u;
				3492	}
				3493
				3494	static char find__doc__[] =
				3495	"S.find(sub [,start [,end]]) -> int\n\
				3496	\n\
				3497	Return the lowest index in S where substring sub is found,\n\
				3498	such that sub is contained within s[start,end]. Optional\n\
				3499	arguments start and end are interpreted as in slice notation.\n\
				3500	\n\
				3501	Return -1 on failure.";
				3502
				3503	static PyObject *
				3504	unicode_find(PyUnicodeObject self, PyObject args)
				3505	{
				3506	PyUnicodeObject *substring;
				3507	int start = 0;
				3508	int end = INT_MAX;
				3509	PyObject *result;
				3510
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3511	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3512	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3513	return NULL;
				3514	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3515	(PyObject *)substring);
				3516	if (substring == NULL)
				3517	return NULL;
				3518
				3519	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3520
				3521	Py_DECREF(substring);
				3522	return result;
				3523	}
				3524
				3525	static PyObject *
				3526	unicode_getitem(PyUnicodeObject *self, int index)
				3527	{
				3528	if (index < 0 \|\| index >= self->length) {
				3529	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3530	return NULL;
				3531	}
				3532
				3533	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3534	}
				3535
				3536	static long
				3537	unicode_hash(PyUnicodeObject *self)
				3538	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3539	/* Since Unicode objects compare equal to their ASCII string
				3540	counterparts, they should use the individual character values
				3541	as basis for their hash value. This is needed to assure that
				3542	strings and Unicode objects behave in the same way as
				3543	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3544
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3545	register int len;
				3546	register Py_UNICODE *p;
				3547	register long x;
				3548
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3549	if (self->hash != -1)
				3550	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3551	len = PyUnicode_GET_SIZE(self);
				3552	p = PyUnicode_AS_UNICODE(self);
				3553	x = *p << 7;
				3554	while (--len >= 0)
				3555	x = (1000003x) ^ p++;
				3556	x ^= PyUnicode_GET_SIZE(self);
				3557	if (x == -1)
				3558	x = -2;
				3559	self->hash = x;
				3560	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3561	}
				3562
				3563	static char index__doc__[] =
				3564	"S.index(sub [,start [,end]]) -> int\n\
				3565	\n\
				3566	Like S.find() but raise ValueError when the substring is not found.";
				3567
				3568	static PyObject *
				3569	unicode_index(PyUnicodeObject self, PyObject args)
				3570	{
				3571	int result;
				3572	PyUnicodeObject *substring;
				3573	int start = 0;
				3574	int end = INT_MAX;
				3575
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3576	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3577	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3578	return NULL;
				3579
				3580	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3581	(PyObject *)substring);
				3582	if (substring == NULL)
				3583	return NULL;
				3584
				3585	result = findstring(self, substring, start, end, 1);
				3586
				3587	Py_DECREF(substring);
				3588	if (result < 0) {
				3589	PyErr_SetString(PyExc_ValueError, "substring not found");
				3590	return NULL;
				3591	}
				3592	return PyInt_FromLong(result);
				3593	}
				3594
				3595	static char islower__doc__[] =
				3596	"S.islower() -> int\n\
				3597	\n\
				3598	Return 1 if all cased characters in S are lowercase and there is\n\
				3599	at least one cased character in S, 0 otherwise.";
				3600
				3601	static PyObject*
				3602	unicode_islower(PyUnicodeObject self, PyObject args)
				3603	{
				3604	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3605	register const Py_UNICODE *e;
				3606	int cased;
				3607
				3608	if (!PyArg_NoArgs(args))
				3609	return NULL;
				3610
				3611	/* Shortcut for single character strings */
				3612	if (PyUnicode_GET_SIZE(self) == 1)
				3613	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3614
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3615	/* Special case for empty strings */
				3616	if (PyString_GET_SIZE(self) == 0)
				3617	return PyInt_FromLong(0);
				3618
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3619	e = p + PyUnicode_GET_SIZE(self);
				3620	cased = 0;
				3621	for (; p < e; p++) {
				3622	register const Py_UNICODE ch = *p;
				3623
				3624	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3625	return PyInt_FromLong(0);
				3626	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3627	cased = 1;
				3628	}
				3629	return PyInt_FromLong(cased);
				3630	}
				3631
				3632	static char isupper__doc__[] =
				3633	"S.isupper() -> int\n\
				3634	\n\
				3635	Return 1 if all cased characters in S are uppercase and there is\n\
				3636	at least one cased character in S, 0 otherwise.";
				3637
				3638	static PyObject*
				3639	unicode_isupper(PyUnicodeObject self, PyObject args)
				3640	{
				3641	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3642	register const Py_UNICODE *e;
				3643	int cased;
				3644
				3645	if (!PyArg_NoArgs(args))
				3646	return NULL;
				3647
				3648	/* Shortcut for single character strings */
				3649	if (PyUnicode_GET_SIZE(self) == 1)
				3650	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3651
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3652	/* Special case for empty strings */
				3653	if (PyString_GET_SIZE(self) == 0)
				3654	return PyInt_FromLong(0);
				3655
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3656	e = p + PyUnicode_GET_SIZE(self);
				3657	cased = 0;
				3658	for (; p < e; p++) {
				3659	register const Py_UNICODE ch = *p;
				3660
				3661	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3662	return PyInt_FromLong(0);
				3663	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3664	cased = 1;
				3665	}
				3666	return PyInt_FromLong(cased);
				3667	}
				3668
				3669	static char istitle__doc__[] =
				3670	"S.istitle() -> int\n\
				3671	\n\
				3672	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3673	may only follow uncased characters and lowercase characters only cased\n\
				3674	ones. Return 0 otherwise.";
				3675
				3676	static PyObject*
				3677	unicode_istitle(PyUnicodeObject self, PyObject args)
				3678	{
				3679	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3680	register const Py_UNICODE *e;
				3681	int cased, previous_is_cased;
				3682
				3683	if (!PyArg_NoArgs(args))
				3684	return NULL;
				3685
				3686	/* Shortcut for single character strings */
				3687	if (PyUnicode_GET_SIZE(self) == 1)
				3688	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3689	(Py_UNICODE_ISUPPER(*p) != 0));
				3690
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3691	/* Special case for empty strings */
				3692	if (PyString_GET_SIZE(self) == 0)
				3693	return PyInt_FromLong(0);
				3694
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3695	e = p + PyUnicode_GET_SIZE(self);
				3696	cased = 0;
				3697	previous_is_cased = 0;
				3698	for (; p < e; p++) {
				3699	register const Py_UNICODE ch = *p;
				3700
				3701	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3702	if (previous_is_cased)
				3703	return PyInt_FromLong(0);
				3704	previous_is_cased = 1;
				3705	cased = 1;
				3706	}
				3707	else if (Py_UNICODE_ISLOWER(ch)) {
				3708	if (!previous_is_cased)
				3709	return PyInt_FromLong(0);
				3710	previous_is_cased = 1;
				3711	cased = 1;
				3712	}
				3713	else
				3714	previous_is_cased = 0;
				3715	}
				3716	return PyInt_FromLong(cased);
				3717	}
				3718
				3719	static char isspace__doc__[] =
				3720	"S.isspace() -> int\n\
				3721	\n\
				3722	Return 1 if there are only whitespace characters in S,\n\
				3723	0 otherwise.";
				3724
				3725	static PyObject*
				3726	unicode_isspace(PyUnicodeObject self, PyObject args)
				3727	{
				3728	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3729	register const Py_UNICODE *e;
				3730
				3731	if (!PyArg_NoArgs(args))
				3732	return NULL;
				3733
				3734	/* Shortcut for single character strings */
				3735	if (PyUnicode_GET_SIZE(self) == 1 &&
				3736	Py_UNICODE_ISSPACE(*p))
				3737	return PyInt_FromLong(1);
				3738
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3739	/* Special case for empty strings */
				3740	if (PyString_GET_SIZE(self) == 0)
				3741	return PyInt_FromLong(0);
				3742
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3743	e = p + PyUnicode_GET_SIZE(self);
				3744	for (; p < e; p++) {
				3745	if (!Py_UNICODE_ISSPACE(*p))
				3746	return PyInt_FromLong(0);
				3747	}
				3748	return PyInt_FromLong(1);
				3749	}
				3750
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3751	static char isalpha__doc__[] =
				3752	"S.isalpha() -> int\n\
				3753	\n\
				3754	Return 1 if all characters in S are alphabetic\n\
				3755	and there is at least one character in S, 0 otherwise.";
				3756
				3757	static PyObject*
				3758	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3759	{
				3760	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3761	register const Py_UNICODE *e;
				3762
				3763	if (!PyArg_NoArgs(args))
				3764	return NULL;
				3765
				3766	/* Shortcut for single character strings */
				3767	if (PyUnicode_GET_SIZE(self) == 1 &&
				3768	Py_UNICODE_ISALPHA(*p))
				3769	return PyInt_FromLong(1);
				3770
				3771	/* Special case for empty strings */
				3772	if (PyString_GET_SIZE(self) == 0)
				3773	return PyInt_FromLong(0);
				3774
				3775	e = p + PyUnicode_GET_SIZE(self);
				3776	for (; p < e; p++) {
				3777	if (!Py_UNICODE_ISALPHA(*p))
				3778	return PyInt_FromLong(0);
				3779	}
				3780	return PyInt_FromLong(1);
				3781	}
				3782
				3783	static char isalnum__doc__[] =
				3784	"S.isalnum() -> int\n\
				3785	\n\
				3786	Return 1 if all characters in S are alphanumeric\n\
				3787	and there is at least one character in S, 0 otherwise.";
				3788
				3789	static PyObject*
				3790	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3791	{
				3792	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3793	register const Py_UNICODE *e;
				3794
				3795	if (!PyArg_NoArgs(args))
				3796	return NULL;
				3797
				3798	/* Shortcut for single character strings */
				3799	if (PyUnicode_GET_SIZE(self) == 1 &&
				3800	Py_UNICODE_ISALNUM(*p))
				3801	return PyInt_FromLong(1);
				3802
				3803	/* Special case for empty strings */
				3804	if (PyString_GET_SIZE(self) == 0)
				3805	return PyInt_FromLong(0);
				3806
				3807	e = p + PyUnicode_GET_SIZE(self);
				3808	for (; p < e; p++) {
				3809	if (!Py_UNICODE_ISALNUM(*p))
				3810	return PyInt_FromLong(0);
				3811	}
				3812	return PyInt_FromLong(1);
				3813	}
				3814
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3815	static char isdecimal__doc__[] =
				3816	"S.isdecimal() -> int\n\
				3817	\n\
				3818	Return 1 if there are only decimal characters in S,\n\
				3819	0 otherwise.";
				3820
				3821	static PyObject*
				3822	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3823	{
				3824	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3825	register const Py_UNICODE *e;
				3826
				3827	if (!PyArg_NoArgs(args))
				3828	return NULL;
				3829
				3830	/* Shortcut for single character strings */
				3831	if (PyUnicode_GET_SIZE(self) == 1 &&
				3832	Py_UNICODE_ISDECIMAL(*p))
				3833	return PyInt_FromLong(1);
				3834
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3835	/* Special case for empty strings */
				3836	if (PyString_GET_SIZE(self) == 0)
				3837	return PyInt_FromLong(0);
				3838
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3839	e = p + PyUnicode_GET_SIZE(self);
				3840	for (; p < e; p++) {
				3841	if (!Py_UNICODE_ISDECIMAL(*p))
				3842	return PyInt_FromLong(0);
				3843	}
				3844	return PyInt_FromLong(1);
				3845	}
				3846
				3847	static char isdigit__doc__[] =
				3848	"S.isdigit() -> int\n\
				3849	\n\
				3850	Return 1 if there are only digit characters in S,\n\
				3851	0 otherwise.";
				3852
				3853	static PyObject*
				3854	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3855	{
				3856	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3857	register const Py_UNICODE *e;
				3858
				3859	if (!PyArg_NoArgs(args))
				3860	return NULL;
				3861
				3862	/* Shortcut for single character strings */
				3863	if (PyUnicode_GET_SIZE(self) == 1 &&
				3864	Py_UNICODE_ISDIGIT(*p))
				3865	return PyInt_FromLong(1);
				3866
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3867	/* Special case for empty strings */
				3868	if (PyString_GET_SIZE(self) == 0)
				3869	return PyInt_FromLong(0);
				3870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3871	e = p + PyUnicode_GET_SIZE(self);
				3872	for (; p < e; p++) {
				3873	if (!Py_UNICODE_ISDIGIT(*p))
				3874	return PyInt_FromLong(0);
				3875	}
				3876	return PyInt_FromLong(1);
				3877	}
				3878
				3879	static char isnumeric__doc__[] =
				3880	"S.isnumeric() -> int\n\
				3881	\n\
				3882	Return 1 if there are only numeric characters in S,\n\
				3883	0 otherwise.";
				3884
				3885	static PyObject*
				3886	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3887	{
				3888	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3889	register const Py_UNICODE *e;
				3890
				3891	if (!PyArg_NoArgs(args))
				3892	return NULL;
				3893
				3894	/* Shortcut for single character strings */
				3895	if (PyUnicode_GET_SIZE(self) == 1 &&
				3896	Py_UNICODE_ISNUMERIC(*p))
				3897	return PyInt_FromLong(1);
				3898
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3899	/* Special case for empty strings */
				3900	if (PyString_GET_SIZE(self) == 0)
				3901	return PyInt_FromLong(0);
				3902
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3903	e = p + PyUnicode_GET_SIZE(self);
				3904	for (; p < e; p++) {
				3905	if (!Py_UNICODE_ISNUMERIC(*p))
				3906	return PyInt_FromLong(0);
				3907	}
				3908	return PyInt_FromLong(1);
				3909	}
				3910
				3911	static char join__doc__[] =
				3912	"S.join(sequence) -> unicode\n\
				3913	\n\
				3914	Return a string which is the concatenation of the strings in the\n\
				3915	sequence. The separator between elements is S.";
				3916
				3917	static PyObject*
				3918	unicode_join(PyUnicodeObject self, PyObject args)
				3919	{
				3920	PyObject *data;
				3921	if (!PyArg_ParseTuple(args, "O:join", &data))
				3922	return NULL;
				3923
				3924	return PyUnicode_Join((PyObject *)self, data);
				3925	}
				3926
				3927	static int
				3928	unicode_length(PyUnicodeObject *self)
				3929	{
				3930	return self->length;
				3931	}
				3932
				3933	static char ljust__doc__[] =
				3934	"S.ljust(width) -> unicode\n\
				3935	\n\
				3936	Return S left justified in a Unicode string of length width. Padding is\n\
				3937	done using spaces.";
				3938
				3939	static PyObject *
				3940	unicode_ljust(PyUnicodeObject self, PyObject args)
				3941	{
				3942	int width;
				3943	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3944	return NULL;
				3945
				3946	if (self->length >= width) {
				3947	Py_INCREF(self);
				3948	return (PyObject*) self;
				3949	}
				3950
				3951	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3952	}
				3953
				3954	static char lower__doc__[] =
				3955	"S.lower() -> unicode\n\
				3956	\n\
				3957	Return a copy of the string S converted to lowercase.";
				3958
				3959	static PyObject*
				3960	unicode_lower(PyUnicodeObject self, PyObject args)
				3961	{
				3962	if (!PyArg_NoArgs(args))
				3963	return NULL;
				3964	return fixup(self, fixlower);
				3965	}
				3966
				3967	static char lstrip__doc__[] =
				3968	"S.lstrip() -> unicode\n\
				3969	\n\
				3970	Return a copy of the string S with leading whitespace removed.";
				3971
				3972	static PyObject *
				3973	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3974	{
				3975	if (!PyArg_NoArgs(args))
				3976	return NULL;
				3977	return strip(self, 1, 0);
				3978	}
				3979
				3980	static PyObject*
				3981	unicode_repeat(PyUnicodeObject *str, int len)
				3982	{
				3983	PyUnicodeObject *u;
				3984	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3985	int nchars;
				3986	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3987
				3988	if (len < 0)
				3989	len = 0;
				3990
				3991	if (len == 1) {
				3992	/* no repeat, return original string */
				3993	Py_INCREF(str);
				3994	return (PyObject*) str;
				3995	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	3996
				3997	/* ensure # of chars needed doesn't overflow int and # of bytes
				3998	* needed doesn't overflow size_t
				3999	*/
				4000	nchars = len * str->length;
				4001	if (len && nchars / len != str->length) {
				4002	PyErr_SetString(PyExc_OverflowError,
				4003	"repeated string is too long");
				4004	return NULL;
				4005	}
				4006	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4007	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4008	PyErr_SetString(PyExc_OverflowError,
				4009	"repeated string is too long");
				4010	return NULL;
				4011	}
				4012	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4013	if (!u)
				4014	return NULL;
				4015
				4016	p = u->str;
				4017
				4018	while (len-- > 0) {
				4019	Py_UNICODE_COPY(p, str->str, str->length);
				4020	p += str->length;
				4021	}
				4022
				4023	return (PyObject*) u;
				4024	}
				4025
				4026	PyObject PyUnicode_Replace(PyObject obj,
				4027	PyObject *subobj,
				4028	PyObject *replobj,
				4029	int maxcount)
				4030	{
				4031	PyObject *self;
				4032	PyObject *str1;
				4033	PyObject *str2;
				4034	PyObject *result;
				4035
				4036	self = PyUnicode_FromObject(obj);
				4037	if (self == NULL)
				4038	return NULL;
				4039	str1 = PyUnicode_FromObject(subobj);
				4040	if (str1 == NULL) {
				4041	Py_DECREF(self);
				4042	return NULL;
				4043	}
				4044	str2 = PyUnicode_FromObject(replobj);
				4045	if (str2 == NULL) {
				4046	Py_DECREF(self);
				4047	Py_DECREF(str1);
				4048	return NULL;
				4049	}
				4050	result = replace((PyUnicodeObject *)self,
				4051	(PyUnicodeObject *)str1,
				4052	(PyUnicodeObject *)str2,
				4053	maxcount);
				4054	Py_DECREF(self);
				4055	Py_DECREF(str1);
				4056	Py_DECREF(str2);
				4057	return result;
				4058	}
				4059
				4060	static char replace__doc__[] =
				4061	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4062	\n\
				4063	Return a copy of S with all occurrences of substring\n\
				4064	old replaced by new. If the optional argument maxsplit is\n\
				4065	given, only the first maxsplit occurrences are replaced.";
				4066
				4067	static PyObject*
				4068	unicode_replace(PyUnicodeObject self, PyObject args)
				4069	{
				4070	PyUnicodeObject *str1;
				4071	PyUnicodeObject *str2;
				4072	int maxcount = -1;
				4073	PyObject *result;
				4074
				4075	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4076	return NULL;
				4077	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4078	if (str1 == NULL)
				4079	return NULL;
				4080	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4081	if (str2 == NULL)
				4082	return NULL;
				4083
				4084	result = replace(self, str1, str2, maxcount);
				4085
				4086	Py_DECREF(str1);
				4087	Py_DECREF(str2);
				4088	return result;
				4089	}
				4090
				4091	static
				4092	PyObject unicode_repr(PyObject unicode)
				4093	{
				4094	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4095	PyUnicode_GET_SIZE(unicode),
				4096	1);
				4097	}
				4098
				4099	static char rfind__doc__[] =
				4100	"S.rfind(sub [,start [,end]]) -> int\n\
				4101	\n\
				4102	Return the highest index in S where substring sub is found,\n\
				4103	such that sub is contained within s[start,end]. Optional\n\
				4104	arguments start and end are interpreted as in slice notation.\n\
				4105	\n\
				4106	Return -1 on failure.";
				4107
				4108	static PyObject *
				4109	unicode_rfind(PyUnicodeObject self, PyObject args)
				4110	{
				4111	PyUnicodeObject *substring;
				4112	int start = 0;
				4113	int end = INT_MAX;
				4114	PyObject *result;
				4115
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4116	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4117	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4118	return NULL;
				4119	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4120	(PyObject *)substring);
				4121	if (substring == NULL)
				4122	return NULL;
				4123
				4124	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4125
				4126	Py_DECREF(substring);
				4127	return result;
				4128	}
				4129
				4130	static char rindex__doc__[] =
				4131	"S.rindex(sub [,start [,end]]) -> int\n\
				4132	\n\
				4133	Like S.rfind() but raise ValueError when the substring is not found.";
				4134
				4135	static PyObject *
				4136	unicode_rindex(PyUnicodeObject self, PyObject args)
				4137	{
				4138	int result;
				4139	PyUnicodeObject *substring;
				4140	int start = 0;
				4141	int end = INT_MAX;
				4142
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4143	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4144	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4145	return NULL;
				4146	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4147	(PyObject *)substring);
				4148	if (substring == NULL)
				4149	return NULL;
				4150
				4151	result = findstring(self, substring, start, end, -1);
				4152
				4153	Py_DECREF(substring);
				4154	if (result < 0) {
				4155	PyErr_SetString(PyExc_ValueError, "substring not found");
				4156	return NULL;
				4157	}
				4158	return PyInt_FromLong(result);
				4159	}
				4160
				4161	static char rjust__doc__[] =
				4162	"S.rjust(width) -> unicode\n\
				4163	\n\
				4164	Return S right justified in a Unicode string of length width. Padding is\n\
				4165	done using spaces.";
				4166
				4167	static PyObject *
				4168	unicode_rjust(PyUnicodeObject self, PyObject args)
				4169	{
				4170	int width;
				4171	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4172	return NULL;
				4173
				4174	if (self->length >= width) {
				4175	Py_INCREF(self);
				4176	return (PyObject*) self;
				4177	}
				4178
				4179	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4180	}
				4181
				4182	static char rstrip__doc__[] =
				4183	"S.rstrip() -> unicode\n\
				4184	\n\
				4185	Return a copy of the string S with trailing whitespace removed.";
				4186
				4187	static PyObject *
				4188	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4189	{
				4190	if (!PyArg_NoArgs(args))
				4191	return NULL;
				4192	return strip(self, 0, 1);
				4193	}
				4194
				4195	static PyObject*
				4196	unicode_slice(PyUnicodeObject *self, int start, int end)
				4197	{
				4198	/* standard clamping */
				4199	if (start < 0)
				4200	start = 0;
				4201	if (end < 0)
				4202	end = 0;
				4203	if (end > self->length)
				4204	end = self->length;
				4205	if (start == 0 && end == self->length) {
				4206	/* full slice, return original string */
				4207	Py_INCREF(self);
				4208	return (PyObject*) self;
				4209	}
				4210	if (start > end)
				4211	start = end;
				4212	/* copy slice */
				4213	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4214	end - start);
				4215	}
				4216
				4217	PyObject PyUnicode_Split(PyObject s,
				4218	PyObject *sep,
				4219	int maxsplit)
				4220	{
				4221	PyObject *result;
				4222
				4223	s = PyUnicode_FromObject(s);
				4224	if (s == NULL)
				4225	return NULL;
				4226	if (sep != NULL) {
				4227	sep = PyUnicode_FromObject(sep);
				4228	if (sep == NULL) {
				4229	Py_DECREF(s);
				4230	return NULL;
				4231	}
				4232	}
				4233
				4234	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4235
				4236	Py_DECREF(s);
				4237	Py_XDECREF(sep);
				4238	return result;
				4239	}
				4240
				4241	static char split__doc__[] =
				4242	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4243	\n\
				4244	Return a list of the words in S, using sep as the\n\
				4245	delimiter string. If maxsplit is given, at most maxsplit\n\
				4246	splits are done. If sep is not specified, any whitespace string\n\
				4247	is a separator.";
				4248
				4249	static PyObject*
				4250	unicode_split(PyUnicodeObject self, PyObject args)
				4251	{
				4252	PyObject *substring = Py_None;
				4253	int maxcount = -1;
				4254
				4255	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4256	return NULL;
				4257
				4258	if (substring == Py_None)
				4259	return split(self, NULL, maxcount);
				4260	else if (PyUnicode_Check(substring))
				4261	return split(self, (PyUnicodeObject *)substring, maxcount);
				4262	else
				4263	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4264	}
				4265
				4266	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4267	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4268	\n\
				4269	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4270	Line breaks are not included in the resulting list unless keepends\n\
				4271	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4272
				4273	static PyObject*
				4274	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4275	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4276	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4277
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4278	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4279	return NULL;
				4280
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4281	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4282	}
				4283
				4284	static
				4285	PyObject unicode_str(PyUnicodeObject self)
				4286	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4287	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4288	}
				4289
				4290	static char strip__doc__[] =
				4291	"S.strip() -> unicode\n\
				4292	\n\
				4293	Return a copy of S with leading and trailing whitespace removed.";
				4294
				4295	static PyObject *
				4296	unicode_strip(PyUnicodeObject self, PyObject args)
				4297	{
				4298	if (!PyArg_NoArgs(args))
				4299	return NULL;
				4300	return strip(self, 1, 1);
				4301	}
				4302
				4303	static char swapcase__doc__[] =
				4304	"S.swapcase() -> unicode\n\
				4305	\n\
				4306	Return a copy of S with uppercase characters converted to lowercase\n\
				4307	and vice versa.";
				4308
				4309	static PyObject*
				4310	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4311	{
				4312	if (!PyArg_NoArgs(args))
				4313	return NULL;
				4314	return fixup(self, fixswapcase);
				4315	}
				4316
				4317	static char translate__doc__[] =
				4318	"S.translate(table) -> unicode\n\
				4319	\n\
				4320	Return a copy of the string S, where all characters have been mapped\n\
				4321	through the given translation table, which must be a mapping of\n\
				4322	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4323	are left untouched. Characters mapped to None are deleted.";
				4324
				4325	static PyObject*
				4326	unicode_translate(PyUnicodeObject self, PyObject args)
				4327	{
				4328	PyObject *table;
				4329
				4330	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4331	return NULL;
				4332	return PyUnicode_TranslateCharmap(self->str,
				4333	self->length,
				4334	table,
				4335	"ignore");
				4336	}
				4337
				4338	static char upper__doc__[] =
				4339	"S.upper() -> unicode\n\
				4340	\n\
				4341	Return a copy of S converted to uppercase.";
				4342
				4343	static PyObject*
				4344	unicode_upper(PyUnicodeObject self, PyObject args)
				4345	{
				4346	if (!PyArg_NoArgs(args))
				4347	return NULL;
				4348	return fixup(self, fixupper);
				4349	}
				4350
				#if 0
				/* Docstring for unicode.zfill() (compiled out). */
				static char zfill__doc__[] =
				    "S.zfill(width) -> unicode\n"
				    "\n"
				    "Pad a numeric string x with zeros on the left, to fill a field\n"
				    "of the specified width. The string x is never truncated.";

				/*
				 * unicode.zfill(width) (compiled out): left-pads with '0' via pad()
				 * and moves a leading '+' or '-' to the front of the result.
				 *
				 * When self is already at least width long, self is returned with an
				 * extra reference. Returns NULL when argument parsing fails or pad()
				 * returns NULL.
				 */
				static PyObject *
				unicode_zfill(PyUnicodeObject *self, PyObject *args)
				{
				    int fill;
				    int width;
				    PyUnicodeObject *u;

				    if (!PyArg_ParseTuple(args, "i:zfill", &width))
				        return NULL;

				    if (self->length >= width) {
				        Py_INCREF(self);
				        return (PyObject *)self;
				    }

				    fill = width - self->length;

				    u = pad(self, fill, 0, '0');
				    /* pad() is defined outside this chunk; guard in case it can fail
				       rather than dereferencing a NULL result below */
				    if (u == NULL)
				        return NULL;

				    if (u->str[fill] == '+' || u->str[fill] == '-') {
				        /* move sign to beginning of string */
				        u->str[0] = u->str[fill];
				        u->str[fill] = '0';
				    }

				    return (PyObject *)u;
				}
				#endif
				4386
				#if 0
				/*
				 * Debugging helper (compiled out): returns unicode_freelist_size as an
				 * int object. Takes no arguments; returns NULL when PyArg_NoArgs()
				 * rejects args.
				 */
				static PyObject *
				unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
				{
				    if (!PyArg_NoArgs(args))
				        return NULL;

				    return PyInt_FromLong(unicode_freelist_size);
				}
				#endif
				4396
				4397	static char startswith__doc__[] =
				4398	"S.startswith(prefix[, start[, end]]) -> int\n\
				4399	\n\
				4400	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4401	optional start, test S beginning at that position. With optional end, stop\n\
				4402	comparing S at that position.";
				4403
				4404	static PyObject *
				4405	unicode_startswith(PyUnicodeObject *self,
				4406	PyObject *args)
				4407	{
				4408	PyUnicodeObject *substring;
				4409	int start = 0;
				4410	int end = INT_MAX;
				4411	PyObject *result;
				4412
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4413	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4414	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4415	return NULL;
				4416	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4417	(PyObject *)substring);
				4418	if (substring == NULL)
				4419	return NULL;
				4420
				4421	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4422
				4423	Py_DECREF(substring);
				4424	return result;
				4425	}
				4426
				4427
				4428	static char endswith__doc__[] =
				4429	"S.endswith(suffix[, start[, end]]) -> int\n\
				4430	\n\
				4431	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4432	optional start, test S beginning at that position. With optional end, stop\n\
				4433	comparing S at that position.";
				4434
				4435	static PyObject *
				4436	unicode_endswith(PyUnicodeObject *self,
				4437	PyObject *args)
				4438	{
				4439	PyUnicodeObject *substring;
				4440	int start = 0;
				4441	int end = INT_MAX;
				4442	PyObject *result;
				4443
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4444	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4445	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4446	return NULL;
				4447	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4448	(PyObject *)substring);
				4449	if (substring == NULL)
				4450	return NULL;
				4451
				4452	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4453
				4454	Py_DECREF(substring);
				4455	return result;
				4456	}
				4457
				4458
				4459	static PyMethodDef unicode_methods[] = {
				4460
				4461	/* Order is according to common usage: often used methods should
				4462	appear first, since lookup is done sequentially. */
				4463
				4464	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4465	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4466	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4467	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4468	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4469	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4470	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4471	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4472	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4473	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4474	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4475	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4476	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4477	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4478	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4479	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4480	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4481	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4482	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4483	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4484	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4485	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4486	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4487	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4488	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4489	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4490	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4491	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4492	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4493	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4494	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4495	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4496	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4497	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4498	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4499	#if 0
				4500	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4501	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4502	#endif
				4503
				4504	#if 0
				4505	/* This one is just used for debugging the implementation. */
				4506	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4507	#endif
				4508
				4509	{NULL, NULL}
				4510	};
				4511
				4512	static PyObject *
				4513	unicode_getattr(PyUnicodeObject self, char name)
				4514	{
				4515	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4516	}
				4517
				4518	static PySequenceMethods unicode_as_sequence = {
				4519	(inquiry) unicode_length, /* sq_length */
				4520	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4521	(intargfunc) unicode_repeat, /* sq_repeat */
				4522	(intargfunc) unicode_getitem, /* sq_item */
				4523	(intintargfunc) unicode_slice, /* sq_slice */
				4524	0, /* sq_ass_item */
				4525	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4526	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4527	};
				4528
				4529	static int
				4530	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4531	int index,
				4532	const void **ptr)
				4533	{
				4534	if (index != 0) {
				4535	PyErr_SetString(PyExc_SystemError,
				4536	"accessing non-existent unicode segment");
				4537	return -1;
				4538	}
				4539	ptr = (void ) self->str;
				4540	return PyUnicode_GET_DATA_SIZE(self);
				4541	}
				4542
				4543	static int
				4544	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4545	const void **ptr)
				4546	{
				4547	PyErr_SetString(PyExc_TypeError,
				4548	"cannot use unicode as modifyable buffer");
				4549	return -1;
				4550	}
				4551
				4552	static int
				4553	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4554	int *lenp)
				4555	{
				4556	if (lenp)
				4557	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4558	return 1;
				4559	}
				4560
				4561	static int
				4562	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4563	int index,
				4564	const void **ptr)
				4565	{
				4566	PyObject *str;
				4567
				4568	if (index != 0) {
				4569	PyErr_SetString(PyExc_SystemError,
				4570	"accessing non-existent unicode segment");
				4571	return -1;
				4572	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4573	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4574	if (str == NULL)
				4575	return -1;
				4576	ptr = (void ) PyString_AS_STRING(str);
				4577	return PyString_GET_SIZE(str);
				4578	}
				4579
				4580	/* Helpers for PyUnicode_Format() */
				4581
				4582	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4583	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4584	{
				4585	int argidx = *p_argidx;
				4586	if (argidx < arglen) {
				4587	(*p_argidx)++;
				4588	if (arglen < 0)
				4589	return args;
				4590	else
				4591	return PyTuple_GetItem(args, argidx);
				4592	}
				4593	PyErr_SetString(PyExc_TypeError,
				4594	"not enough arguments for format string");
				4595	return NULL;
				4596	}
				4597
				4598	#define F_LJUST (1<<0)
				4599	#define F_SIGN (1<<1)
				4600	#define F_BLANK (1<<2)
				4601	#define F_ALT (1<<3)
				4602	#define F_ZERO (1<<4)
				4603
				4604	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4605	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4606	{
				4607	register int i;
				4608	int len;
				4609	va_list va;
				4610	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4611	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4612
				4613	/* First, format the string as char array, then expand to Py_UNICODE
				4614	array. */
				4615	charbuffer = (char *)buffer;
				4616	len = vsprintf(charbuffer, format, va);
				4617	for (i = len - 1; i >= 0; i--)
				4618	buffer[i] = (Py_UNICODE) charbuffer[i];
				4619
				4620	va_end(va);
				4621	return len;
				4622	}
				4623
				4624	static int
				4625	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4626	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4627	int flags,
				4628	int prec,
				4629	int type,
				4630	PyObject *v)
				4631	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4632	/* fmt = '%#.' + `prec` + `type`
				4633	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4634	char fmt[20];
				4635	double x;
				4636
				4637	x = PyFloat_AsDouble(v);
				4638	if (x == -1.0 && PyErr_Occurred())
				4639	return -1;
				4640	if (prec < 0)
				4641	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4642	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4643	type = 'g';
				4644	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4645	/* worst case length calc to ensure no buffer overrun:
				4646	fmt = %#.<prec>g
				4647	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4648	for any double rep.)
				4649	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4650	If prec=0 the effective precision is 1 (the leading digit is
				4651	always given), therefore increase by one to 10+prec. */
				4652	if (buflen <= (size_t)10 + (size_t)prec) {
				4653	PyErr_SetString(PyExc_OverflowError,
				4654	"formatted float is too long (precision too long?)");
				4655	return -1;
				4656	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4657	return usprintf(buf, fmt, x);
				4658	}
				4659
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4660	static PyObject*
				4661	formatlong(PyObject *val, int flags, int prec, int type)
				4662	{
				4663	char *buf;
				4664	int i, len;
				4665	PyObject str; / temporary string object. */
				4666	PyUnicodeObject *result;
				4667
				4668	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4669	if (!str)
				4670	return NULL;
				4671	result = _PyUnicode_New(len);
				4672	for (i = 0; i < len; i++)
				4673	result->str[i] = buf[i];
				4674	result->str[len] = 0;
				4675	Py_DECREF(str);
				4676	return (PyObject*)result;
				4677	}
				4678
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4679	static int
				4680	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4681	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4682	int flags,
				4683	int prec,
				4684	int type,
				4685	PyObject *v)
				4686	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4687	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4688	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4689	+ 1 + 1 = 24*/
				4690	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4691	long x;
				4692
				4693	x = PyInt_AsLong(v);
				4694	if (x == -1 && PyErr_Occurred())
				4695	return -1;
				4696	if (prec < 0)
				4697	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4698	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4699	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4700	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4701	PyErr_SetString(PyExc_OverflowError,
				4702	"formatted integer is too long (precision too long?)");
				4703	return -1;
				4704	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4705	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4706	return usprintf(buf, fmt, x);
				4707	}
				4708
				4709	static int
				4710	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4711	size_t buflen,
				4712	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4713	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4714	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4715	if (PyUnicode_Check(v)) {
				4716	if (PyUnicode_GET_SIZE(v) != 1)
				4717	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4718	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4719	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4720
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4721	else if (PyString_Check(v)) {
				4722	if (PyString_GET_SIZE(v) != 1)
				4723	goto onError;
				4724	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4725	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4726
				4727	else {
				4728	/* Integer input truncated to a character */
				4729	long x;
				4730	x = PyInt_AsLong(v);
				4731	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4732	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733	buf[0] = (char) x;
				4734	}
				4735	buf[1] = '\0';
				4736	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4737
				4738	onError:
				4739	PyErr_SetString(PyExc_TypeError,
				4740	"%c requires int or char");
				4741	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4742	}
				4743
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer used to
   format floats, ints and chars.

   XXX This is a magic number.  Each formatting routine performs its own
   bounds check so that the buffer cannot overflow; allocating a buffer
   of the right size for every conversion might be a better approach,
   but the fixed size is sufficient for now.
*/
#define FORMATBUFLEN ((size_t)120)
				4753
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4754	PyObject PyUnicode_Format(PyObject format,
				4755	PyObject *args)
				4756	{
				4757	Py_UNICODE fmt, res;
				4758	int fmtcnt, rescnt, reslen, arglen, argidx;
				4759	int args_owned = 0;
				4760	PyUnicodeObject *result = NULL;
				4761	PyObject *dict = NULL;
				4762	PyObject *uformat;
				4763
				4764	if (format == NULL \|\| args == NULL) {
				4765	PyErr_BadInternalCall();
				4766	return NULL;
				4767	}
				4768	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4769	if (uformat == NULL)
				4770	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4771	fmt = PyUnicode_AS_UNICODE(uformat);
				4772	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4773
				4774	reslen = rescnt = fmtcnt + 100;
				4775	result = _PyUnicode_New(reslen);
				4776	if (result == NULL)
				4777	goto onError;
				4778	res = PyUnicode_AS_UNICODE(result);
				4779
				4780	if (PyTuple_Check(args)) {
				4781	arglen = PyTuple_Size(args);
				4782	argidx = 0;
				4783	}
				4784	else {
				4785	arglen = -1;
				4786	argidx = -2;
				4787	}
				4788	if (args->ob_type->tp_as_mapping)
				4789	dict = args;
				4790
				4791	while (--fmtcnt >= 0) {
				4792	if (*fmt != '%') {
				4793	if (--rescnt < 0) {
				4794	rescnt = fmtcnt + 100;
				4795	reslen += rescnt;
				4796	if (_PyUnicode_Resize(result, reslen) < 0)
				4797	return NULL;
				4798	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4799	--rescnt;
				4800	}
				4801	res++ = fmt++;
				4802	}
				4803	else {
				4804	/* Got a format specifier */
				4805	int flags = 0;
				4806	int width = -1;
				4807	int prec = -1;
				4808	int size = 0;
				4809	Py_UNICODE c = '\0';
				4810	Py_UNICODE fill;
				4811	PyObject *v = NULL;
				4812	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4813	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4814	Py_UNICODE sign;
				4815	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4816	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4817
				4818	fmt++;
				4819	if (*fmt == '(') {
				4820	Py_UNICODE *keystart;
				4821	int keylen;
				4822	PyObject *key;
				4823	int pcount = 1;
				4824
				4825	if (dict == NULL) {
				4826	PyErr_SetString(PyExc_TypeError,
				4827	"format requires a mapping");
				4828	goto onError;
				4829	}
				4830	++fmt;
				4831	--fmtcnt;
				4832	keystart = fmt;
				4833	/* Skip over balanced parentheses */
				4834	while (pcount > 0 && --fmtcnt >= 0) {
				4835	if (*fmt == ')')
				4836	--pcount;
				4837	else if (*fmt == '(')
				4838	++pcount;
				4839	fmt++;
				4840	}
				4841	keylen = fmt - keystart - 1;
				4842	if (fmtcnt < 0 \|\| pcount > 0) {
				4843	PyErr_SetString(PyExc_ValueError,
				4844	"incomplete format key");
				4845	goto onError;
				4846	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4847	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4848	then looked up since Python uses strings to hold
				4849	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4850	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4851	key = PyUnicode_EncodeUTF8(keystart,
				4852	keylen,
				4853	NULL);
				4854	if (key == NULL)
				4855	goto onError;
				4856	if (args_owned) {
				4857	Py_DECREF(args);
				4858	args_owned = 0;
				4859	}
				4860	args = PyObject_GetItem(dict, key);
				4861	Py_DECREF(key);
				4862	if (args == NULL) {
				4863	goto onError;
				4864	}
				4865	args_owned = 1;
				4866	arglen = -1;
				4867	argidx = -2;
				4868	}
				4869	while (--fmtcnt >= 0) {
				4870	switch (c = *fmt++) {
				4871	case '-': flags \|= F_LJUST; continue;
				4872	case '+': flags \|= F_SIGN; continue;
				4873	case ' ': flags \|= F_BLANK; continue;
				4874	case '#': flags \|= F_ALT; continue;
				4875	case '0': flags \|= F_ZERO; continue;
				4876	}
				4877	break;
				4878	}
				4879	if (c == '*') {
				4880	v = getnextarg(args, arglen, &argidx);
				4881	if (v == NULL)
				4882	goto onError;
				4883	if (!PyInt_Check(v)) {
				4884	PyErr_SetString(PyExc_TypeError,
				4885	"* wants int");
				4886	goto onError;
				4887	}
				4888	width = PyInt_AsLong(v);
				4889	if (width < 0) {
				4890	flags \|= F_LJUST;
				4891	width = -width;
				4892	}
				4893	if (--fmtcnt >= 0)
				4894	c = *fmt++;
				4895	}
				4896	else if (c >= '0' && c <= '9') {
				4897	width = c - '0';
				4898	while (--fmtcnt >= 0) {
				4899	c = *fmt++;
				4900	if (c < '0' \|\| c > '9')
				4901	break;
				4902	if ((width*10) / 10 != width) {
				4903	PyErr_SetString(PyExc_ValueError,
				4904	"width too big");
				4905	goto onError;
				4906	}
				4907	width = width*10 + (c - '0');
				4908	}
				4909	}
				4910	if (c == '.') {
				4911	prec = 0;
				4912	if (--fmtcnt >= 0)
				4913	c = *fmt++;
				4914	if (c == '*') {
				4915	v = getnextarg(args, arglen, &argidx);
				4916	if (v == NULL)
				4917	goto onError;
				4918	if (!PyInt_Check(v)) {
				4919	PyErr_SetString(PyExc_TypeError,
				4920	"* wants int");
				4921	goto onError;
				4922	}
				4923	prec = PyInt_AsLong(v);
				4924	if (prec < 0)
				4925	prec = 0;
				4926	if (--fmtcnt >= 0)
				4927	c = *fmt++;
				4928	}
				4929	else if (c >= '0' && c <= '9') {
				4930	prec = c - '0';
				4931	while (--fmtcnt >= 0) {
				4932	c = Py_CHARMASK(*fmt++);
				4933	if (c < '0' \|\| c > '9')
				4934	break;
				4935	if ((prec*10) / 10 != prec) {
				4936	PyErr_SetString(PyExc_ValueError,
				4937	"prec too big");
				4938	goto onError;
				4939	}
				4940	prec = prec*10 + (c - '0');
				4941	}
				4942	}
				4943	} /* prec */
				4944	if (fmtcnt >= 0) {
				4945	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4946	size = c;
				4947	if (--fmtcnt >= 0)
				4948	c = *fmt++;
				4949	}
				4950	}
				4951	if (fmtcnt < 0) {
				4952	PyErr_SetString(PyExc_ValueError,
				4953	"incomplete format");
				4954	goto onError;
				4955	}
				4956	if (c != '%') {
				4957	v = getnextarg(args, arglen, &argidx);
				4958	if (v == NULL)
				4959	goto onError;
				4960	}
				4961	sign = 0;
				4962	fill = ' ';
				4963	switch (c) {
				4964
				4965	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4966	pbuf = formatbuf;
				4967	/* presume that buffer length is at least 1 */
				4968	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4969	len = 1;
				4970	break;
				4971
				4972	case 's':
				4973	case 'r':
				4974	if (PyUnicode_Check(v) && c == 's') {
				4975	temp = v;
				4976	Py_INCREF(temp);
				4977	}
				4978	else {
				4979	PyObject *unicode;
				4980	if (c == 's')
				4981	temp = PyObject_Str(v);
				4982	else
				4983	temp = PyObject_Repr(v);
				4984	if (temp == NULL)
				4985	goto onError;
				4986	if (!PyString_Check(temp)) {
				4987	/* XXX Note: this should never happen, since
				4988	PyObject_Repr() and PyObject_Str() assure
				4989	this */
				4990	Py_DECREF(temp);
				4991	PyErr_SetString(PyExc_TypeError,
				4992	"%s argument has non-string str()");
				4993	goto onError;
				4994	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4995	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4996	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4997	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4998	"strict");
				4999	Py_DECREF(temp);
				5000	temp = unicode;
				5001	if (temp == NULL)
				5002	goto onError;
				5003	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5004	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5005	len = PyUnicode_GET_SIZE(temp);
				5006	if (prec >= 0 && len > prec)
				5007	len = prec;
				5008	break;
				5009
				5010	case 'i':
				5011	case 'd':
				5012	case 'u':
				5013	case 'o':
				5014	case 'x':
				5015	case 'X':
				5016	if (c == 'i')
				5017	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5018	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5019	temp = formatlong(v, flags, prec, c);
				5020	if (!temp)
				5021	goto onError;
				5022	pbuf = PyUnicode_AS_UNICODE(temp);
				5023	len = PyUnicode_GET_SIZE(temp);
				5024	/* unbounded ints can always produce
				5025	a sign character! */
				5026	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5027	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5028	else {
				5029	pbuf = formatbuf;
				5030	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5031	flags, prec, c, v);
				5032	if (len < 0)
				5033	goto onError;
				5034	/* only d conversion is signed */
				5035	sign = c == 'd';
				5036	}
				5037	if (flags & F_ZERO)
				5038	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5039	break;
				5040
				5041	case 'e':
				5042	case 'E':
				5043	case 'f':
				5044	case 'g':
				5045	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5046	pbuf = formatbuf;
				5047	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5048	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5049	if (len < 0)
				5050	goto onError;
				5051	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5052	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5053	fill = '0';
				5054	break;
				5055
				5056	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5057	pbuf = formatbuf;
				5058	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5059	if (len < 0)
				5060	goto onError;
				5061	break;
				5062
				5063	default:
				5064	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5065	"unsupported format character '%c' (0x%x) "
				5066	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5067	(31<=c && c<=126) ? c : '?',
				5068	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5069	goto onError;
				5070	}
				5071	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5072	if (pbuf == '-' \|\| pbuf == '+') {
				5073	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5074	len--;
				5075	}
				5076	else if (flags & F_SIGN)
				5077	sign = '+';
				5078	else if (flags & F_BLANK)
				5079	sign = ' ';
				5080	else
				5081	sign = 0;
				5082	}
				5083	if (width < len)
				5084	width = len;
				5085	if (rescnt < width + (sign != 0)) {
				5086	reslen -= rescnt;
				5087	rescnt = width + fmtcnt + 100;
				5088	reslen += rescnt;
				5089	if (_PyUnicode_Resize(result, reslen) < 0)
				5090	return NULL;
				5091	res = PyUnicode_AS_UNICODE(result)
				5092	+ reslen - rescnt;
				5093	}
				5094	if (sign) {
				5095	if (fill != ' ')
				5096	*res++ = sign;
				5097	rescnt--;
				5098	if (width > len)
				5099	width--;
				5100	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5101	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5102	assert(pbuf[0] == '0');
				5103	assert(pbuf[1] == c);
				5104	if (fill != ' ') {
				5105	res++ = pbuf++;
				5106	res++ = pbuf++;
				5107	}
				5108	rescnt -= 2;
				5109	width -= 2;
				5110	if (width < 0)
				5111	width = 0;
				5112	len -= 2;
				5113	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5114	if (width > len && !(flags & F_LJUST)) {
				5115	do {
				5116	--rescnt;
				5117	*res++ = fill;
				5118	} while (--width > len);
				5119	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5120	if (fill == ' ') {
				5121	if (sign)
				5122	*res++ = sign;
				5123	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5124	assert(pbuf[0] == '0');
				5125	assert(pbuf[1] == c);
				5126	res++ = pbuf++;
				5127	res++ = pbuf++;
				5128	}
				5129	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5130	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5131	res += len;
				5132	rescnt -= len;
				5133	while (--width >= len) {
				5134	--rescnt;
				5135	*res++ = ' ';
				5136	}
				5137	if (dict && (argidx < arglen) && c != '%') {
				5138	PyErr_SetString(PyExc_TypeError,
				5139	"not all arguments converted");
				5140	goto onError;
				5141	}
				5142	Py_XDECREF(temp);
				5143	} /* '%' */
				5144	} /* until end */
				5145	if (argidx < arglen && !dict) {
				5146	PyErr_SetString(PyExc_TypeError,
				5147	"not all arguments converted");
				5148	goto onError;
				5149	}
				5150
				5151	if (args_owned) {
				5152	Py_DECREF(args);
				5153	}
				5154	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5155	if (_PyUnicode_Resize(result, reslen - rescnt))
				5156	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5157	return (PyObject *)result;
				5158
				5159	onError:
				5160	Py_XDECREF(result);
				5161	Py_DECREF(uformat);
				5162	if (args_owned) {
				5163	Py_DECREF(args);
				5164	}
				5165	return NULL;
				5166	}
				5167
				5168	static PyBufferProcs unicode_as_buffer = {
				5169	(getreadbufferproc) unicode_buffer_getreadbuf,
				5170	(getwritebufferproc) unicode_buffer_getwritebuf,
				5171	(getsegcountproc) unicode_buffer_getsegcount,
				5172	(getcharbufferproc) unicode_buffer_getcharbuf,
				5173	};
				5174
				5175	PyTypeObject PyUnicode_Type = {
				5176	PyObject_HEAD_INIT(&PyType_Type)
				5177	0, /* ob_size */
				5178	"unicode", /* tp_name */
				5179	sizeof(PyUnicodeObject), /* tp_size */
				5180	0, /* tp_itemsize */
				5181	/* Slots */
				5182	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5183	0, /* tp_print */
				5184	(getattrfunc)unicode_getattr, /* tp_getattr */
				5185	0, /* tp_setattr */
				5186	(cmpfunc) unicode_compare, /* tp_compare */
				5187	(reprfunc) unicode_repr, /* tp_repr */
				5188	0, /* tp_as_number */
				5189	&unicode_as_sequence, /* tp_as_sequence */
				5190	0, /* tp_as_mapping */
				5191	(hashfunc) unicode_hash, /* tp_hash*/
				5192	0, /* tp_call*/
				5193	(reprfunc) unicode_str, /* tp_str */
				5194	(getattrofunc) NULL, /* tp_getattro */
				5195	(setattrofunc) NULL, /* tp_setattro */
				5196	&unicode_as_buffer, /* tp_as_buffer */
				5197	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5198	};
				5199
				5200	/* Initialize the Unicode implementation */
				5201
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5202	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5203	{
				5204	/* Doublecheck the configuration... */
				5205	if (sizeof(Py_UNICODE) != 2)
				5206	Py_FatalError("Unicode configuration error: "
				5207	"sizeof(Py_UNICODE) != 2 bytes");
				5208
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5209	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5210	unicode_freelist = NULL;
				5211	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5212	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5213	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5214	}
				5215
				5216	/* Finalize the Unicode implementation */
				5217
				5218	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5219	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5220	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5221	PyUnicodeObject *u;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5222
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5223	Py_XDECREF(unicode_empty);
				5224	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5225
				5226	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5227	PyUnicodeObject *v = u;
				5228	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5229	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5230	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5231	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5232	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5233	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5234	unicode_freelist = NULL;
				5235	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5236	}