Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 14866ab0526bef7d29a3ded3948dfada9c9d90bb [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
				4	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
				111	/* --- Globals ------------------------------------------------------------ */
				112
				113	/* The empty Unicode object */
				114	static PyUnicodeObject *unicode_empty = NULL;
				115
				116	/* Free list for Unicode objects */
				117	static PyUnicodeObject *unicode_freelist = NULL;
				118	static int unicode_freelist_size = 0;
				119
				120	/* --- Unicode Object ----------------------------------------------------- */
				121
				122	static
				123	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				124	int length)
				125	{
				126	void *oldstr;
				127
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	130	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	131
				132	/* Resizing unicode_empty is not allowed. */
				133	if (unicode == unicode_empty) {
				134	PyErr_SetString(PyExc_SystemError,
				135	"can't resize empty unicode object");
				136	return -1;
				137	}
				138
				139	/* We allocate one more byte to make sure the string is
				140	Ux0000 terminated -- XXX is this needed ? */
				141	oldstr = unicode->str;
				142	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				143	if (!unicode->str) {
				144	unicode->str = oldstr;
				145	PyErr_NoMemory();
				146	return -1;
				147	}
				148	unicode->str[length] = 0;
				149	unicode->length = length;
				150
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	151	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	152	/* Reset the object caches */
				153	if (unicode->utf8str) {
				154	Py_DECREF(unicode->utf8str);
				155	unicode->utf8str = NULL;
				156	}
				157	unicode->hash = -1;
				158
				159	return 0;
				160	}
				161
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	162	int PyUnicode_Resize(PyObject **unicode,
				163	int length)
				164	{
				165	PyUnicodeObject *v;
				166
				167	if (unicode == NULL) {
				168	PyErr_BadInternalCall();
				169	return -1;
				170	}
				171	v = (PyUnicodeObject )unicode;
				172	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				173	PyErr_BadInternalCall();
				174	return -1;
				175	}
				176	return _PyUnicode_Resize(v, length);
				177	}
				178
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	179	/* We allocate one more byte to make sure the string is
				180	Ux0000 terminated -- XXX is this needed ?
				181
				182	XXX This allocator could further be enhanced by assuring that the
				183	free list never reduces its size below 1.
				184
				185	*/
				186
				187	static
				188	PyUnicodeObject *_PyUnicode_New(int length)
				189	{
				190	register PyUnicodeObject *unicode;
				191
				192	/* Optimization for empty strings */
				193	if (length == 0 && unicode_empty != NULL) {
				194	Py_INCREF(unicode_empty);
				195	return unicode_empty;
				196	}
				197
				198	/* Unicode freelist & memory allocation */
				199	if (unicode_freelist) {
				200	unicode = unicode_freelist;
				201	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				202	unicode_freelist_size--;
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	203	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	204	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	205	/* Keep-Alive optimization: we only upsize the buffer,
				206	never downsize it. */
				207	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	208	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	209	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	211	}
				212	}
				213	else
				214	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				215	}
				216	else {
				217	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				218	if (unicode == NULL)
				219	return NULL;
				220	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				221	}
				222
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	223	if (!unicode->str) {
				224	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	225	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	226	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	227	unicode->str[length] = 0;
				228	unicode->length = length;
				229	unicode->hash = -1;
				230	unicode->utf8str = NULL;
				231	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	232
				233	onError:
				234	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	235	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	236	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	}
				238
				239	static
				240	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				241	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	242	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	243	/* Keep-Alive optimization */
				244	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	unicode->str = NULL;
				247	unicode->length = 0;
				248	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	249	if (unicode->utf8str) {
				250	Py_DECREF(unicode->utf8str);
				251	unicode->utf8str = NULL;
				252	}
				253	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	254	(PyUnicodeObject *)unicode = unicode_freelist;
				255	unicode_freelist = unicode;
				256	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	257	}
				258	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	259	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	260	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	261	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	}
				263	}
				264
				265	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				266	int size)
				267	{
				268	PyUnicodeObject *unicode;
				269
				270	unicode = _PyUnicode_New(size);
				271	if (!unicode)
				272	return NULL;
				273
				274	/* Copy the Unicode data into the new object */
				275	if (u != NULL)
				276	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				277
				278	return (PyObject *)unicode;
				279	}
				280
				281	#ifdef HAVE_WCHAR_H
				282
				283	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				284	int size)
				285	{
				286	PyUnicodeObject *unicode;
				287
				288	if (w == NULL) {
				289	PyErr_BadInternalCall();
				290	return NULL;
				291	}
				292
				293	unicode = _PyUnicode_New(size);
				294	if (!unicode)
				295	return NULL;
				296
				297	/* Copy the wchar_t data into the new object */
				298	#ifdef HAVE_USABLE_WCHAR_T
				299	memcpy(unicode->str, w, size * sizeof(wchar_t));
				300	#else
				301	{
				302	register Py_UNICODE *u;
				303	register int i;
				304	u = PyUnicode_AS_UNICODE(unicode);
				305	for (i = size; i >= 0; i--)
				306	u++ = w++;
				307	}
				308	#endif
				309
				310	return (PyObject *)unicode;
				311	}
				312
				313	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				314	register wchar_t *w,
				315	int size)
				316	{
				317	if (unicode == NULL) {
				318	PyErr_BadInternalCall();
				319	return -1;
				320	}
				321	if (size > PyUnicode_GET_SIZE(unicode))
				322	size = PyUnicode_GET_SIZE(unicode);
				323	#ifdef HAVE_USABLE_WCHAR_T
				324	memcpy(w, unicode->str, size * sizeof(wchar_t));
				325	#else
				326	{
				327	register Py_UNICODE *u;
				328	register int i;
				329	u = PyUnicode_AS_UNICODE(unicode);
				330	for (i = size; i >= 0; i--)
				331	w++ = u++;
				332	}
				333	#endif
				334
				335	return size;
				336	}
				337
				338	#endif
				339
				340	PyObject PyUnicode_FromObject(register PyObject obj)
				341	{
				342	const char *s;
				343	int len;
				344
				345	if (obj == NULL) {
				346	PyErr_BadInternalCall();
				347	return NULL;
				348	}
				349	else if (PyUnicode_Check(obj)) {
				350	Py_INCREF(obj);
				351	return obj;
				352	}
				353	else if (PyString_Check(obj)) {
				354	s = PyString_AS_STRING(obj);
				355	len = PyString_GET_SIZE(obj);
				356	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	357	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				358	/* Overwrite the error message with something more useful in
				359	case of a TypeError. */
				360	if (PyErr_ExceptionMatches(PyExc_TypeError))
				361	PyErr_SetString(PyExc_TypeError,
				362	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	363	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	364	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	365	if (len == 0) {
				366	Py_INCREF(unicode_empty);
				367	return (PyObject *)unicode_empty;
				368	}
				369	return PyUnicode_DecodeUTF8(s, len, "strict");
				370	}
				371
				372	PyObject PyUnicode_Decode(const char s,
				373	int size,
				374	const char *encoding,
				375	const char *errors)
				376	{
				377	PyObject buffer = NULL, unicode;
				378
				379	/* Shortcut for the default encoding UTF-8 */
				380	if (encoding == NULL \|\|
				381	(strcmp(encoding, "utf-8") == 0))
				382	return PyUnicode_DecodeUTF8(s, size, errors);
				383
				384	/* Decode via the codec registry */
				385	buffer = PyBuffer_FromMemory((void *)s, size);
				386	if (buffer == NULL)
				387	goto onError;
				388	unicode = PyCodec_Decode(buffer, encoding, errors);
				389	if (unicode == NULL)
				390	goto onError;
				391	if (!PyUnicode_Check(unicode)) {
				392	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	393	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	394	unicode->ob_type->tp_name);
				395	Py_DECREF(unicode);
				396	goto onError;
				397	}
				398	Py_DECREF(buffer);
				399	return unicode;
				400
				401	onError:
				402	Py_XDECREF(buffer);
				403	return NULL;
				404	}
				405
				406	PyObject PyUnicode_Encode(const Py_UNICODE s,
				407	int size,
				408	const char *encoding,
				409	const char *errors)
				410	{
				411	PyObject v, unicode;
				412
				413	unicode = PyUnicode_FromUnicode(s, size);
				414	if (unicode == NULL)
				415	return NULL;
				416	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				417	Py_DECREF(unicode);
				418	return v;
				419	}
				420
				421	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				422	const char *encoding,
				423	const char *errors)
				424	{
				425	PyObject *v;
				426
				427	if (!PyUnicode_Check(unicode)) {
				428	PyErr_BadArgument();
				429	goto onError;
				430	}
				431	/* Shortcut for the default encoding UTF-8 */
				432	if ((encoding == NULL \|\|
				433	(strcmp(encoding, "utf-8") == 0)) &&
				434	errors == NULL)
				435	return PyUnicode_AsUTF8String(unicode);
				436
				437	/* Encode via the codec registry */
				438	v = PyCodec_Encode(unicode, encoding, errors);
				439	if (v == NULL)
				440	goto onError;
				441	/* XXX Should we really enforce this ? */
				442	if (!PyString_Check(v)) {
				443	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	444	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	445	v->ob_type->tp_name);
				446	Py_DECREF(v);
				447	goto onError;
				448	}
				449	return v;
				450
				451	onError:
				452	return NULL;
				453	}
				454
				455	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				456	{
				457	if (!PyUnicode_Check(unicode)) {
				458	PyErr_BadArgument();
				459	goto onError;
				460	}
				461	return PyUnicode_AS_UNICODE(unicode);
				462
				463	onError:
				464	return NULL;
				465	}
				466
				467	int PyUnicode_GetSize(PyObject *unicode)
				468	{
				469	if (!PyUnicode_Check(unicode)) {
				470	PyErr_BadArgument();
				471	goto onError;
				472	}
				473	return PyUnicode_GET_SIZE(unicode);
				474
				475	onError:
				476	return -1;
				477	}
				478
				479	/* --- UTF-8 Codec -------------------------------------------------------- */
				480
				481	static
				482	char utf8_code_length[256] = {
				483	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				484	illegal prefix. see RFC 2279 for details */
				485	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				486	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				487	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				488	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				489	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				490	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				491	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				492	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				493	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				494	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				495	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				496	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				497	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				498	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				499	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				500	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				501	};
				502
				503	static
				504	int utf8_decoding_error(const char **source,
				505	Py_UNICODE **dest,
				506	const char *errors,
				507	const char *details)
				508	{
				509	if ((errors == NULL) \|\|
				510	(strcmp(errors,"strict") == 0)) {
				511	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	512	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	513	details);
				514	return -1;
				515	}
				516	else if (strcmp(errors,"ignore") == 0) {
				517	(*source)++;
				518	return 0;
				519	}
				520	else if (strcmp(errors,"replace") == 0) {
				521	(*source)++;
				522	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				523	(*dest)++;
				524	return 0;
				525	}
				526	else {
				527	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	528	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	529	errors);
				530	return -1;
				531	}
				532	}
				533
				534	#define UTF8_ERROR(details) do { \
				535	if (utf8_decoding_error(&s, &p, errors, details)) \
				536	goto onError; \
				537	continue; \
				538	} while (0)
				539
				540	PyObject PyUnicode_DecodeUTF8(const char s,
				541	int size,
				542	const char *errors)
				543	{
				544	int n;
				545	const char *e;
				546	PyUnicodeObject *unicode;
				547	Py_UNICODE *p;
				548
				549	/* Note: size will always be longer than the resulting Unicode
				550	character count */
				551	unicode = _PyUnicode_New(size);
				552	if (!unicode)
				553	return NULL;
				554	if (size == 0)
				555	return (PyObject *)unicode;
				556
				557	/* Unpack UTF-8 encoded data */
				558	p = unicode->str;
				559	e = s + size;
				560
				561	while (s < e) {
				562	register Py_UNICODE ch = (unsigned char)*s;
				563
				564	if (ch < 0x80) {
				565	*p++ = ch;
				566	s++;
				567	continue;
				568	}
				569
				570	n = utf8_code_length[ch];
				571
				572	if (s + n > e)
				573	UTF8_ERROR("unexpected end of data");
				574
				575	switch (n) {
				576
				577	case 0:
				578	UTF8_ERROR("unexpected code byte");
				579	break;
				580
				581	case 1:
				582	UTF8_ERROR("internal error");
				583	break;
				584
				585	case 2:
				586	if ((s[1] & 0xc0) != 0x80)
				587	UTF8_ERROR("invalid data");
				588	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				589	if (ch < 0x80)
				590	UTF8_ERROR("illegal encoding");
				591	else
				592	*p++ = ch;
				593	break;
				594
				595	case 3:
				596	if ((s[1] & 0xc0) != 0x80 \|\|
				597	(s[2] & 0xc0) != 0x80)
				598	UTF8_ERROR("invalid data");
				599	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				600	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				601	UTF8_ERROR("illegal encoding");
				602	else
				603	*p++ = ch;
				604	break;
				605
				606	default:
				607	/* Other sizes are only needed for UCS-4 */
				608	UTF8_ERROR("unsupported Unicode code range");
				609	}
				610	s += n;
				611	}
				612
				613	/* Adjust length */
				614	if (_PyUnicode_Resize(unicode, p - unicode->str))
				615	goto onError;
				616
				617	return (PyObject *)unicode;
				618
				619	onError:
				620	Py_DECREF(unicode);
				621	return NULL;
				622	}
				623
				624	#undef UTF8_ERROR
				625
				626	static
				627	int utf8_encoding_error(const Py_UNICODE **source,
				628	char **dest,
				629	const char *errors,
				630	const char *details)
				631	{
				632	if ((errors == NULL) \|\|
				633	(strcmp(errors,"strict") == 0)) {
				634	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	635	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	636	details);
				637	return -1;
				638	}
				639	else if (strcmp(errors,"ignore") == 0) {
				640	return 0;
				641	}
				642	else if (strcmp(errors,"replace") == 0) {
				643	**dest = '?';
				644	(*dest)++;
				645	return 0;
				646	}
				647	else {
				648	PyErr_Format(PyExc_ValueError,
				649	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	650	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	651	errors);
				652	return -1;
				653	}
				654	}
				655
				656	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				657	int size,
				658	const char *errors)
				659	{
				660	PyObject *v;
				661	char *p;
				662	char *q;
				663
				664	v = PyString_FromStringAndSize(NULL, 3 * size);
				665	if (v == NULL)
				666	return NULL;
				667	if (size == 0)
				668	goto done;
				669
				670	p = q = PyString_AS_STRING(v);
				671	while (size-- > 0) {
				672	Py_UNICODE ch = *s++;
				673	if (ch < 0x80)
				674	*p++ = (char) ch;
				675	else if (ch < 0x0800) {
				676	*p++ = 0xc0 \| (ch >> 6);
				677	*p++ = 0x80 \| (ch & 0x3f);
				678	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				679	/* These byte ranges are reserved for UTF-16 surrogate
				680	bytes which the Python implementation currently does
				681	not support. */
				682	printf("code range problem: U+%04x\n", ch);
				683	if (utf8_encoding_error(&s, &p, errors,
				684	"unsupported code range"))
				685	goto onError;
				686	} else {
				687	*p++ = 0xe0 \| (ch >> 12);
				688	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				689	*p++ = 0x80 \| (ch & 0x3f);
				690	}
				691	}
				692	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	693	if (_PyString_Resize(&v, p - q))
				694	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	695
				696	done:
				697	return v;
				698
				699	onError:
				700	Py_DECREF(v);
				701	return NULL;
				702	}
				703
				704	/* Return a Python string holding the UTF-8 encoded value of the
				705	Unicode object.
				706
				707	The resulting string is cached in the Unicode object for subsequent
				708	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	709	the character buffer interface and will live (at least) as long as
				710	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	711
				712	The refcount of the string is not incremented.
				713
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	714	* Exported for internal use by the interpreter only !!! *
				715
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	716	*/
				717
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	718	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	719	const char *errors)
				720	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	721	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	722
				723	if (v)
				724	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	725	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				726	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	727	errors);
				728	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	729	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	730	return v;
				731	}
				732
				733	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				734	{
				735	PyObject *str;
				736
				737	if (!PyUnicode_Check(unicode)) {
				738	PyErr_BadArgument();
				739	return NULL;
				740	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	741	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	742	if (str == NULL)
				743	return NULL;
				744	Py_INCREF(str);
				745	return str;
				746	}
				747
				748	/* --- UTF-16 Codec ------------------------------------------------------- */
				749
				750	static
				751	int utf16_decoding_error(const Py_UNICODE **source,
				752	Py_UNICODE **dest,
				753	const char *errors,
				754	const char *details)
				755	{
				756	if ((errors == NULL) \|\|
				757	(strcmp(errors,"strict") == 0)) {
				758	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	759	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	760	details);
				761	return -1;
				762	}
				763	else if (strcmp(errors,"ignore") == 0) {
				764	return 0;
				765	}
				766	else if (strcmp(errors,"replace") == 0) {
				767	if (dest) {
				768	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				769	(*dest)++;
				770	}
				771	return 0;
				772	}
				773	else {
				774	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	775	"UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	776	errors);
				777	return -1;
				778	}
				779	}
				780
				781	#define UTF16_ERROR(details) do { \
				782	if (utf16_decoding_error(&q, &p, errors, details)) \
				783	goto onError; \
				784	continue; \
				785	} while(0)
				786
				787	PyObject PyUnicode_DecodeUTF16(const char s,
				788	int size,
				789	const char *errors,
				790	int *byteorder)
				791	{
				792	PyUnicodeObject *unicode;
				793	Py_UNICODE *p;
				794	const Py_UNICODE q, e;
				795	int bo = 0;
				796
				797	/* size should be an even number */
				798	if (size % sizeof(Py_UNICODE) != 0) {
				799	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				800	return NULL;
				801	/* The remaining input chars are ignored if we fall through
				802	here... */
				803	}
				804
				805	/* Note: size will always be longer than the resulting Unicode
				806	character count */
				807	unicode = _PyUnicode_New(size);
				808	if (!unicode)
				809	return NULL;
				810	if (size == 0)
				811	return (PyObject *)unicode;
				812
				813	/* Unpack UTF-16 encoded data */
				814	p = unicode->str;
				815	q = (Py_UNICODE *)s;
				816	e = q + (size / sizeof(Py_UNICODE));
				817
				818	if (byteorder)
				819	bo = *byteorder;
				820
				821	while (q < e) {
				822	register Py_UNICODE ch = *q++;
				823
				824	/* Check for BOM marks (U+FEFF) in the input and adjust
				825	current byte order setting accordingly. Swap input
				826	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				827	!) */
				828	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				829	if (ch == 0xFEFF) {
				830	bo = -1;
				831	continue;
				832	} else if (ch == 0xFFFE) {
				833	bo = 1;
				834	continue;
				835	}
				836	if (bo == 1)
				837	ch = (ch >> 8) \| (ch << 8);
				838	#else
				839	if (ch == 0xFEFF) {
				840	bo = 1;
				841	continue;
				842	} else if (ch == 0xFFFE) {
				843	bo = -1;
				844	continue;
				845	}
				846	if (bo == -1)
				847	ch = (ch >> 8) \| (ch << 8);
				848	#endif
				849	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				850	*p++ = ch;
				851	continue;
				852	}
				853
				854	/* UTF-16 code pair: */
				855	if (q >= e)
				856	UTF16_ERROR("unexpected end of data");
				857	if (0xDC00 <= q && q <= 0xDFFF) {
				858	q++;
				859	if (0xD800 <= q && q <= 0xDBFF)
				860	/* This is valid data (a UTF-16 surrogate pair), but
				861	we are not able to store this information since our
				862	Py_UNICODE type only has 16 bits... this might
				863	change someday, even though it's unlikely. */
				864	UTF16_ERROR("code pairs are not supported");
				865	else
				866	continue;
				867	}
				868	UTF16_ERROR("illegal encoding");
				869	}
				870
				871	if (byteorder)
				872	*byteorder = bo;
				873
				874	/* Adjust length */
				875	if (_PyUnicode_Resize(unicode, p - unicode->str))
				876	goto onError;
				877
				878	return (PyObject *)unicode;
				879
				880	onError:
				881	Py_DECREF(unicode);
				882	return NULL;
				883	}
				884
				885	#undef UTF16_ERROR
				886
				887	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				888	int size,
				889	const char *errors,
				890	int byteorder)
				891	{
				892	PyObject *v;
				893	Py_UNICODE *p;
				894	char *q;
				895
				896	/* We don't create UTF-16 pairs... */
				897	v = PyString_FromStringAndSize(NULL,
				898	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				899	if (v == NULL)
				900	return NULL;
				901	if (size == 0)
				902	goto done;
				903
				904	q = PyString_AS_STRING(v);
				905	p = (Py_UNICODE *)q;
				906
				907	if (byteorder == 0)
				908	*p++ = 0xFEFF;
				909	if (byteorder == 0 \|\|
				910	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				911	byteorder == -1
				912	#else
				913	byteorder == 1
				914	#endif
				915	)
				916	memcpy(p, s, size * sizeof(Py_UNICODE));
				917	else
				918	while (size-- > 0) {
				919	Py_UNICODE ch = *s++;
				920	*p++ = (ch >> 8) \| (ch << 8);
				921	}
				922	done:
				923	return v;
				924	}
				925
				926	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				927	{
				928	if (!PyUnicode_Check(unicode)) {
				929	PyErr_BadArgument();
				930	return NULL;
				931	}
				932	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				933	PyUnicode_GET_SIZE(unicode),
				934	NULL,
				935	0);
				936	}
				937
				938	/* --- Unicode Escape Codec ----------------------------------------------- */
				939
				940	static
				941	int unicodeescape_decoding_error(const char **source,
				942	unsigned int *x,
				943	const char *errors,
				944	const char *details)
				945	{
				946	if ((errors == NULL) \|\|
				947	(strcmp(errors,"strict") == 0)) {
				948	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	949	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	950	details);
				951	return -1;
				952	}
				953	else if (strcmp(errors,"ignore") == 0) {
				954	return 0;
				955	}
				956	else if (strcmp(errors,"replace") == 0) {
				957	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				958	return 0;
				959	}
				960	else {
				961	PyErr_Format(PyExc_ValueError,
				962	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	963	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	964	errors);
				965	return -1;
				966	}
				967	}
				968
				969	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				970	int size,
				971	const char *errors)
				972	{
				973	PyUnicodeObject *v;
				974	Py_UNICODE p = NULL, buf = NULL;
				975	const char *end;
				976
				977	/* Escaped strings will always be longer than the resulting
				978	Unicode string, so we start with size here and then reduce the
				979	length after conversion to the true value. */
				980	v = _PyUnicode_New(size);
				981	if (v == NULL)
				982	goto onError;
				983	if (size == 0)
				984	return (PyObject *)v;
				985	p = buf = PyUnicode_AS_UNICODE(v);
				986	end = s + size;
				987	while (s < end) {
				988	unsigned char c;
				989	unsigned int x;
				990	int i;
				991
				992	/* Non-escape characters are interpreted as Unicode ordinals */
				993	if (*s != '\\') {
				994	p++ = (unsigned char)s++;
				995	continue;
				996	}
				997
				998	/* \ - Escapes */
				999	s++;
				1000	switch (*s++) {
				1001
				1002	/* \x escapes */
				1003	case '\n': break;
				1004	case '\\': *p++ = '\\'; break;
				1005	case '\'': *p++ = '\''; break;
				1006	case '\"': *p++ = '\"'; break;
				1007	case 'b': *p++ = '\b'; break;
				1008	case 'f': p++ = '\014'; break; / FF */
				1009	case 't': *p++ = '\t'; break;
				1010	case 'n': *p++ = '\n'; break;
				1011	case 'r': *p++ = '\r'; break;
				1012	case 'v': p++ = '\013'; break; / VT */
				1013	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1014
				1015	/* \OOO (octal) escapes */
				1016	case '0': case '1': case '2': case '3':
				1017	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1018	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1019	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1020	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1021	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1022	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1024	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1025	break;
				1026
				1027	/* \xXXXX escape with 0-4 hex digits */
				1028	case 'x':
				1029	x = 0;
				1030	c = (unsigned char)*s;
				1031	if (isxdigit(c)) {
				1032	do {
				1033	x = (x<<4) & ~0xF;
				1034	if ('0' <= c && c <= '9')
				1035	x += c - '0';
				1036	else if ('a' <= c && c <= 'f')
				1037	x += 10 + c - 'a';
				1038	else
				1039	x += 10 + c - 'A';
				1040	c = (unsigned char)*++s;
				1041	} while (isxdigit(c));
				1042	*p++ = x;
				1043	} else {
				1044	*p++ = '\\';
				1045	*p++ = (unsigned char)s[-1];
				1046	}
				1047	break;
				1048
				1049	/* \uXXXX with 4 hex digits */
				1050	case 'u':
				1051	for (x = 0, i = 0; i < 4; i++) {
				1052	c = (unsigned char)s[i];
				1053	if (!isxdigit(c)) {
				1054	if (unicodeescape_decoding_error(&s, &x, errors,
				1055	"truncated \\uXXXX"))
				1056	goto onError;
				1057	i++;
				1058	break;
				1059	}
				1060	x = (x<<4) & ~0xF;
				1061	if (c >= '0' && c <= '9')
				1062	x += c - '0';
				1063	else if (c >= 'a' && c <= 'f')
				1064	x += 10 + c - 'a';
				1065	else
				1066	x += 10 + c - 'A';
				1067	}
				1068	s += i;
				1069	*p++ = x;
				1070	break;
				1071
				1072	default:
				1073	*p++ = '\\';
				1074	*p++ = (unsigned char)s[-1];
				1075	break;
				1076	}
				1077	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1078	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1079	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080	return (PyObject *)v;
				1081
				1082	onError:
				1083	Py_XDECREF(v);
				1084	return NULL;
				1085	}
				1086
				1087	/* Return a Unicode-Escape string version of the Unicode object.
				1088
				1089	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1090	appropriate.
				1091
				1092	*/
				1093
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1094	static const Py_UNICODE findchar(const Py_UNICODE s,
				1095	int size,
				1096	Py_UNICODE ch);
				1097
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1098	static
				1099	PyObject unicodeescape_string(const Py_UNICODE s,
				1100	int size,
				1101	int quotes)
				1102	{
				1103	PyObject *repr;
				1104	char *p;
				1105	char *q;
				1106
				1107	static const char *hexdigit = "0123456789ABCDEF";
				1108
				1109	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1110	if (repr == NULL)
				1111	return NULL;
				1112
				1113	p = q = PyString_AS_STRING(repr);
				1114
				1115	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1116	*p++ = 'u';
				1117	*p++ = (findchar(s, size, '\'') &&
				1118	!findchar(s, size, '"')) ? '"' : '\'';
				1119	}
				1120	while (size-- > 0) {
				1121	Py_UNICODE ch = *s++;
				1122	/* Escape quotes */
				1123	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1124	*p++ = '\\';
				1125	*p++ = (char) ch;
				1126	}
				1127	/* Map 16-bit characters to '\uxxxx' */
				1128	else if (ch >= 256) {
				1129	*p++ = '\\';
				1130	*p++ = 'u';
				1131	*p++ = hexdigit[(ch >> 12) & 0xf];
				1132	*p++ = hexdigit[(ch >> 8) & 0xf];
				1133	*p++ = hexdigit[(ch >> 4) & 0xf];
				1134	*p++ = hexdigit[ch & 15];
				1135	}
				1136	/* Map non-printable US ASCII to '\ooo' */
				1137	else if (ch < ' ' \|\| ch >= 128) {
				1138	*p++ = '\\';
				1139	*p++ = hexdigit[(ch >> 6) & 7];
				1140	*p++ = hexdigit[(ch >> 3) & 7];
				1141	*p++ = hexdigit[ch & 7];
				1142	}
				1143	/* Copy everything else as-is */
				1144	else
				1145	*p++ = (char) ch;
				1146	}
				1147	if (quotes)
				1148	*p++ = q[1];
				1149
				1150	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1151	if (_PyString_Resize(&repr, p - q))
				1152	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1153
				1154	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1155
				1156	onError:
				1157	Py_DECREF(repr);
				1158	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	}
				1160
				1161	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1162	int size)
				1163	{
				1164	return unicodeescape_string(s, size, 0);
				1165	}
				1166
				1167	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1168	{
				1169	if (!PyUnicode_Check(unicode)) {
				1170	PyErr_BadArgument();
				1171	return NULL;
				1172	}
				1173	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1174	PyUnicode_GET_SIZE(unicode));
				1175	}
				1176
				1177	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1178
				1179	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1180	int size,
				1181	const char *errors)
				1182	{
				1183	PyUnicodeObject *v;
				1184	Py_UNICODE p, buf;
				1185	const char *end;
				1186	const char *bs;
				1187
				1188	/* Escaped strings will always be longer than the resulting
				1189	Unicode string, so we start with size here and then reduce the
				1190	length after conversion to the true value. */
				1191	v = _PyUnicode_New(size);
				1192	if (v == NULL)
				1193	goto onError;
				1194	if (size == 0)
				1195	return (PyObject *)v;
				1196	p = buf = PyUnicode_AS_UNICODE(v);
				1197	end = s + size;
				1198	while (s < end) {
				1199	unsigned char c;
				1200	unsigned int x;
				1201	int i;
				1202
				1203	/* Non-escape characters are interpreted as Unicode ordinals */
				1204	if (*s != '\\') {
				1205	p++ = (unsigned char)s++;
				1206	continue;
				1207	}
				1208
				1209	/* \u-escapes are only interpreted iff the number of leading
				1210	backslashes if odd */
				1211	bs = s;
				1212	for (;s < end;) {
				1213	if (*s != '\\')
				1214	break;
				1215	p++ = (unsigned char)s++;
				1216	}
				1217	if (((s - bs) & 1) == 0 \|\|
				1218	s >= end \|\|
				1219	*s != 'u') {
				1220	continue;
				1221	}
				1222	p--;
				1223	s++;
				1224
				1225	/* \uXXXX with 4 hex digits */
				1226	for (x = 0, i = 0; i < 4; i++) {
				1227	c = (unsigned char)s[i];
				1228	if (!isxdigit(c)) {
				1229	if (unicodeescape_decoding_error(&s, &x, errors,
				1230	"truncated \\uXXXX"))
				1231	goto onError;
				1232	i++;
				1233	break;
				1234	}
				1235	x = (x<<4) & ~0xF;
				1236	if (c >= '0' && c <= '9')
				1237	x += c - '0';
				1238	else if (c >= 'a' && c <= 'f')
				1239	x += 10 + c - 'a';
				1240	else
				1241	x += 10 + c - 'A';
				1242	}
				1243	s += i;
				1244	*p++ = x;
				1245	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1246	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1247	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1248	return (PyObject *)v;
				1249
				1250	onError:
				1251	Py_XDECREF(v);
				1252	return NULL;
				1253	}
				1254
				1255	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1256	int size)
				1257	{
				1258	PyObject *repr;
				1259	char *p;
				1260	char *q;
				1261
				1262	static const char *hexdigit = "0123456789ABCDEF";
				1263
				1264	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1265	if (repr == NULL)
				1266	return NULL;
				1267
				1268	p = q = PyString_AS_STRING(repr);
				1269	while (size-- > 0) {
				1270	Py_UNICODE ch = *s++;
				1271	/* Map 16-bit characters to '\uxxxx' */
				1272	if (ch >= 256) {
				1273	*p++ = '\\';
				1274	*p++ = 'u';
				1275	*p++ = hexdigit[(ch >> 12) & 0xf];
				1276	*p++ = hexdigit[(ch >> 8) & 0xf];
				1277	*p++ = hexdigit[(ch >> 4) & 0xf];
				1278	*p++ = hexdigit[ch & 15];
				1279	}
				1280	/* Copy everything else as-is */
				1281	else
				1282	*p++ = (char) ch;
				1283	}
				1284	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1285	if (_PyString_Resize(&repr, p - q))
				1286	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1287
				1288	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1289
				1290	onError:
				1291	Py_DECREF(repr);
				1292	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1293	}
				1294
				1295	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1296	{
				1297	if (!PyUnicode_Check(unicode)) {
				1298	PyErr_BadArgument();
				1299	return NULL;
				1300	}
				1301	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1302	PyUnicode_GET_SIZE(unicode));
				1303	}
				1304
				1305	/* --- Latin-1 Codec ------------------------------------------------------ */
				1306
				1307	PyObject PyUnicode_DecodeLatin1(const char s,
				1308	int size,
				1309	const char *errors)
				1310	{
				1311	PyUnicodeObject *v;
				1312	Py_UNICODE *p;
				1313
				1314	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1315	v = _PyUnicode_New(size);
				1316	if (v == NULL)
				1317	goto onError;
				1318	if (size == 0)
				1319	return (PyObject *)v;
				1320	p = PyUnicode_AS_UNICODE(v);
				1321	while (size-- > 0)
				1322	p++ = (unsigned char)s++;
				1323	return (PyObject *)v;
				1324
				1325	onError:
				1326	Py_XDECREF(v);
				1327	return NULL;
				1328	}
				1329
				1330	static
				1331	int latin1_encoding_error(const Py_UNICODE **source,
				1332	char **dest,
				1333	const char *errors,
				1334	const char *details)
				1335	{
				1336	if ((errors == NULL) \|\|
				1337	(strcmp(errors,"strict") == 0)) {
				1338	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1339	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1340	details);
				1341	return -1;
				1342	}
				1343	else if (strcmp(errors,"ignore") == 0) {
				1344	return 0;
				1345	}
				1346	else if (strcmp(errors,"replace") == 0) {
				1347	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1348	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1349	return 0;
				1350	}
				1351	else {
				1352	PyErr_Format(PyExc_ValueError,
				1353	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1354	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1355	errors);
				1356	return -1;
				1357	}
				1358	}
				1359
				1360	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1361	int size,
				1362	const char *errors)
				1363	{
				1364	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1365	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1366	repr = PyString_FromStringAndSize(NULL, size);
				1367	if (repr == NULL)
				1368	return NULL;
				1369
				1370	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1371	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1372	while (size-- > 0) {
				1373	Py_UNICODE ch = *p++;
				1374	if (ch >= 256) {
				1375	if (latin1_encoding_error(&p, &s, errors,
				1376	"ordinal not in range(256)"))
				1377	goto onError;
				1378	}
				1379	else
				1380	*s++ = (char)ch;
				1381	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1382	/* Resize if error handling skipped some characters */
				1383	if (s - start < PyString_GET_SIZE(repr))
				1384	if (_PyString_Resize(&repr, s - start))
				1385	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1386	return repr;
				1387
				1388	onError:
				1389	Py_DECREF(repr);
				1390	return NULL;
				1391	}
				1392
				1393	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1394	{
				1395	if (!PyUnicode_Check(unicode)) {
				1396	PyErr_BadArgument();
				1397	return NULL;
				1398	}
				1399	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1400	PyUnicode_GET_SIZE(unicode),
				1401	NULL);
				1402	}
				1403
				1404	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1405
				1406	static
				1407	int ascii_decoding_error(const char **source,
				1408	Py_UNICODE **dest,
				1409	const char *errors,
				1410	const char *details)
				1411	{
				1412	if ((errors == NULL) \|\|
				1413	(strcmp(errors,"strict") == 0)) {
				1414	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1415	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1416	details);
				1417	return -1;
				1418	}
				1419	else if (strcmp(errors,"ignore") == 0) {
				1420	return 0;
				1421	}
				1422	else if (strcmp(errors,"replace") == 0) {
				1423	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1424	(*dest)++;
				1425	return 0;
				1426	}
				1427	else {
				1428	PyErr_Format(PyExc_ValueError,
				1429	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1430	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1431	errors);
				1432	return -1;
				1433	}
				1434	}
				1435
				1436	PyObject PyUnicode_DecodeASCII(const char s,
				1437	int size,
				1438	const char *errors)
				1439	{
				1440	PyUnicodeObject *v;
				1441	Py_UNICODE *p;
				1442
				1443	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1444	v = _PyUnicode_New(size);
				1445	if (v == NULL)
				1446	goto onError;
				1447	if (size == 0)
				1448	return (PyObject *)v;
				1449	p = PyUnicode_AS_UNICODE(v);
				1450	while (size-- > 0) {
				1451	register unsigned char c;
				1452
				1453	c = (unsigned char)*s++;
				1454	if (c < 128)
				1455	*p++ = c;
				1456	else if (ascii_decoding_error(&s, &p, errors,
				1457	"ordinal not in range(128)"))
				1458	goto onError;
				1459	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1460	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1461	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1462	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1463	return (PyObject *)v;
				1464
				1465	onError:
				1466	Py_XDECREF(v);
				1467	return NULL;
				1468	}
				1469
				1470	static
				1471	int ascii_encoding_error(const Py_UNICODE **source,
				1472	char **dest,
				1473	const char *errors,
				1474	const char *details)
				1475	{
				1476	if ((errors == NULL) \|\|
				1477	(strcmp(errors,"strict") == 0)) {
				1478	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1479	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1480	details);
				1481	return -1;
				1482	}
				1483	else if (strcmp(errors,"ignore") == 0) {
				1484	return 0;
				1485	}
				1486	else if (strcmp(errors,"replace") == 0) {
				1487	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1488	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1489	return 0;
				1490	}
				1491	else {
				1492	PyErr_Format(PyExc_ValueError,
				1493	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1494	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1495	errors);
				1496	return -1;
				1497	}
				1498	}
				1499
				1500	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1501	int size,
				1502	const char *errors)
				1503	{
				1504	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1505	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1506	repr = PyString_FromStringAndSize(NULL, size);
				1507	if (repr == NULL)
				1508	return NULL;
				1509
				1510	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1511	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1512	while (size-- > 0) {
				1513	Py_UNICODE ch = *p++;
				1514	if (ch >= 128) {
				1515	if (ascii_encoding_error(&p, &s, errors,
				1516	"ordinal not in range(128)"))
				1517	goto onError;
				1518	}
				1519	else
				1520	*s++ = (char)ch;
				1521	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1522	/* Resize if error handling skipped some characters */
				1523	if (s - start < PyString_GET_SIZE(repr))
				1524	if (_PyString_Resize(&repr, s - start))
				1525	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1526	return repr;
				1527
				1528	onError:
				1529	Py_DECREF(repr);
				1530	return NULL;
				1531	}
				1532
				1533	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1534	{
				1535	if (!PyUnicode_Check(unicode)) {
				1536	PyErr_BadArgument();
				1537	return NULL;
				1538	}
				1539	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1540	PyUnicode_GET_SIZE(unicode),
				1541	NULL);
				1542	}
				1543
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1544	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1545
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1546	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1547
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1548	PyObject PyUnicode_DecodeMBCS(const char s,
				1549	int size,
				1550	const char *errors)
				1551	{
				1552	PyUnicodeObject *v;
				1553	Py_UNICODE *p;
				1554
				1555	/* First get the size of the result */
				1556	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame^]	1557	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1558	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1559
				1560	v = _PyUnicode_New(usize);
				1561	if (v == NULL)
				1562	return NULL;
				1563	if (usize == 0)
				1564	return (PyObject *)v;
				1565	p = PyUnicode_AS_UNICODE(v);
				1566	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1567	Py_DECREF(v);
				1568	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1569	}
				1570
				1571	return (PyObject *)v;
				1572	}
				1573
				1574	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1575	int size,
				1576	const char *errors)
				1577	{
				1578	PyObject *repr;
				1579	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame^]	1580	DWORD mbcssize;
				1581
				1582	/* If there are no characters, bail now! */
				1583	if (size==0)
				1584	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1585
				1586	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame^]	1587	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1588	if (mbcssize==0)
				1589	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1590
				1591	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1592	if (repr == NULL)
				1593	return NULL;
				1594	if (mbcssize==0)
				1595	return repr;
				1596
				1597	/* Do the conversion */
				1598	s = PyString_AS_STRING(repr);
				1599	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1600	Py_DECREF(repr);
				1601	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1602	}
				1603	return repr;
				1604	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1605
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1606	#endif /* MS_WIN32 */
				1607
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1608	/* --- Character Mapping Codec -------------------------------------------- */
				1609
				1610	static
				1611	int charmap_decoding_error(const char **source,
				1612	Py_UNICODE **dest,
				1613	const char *errors,
				1614	const char *details)
				1615	{
				1616	if ((errors == NULL) \|\|
				1617	(strcmp(errors,"strict") == 0)) {
				1618	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1619	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1620	details);
				1621	return -1;
				1622	}
				1623	else if (strcmp(errors,"ignore") == 0) {
				1624	return 0;
				1625	}
				1626	else if (strcmp(errors,"replace") == 0) {
				1627	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1628	(*dest)++;
				1629	return 0;
				1630	}
				1631	else {
				1632	PyErr_Format(PyExc_ValueError,
				1633	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1634	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1635	errors);
				1636	return -1;
				1637	}
				1638	}
				1639
				1640	PyObject PyUnicode_DecodeCharmap(const char s,
				1641	int size,
				1642	PyObject *mapping,
				1643	const char *errors)
				1644	{
				1645	PyUnicodeObject *v;
				1646	Py_UNICODE *p;
				1647
				1648	/* Default to Latin-1 */
				1649	if (mapping == NULL)
				1650	return PyUnicode_DecodeLatin1(s, size, errors);
				1651
				1652	v = _PyUnicode_New(size);
				1653	if (v == NULL)
				1654	goto onError;
				1655	if (size == 0)
				1656	return (PyObject *)v;
				1657	p = PyUnicode_AS_UNICODE(v);
				1658	while (size-- > 0) {
				1659	unsigned char ch = *s++;
				1660	PyObject w, x;
				1661
				1662	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1663	w = PyInt_FromLong((long)ch);
				1664	if (w == NULL)
				1665	goto onError;
				1666	x = PyObject_GetItem(mapping, w);
				1667	Py_DECREF(w);
				1668	if (x == NULL) {
				1669	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1670	/* No mapping found: default to Latin-1 mapping */
				1671	PyErr_Clear();
				1672	*p++ = (Py_UNICODE)ch;
				1673	continue;
				1674	}
				1675	goto onError;
				1676	}
				1677
				1678	/* Apply mapping */
				1679	if (PyInt_Check(x)) {
				1680	int value = PyInt_AS_LONG(x);
				1681	if (value < 0 \|\| value > 65535) {
				1682	PyErr_SetString(PyExc_TypeError,
				1683	"character mapping must be in range(65336)");
				1684	Py_DECREF(x);
				1685	goto onError;
				1686	}
				1687	*p++ = (Py_UNICODE)value;
				1688	}
				1689	else if (x == Py_None) {
				1690	/* undefined mapping */
				1691	if (charmap_decoding_error(&s, &p, errors,
				1692	"character maps to <undefined>")) {
				1693	Py_DECREF(x);
				1694	goto onError;
				1695	}
				1696	}
				1697	else if (PyUnicode_Check(x)) {
				1698	if (PyUnicode_GET_SIZE(x) != 1) {
				1699	/* 1-n mapping */
				1700	PyErr_SetString(PyExc_NotImplementedError,
				1701	"1-n mappings are currently not implemented");
				1702	Py_DECREF(x);
				1703	goto onError;
				1704	}
				1705	p++ = PyUnicode_AS_UNICODE(x);
				1706	}
				1707	else {
				1708	/* wrong return value */
				1709	PyErr_SetString(PyExc_TypeError,
				1710	"character mapping must return integer, None or unicode");
				1711	Py_DECREF(x);
				1712	goto onError;
				1713	}
				1714	Py_DECREF(x);
				1715	}
				1716	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1717	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1718	goto onError;
				1719	return (PyObject *)v;
				1720
				1721	onError:
				1722	Py_XDECREF(v);
				1723	return NULL;
				1724	}
				1725
				1726	static
				1727	int charmap_encoding_error(const Py_UNICODE **source,
				1728	char **dest,
				1729	const char *errors,
				1730	const char *details)
				1731	{
				1732	if ((errors == NULL) \|\|
				1733	(strcmp(errors,"strict") == 0)) {
				1734	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1735	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1736	details);
				1737	return -1;
				1738	}
				1739	else if (strcmp(errors,"ignore") == 0) {
				1740	return 0;
				1741	}
				1742	else if (strcmp(errors,"replace") == 0) {
				1743	**dest = '?';
				1744	(*dest)++;
				1745	return 0;
				1746	}
				1747	else {
				1748	PyErr_Format(PyExc_ValueError,
				1749	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1750	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1751	errors);
				1752	return -1;
				1753	}
				1754	}
				1755
				1756	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1757	int size,
				1758	PyObject *mapping,
				1759	const char *errors)
				1760	{
				1761	PyObject *v;
				1762	char *s;
				1763
				1764	/* Default to Latin-1 */
				1765	if (mapping == NULL)
				1766	return PyUnicode_EncodeLatin1(p, size, errors);
				1767
				1768	v = PyString_FromStringAndSize(NULL, size);
				1769	if (v == NULL)
				1770	return NULL;
				1771	s = PyString_AS_STRING(v);
				1772	while (size-- > 0) {
				1773	Py_UNICODE ch = *p++;
				1774	PyObject w, x;
				1775
				1776	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1777	w = PyInt_FromLong((long)ch);
				1778	if (w == NULL)
				1779	goto onError;
				1780	x = PyObject_GetItem(mapping, w);
				1781	Py_DECREF(w);
				1782	if (x == NULL) {
				1783	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1784	/* No mapping found: default to Latin-1 mapping if possible */
				1785	PyErr_Clear();
				1786	if (ch < 256) {
				1787	*s++ = (char)ch;
				1788	continue;
				1789	}
				1790	else if (!charmap_encoding_error(&p, &s, errors,
				1791	"missing character mapping"))
				1792	continue;
				1793	}
				1794	goto onError;
				1795	}
				1796
				1797	/* Apply mapping */
				1798	if (PyInt_Check(x)) {
				1799	int value = PyInt_AS_LONG(x);
				1800	if (value < 0 \|\| value > 255) {
				1801	PyErr_SetString(PyExc_TypeError,
				1802	"character mapping must be in range(256)");
				1803	Py_DECREF(x);
				1804	goto onError;
				1805	}
				1806	*s++ = (char)value;
				1807	}
				1808	else if (x == Py_None) {
				1809	/* undefined mapping */
				1810	if (charmap_encoding_error(&p, &s, errors,
				1811	"character maps to <undefined>")) {
				1812	Py_DECREF(x);
				1813	goto onError;
				1814	}
				1815	}
				1816	else if (PyString_Check(x)) {
				1817	if (PyString_GET_SIZE(x) != 1) {
				1818	/* 1-n mapping */
				1819	PyErr_SetString(PyExc_NotImplementedError,
				1820	"1-n mappings are currently not implemented");
				1821	Py_DECREF(x);
				1822	goto onError;
				1823	}
				1824	s++ = PyString_AS_STRING(x);
				1825	}
				1826	else {
				1827	/* wrong return value */
				1828	PyErr_SetString(PyExc_TypeError,
				1829	"character mapping must return integer, None or unicode");
				1830	Py_DECREF(x);
				1831	goto onError;
				1832	}
				1833	Py_DECREF(x);
				1834	}
				1835	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1836	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1837	goto onError;
				1838	return v;
				1839
				1840	onError:
				1841	Py_DECREF(v);
				1842	return NULL;
				1843	}
				1844
				1845	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1846	PyObject *mapping)
				1847	{
				1848	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1849	PyErr_BadArgument();
				1850	return NULL;
				1851	}
				1852	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1853	PyUnicode_GET_SIZE(unicode),
				1854	mapping,
				1855	NULL);
				1856	}
				1857
				1858	static
				1859	int translate_error(const Py_UNICODE **source,
				1860	Py_UNICODE **dest,
				1861	const char *errors,
				1862	const char *details)
				1863	{
				1864	if ((errors == NULL) \|\|
				1865	(strcmp(errors,"strict") == 0)) {
				1866	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1867	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1868	details);
				1869	return -1;
				1870	}
				1871	else if (strcmp(errors,"ignore") == 0) {
				1872	return 0;
				1873	}
				1874	else if (strcmp(errors,"replace") == 0) {
				1875	**dest = '?';
				1876	(*dest)++;
				1877	return 0;
				1878	}
				1879	else {
				1880	PyErr_Format(PyExc_ValueError,
				1881	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1882	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1883	errors);
				1884	return -1;
				1885	}
				1886	}
				1887
				1888	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1889	int size,
				1890	PyObject *mapping,
				1891	const char *errors)
				1892	{
				1893	PyUnicodeObject *v;
				1894	Py_UNICODE *p;
				1895
				1896	if (mapping == NULL) {
				1897	PyErr_BadArgument();
				1898	return NULL;
				1899	}
				1900
				1901	/* Output will never be longer than input */
				1902	v = _PyUnicode_New(size);
				1903	if (v == NULL)
				1904	goto onError;
				1905	if (size == 0)
				1906	goto done;
				1907	p = PyUnicode_AS_UNICODE(v);
				1908	while (size-- > 0) {
				1909	Py_UNICODE ch = *s++;
				1910	PyObject w, x;
				1911
				1912	/* Get mapping */
				1913	w = PyInt_FromLong(ch);
				1914	if (w == NULL)
				1915	goto onError;
				1916	x = PyObject_GetItem(mapping, w);
				1917	Py_DECREF(w);
				1918	if (x == NULL) {
				1919	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1920	/* No mapping found: default to 1-1 mapping */
				1921	PyErr_Clear();
				1922	*p++ = ch;
				1923	continue;
				1924	}
				1925	goto onError;
				1926	}
				1927
				1928	/* Apply mapping */
				1929	if (PyInt_Check(x))
				1930	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1931	else if (x == Py_None) {
				1932	/* undefined mapping */
				1933	if (translate_error(&s, &p, errors,
				1934	"character maps to <undefined>")) {
				1935	Py_DECREF(x);
				1936	goto onError;
				1937	}
				1938	}
				1939	else if (PyUnicode_Check(x)) {
				1940	if (PyUnicode_GET_SIZE(x) != 1) {
				1941	/* 1-n mapping */
				1942	PyErr_SetString(PyExc_NotImplementedError,
				1943	"1-n mappings are currently not implemented");
				1944	Py_DECREF(x);
				1945	goto onError;
				1946	}
				1947	p++ = PyUnicode_AS_UNICODE(x);
				1948	}
				1949	else {
				1950	/* wrong return value */
				1951	PyErr_SetString(PyExc_TypeError,
				1952	"translate mapping must return integer, None or unicode");
				1953	Py_DECREF(x);
				1954	goto onError;
				1955	}
				1956	Py_DECREF(x);
				1957	}
				1958	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1959	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1960	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1961
				1962	done:
				1963	return (PyObject *)v;
				1964
				1965	onError:
				1966	Py_XDECREF(v);
				1967	return NULL;
				1968	}
				1969
				1970	PyObject PyUnicode_Translate(PyObject str,
				1971	PyObject *mapping,
				1972	const char *errors)
				1973	{
				1974	PyObject *result;
				1975
				1976	str = PyUnicode_FromObject(str);
				1977	if (str == NULL)
				1978	goto onError;
				1979	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				1980	PyUnicode_GET_SIZE(str),
				1981	mapping,
				1982	errors);
				1983	Py_DECREF(str);
				1984	return result;
				1985
				1986	onError:
				1987	Py_XDECREF(str);
				1988	return NULL;
				1989	}
				1990
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	1991	/* --- Decimal Encoder ---------------------------------------------------- */
				1992
				1993	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				1994	int length,
				1995	char *output,
				1996	const char *errors)
				1997	{
				1998	Py_UNICODE p, end;
				1999
				2000	if (output == NULL) {
				2001	PyErr_BadArgument();
				2002	return -1;
				2003	}
				2004
				2005	p = s;
				2006	end = s + length;
				2007	while (p < end) {
				2008	register Py_UNICODE ch = *p++;
				2009	int decimal;
				2010
				2011	if (Py_UNICODE_ISSPACE(ch)) {
				2012	*output++ = ' ';
				2013	continue;
				2014	}
				2015	decimal = Py_UNICODE_TODECIMAL(ch);
				2016	if (decimal >= 0) {
				2017	*output++ = '0' + decimal;
				2018	continue;
				2019	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2020	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2021	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2022	continue;
				2023	}
				2024	/* All other characters are considered invalid */
				2025	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2026	PyErr_SetString(PyExc_ValueError,
				2027	"invalid decimal Unicode string");
				2028	goto onError;
				2029	}
				2030	else if (strcmp(errors, "ignore") == 0)
				2031	continue;
				2032	else if (strcmp(errors, "replace") == 0) {
				2033	*output++ = '?';
				2034	continue;
				2035	}
				2036	}
				2037	/* 0-terminate the output string */
				2038	*output++ = '\0';
				2039	return 0;
				2040
				2041	onError:
				2042	return -1;
				2043	}
				2044
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2045	/* --- Helpers ------------------------------------------------------------ */
				2046
				2047	static
				2048	int count(PyUnicodeObject *self,
				2049	int start,
				2050	int end,
				2051	PyUnicodeObject *substring)
				2052	{
				2053	int count = 0;
				2054
				2055	end -= substring->length;
				2056
				2057	while (start <= end)
				2058	if (Py_UNICODE_MATCH(self, start, substring)) {
				2059	count++;
				2060	start += substring->length;
				2061	} else
				2062	start++;
				2063
				2064	return count;
				2065	}
				2066
				2067	int PyUnicode_Count(PyObject *str,
				2068	PyObject *substr,
				2069	int start,
				2070	int end)
				2071	{
				2072	int result;
				2073
				2074	str = PyUnicode_FromObject(str);
				2075	if (str == NULL)
				2076	return -1;
				2077	substr = PyUnicode_FromObject(substr);
				2078	if (substr == NULL) {
				2079	Py_DECREF(substr);
				2080	return -1;
				2081	}
				2082
				2083	result = count((PyUnicodeObject *)str,
				2084	start, end,
				2085	(PyUnicodeObject *)substr);
				2086
				2087	Py_DECREF(str);
				2088	Py_DECREF(substr);
				2089	return result;
				2090	}
				2091
				2092	static
				2093	int findstring(PyUnicodeObject *self,
				2094	PyUnicodeObject *substring,
				2095	int start,
				2096	int end,
				2097	int direction)
				2098	{
				2099	if (start < 0)
				2100	start += self->length;
				2101	if (start < 0)
				2102	start = 0;
				2103
				2104	if (substring->length == 0)
				2105	return start;
				2106
				2107	if (end > self->length)
				2108	end = self->length;
				2109	if (end < 0)
				2110	end += self->length;
				2111	if (end < 0)
				2112	end = 0;
				2113
				2114	end -= substring->length;
				2115
				2116	if (direction < 0) {
				2117	for (; end >= start; end--)
				2118	if (Py_UNICODE_MATCH(self, end, substring))
				2119	return end;
				2120	} else {
				2121	for (; start <= end; start++)
				2122	if (Py_UNICODE_MATCH(self, start, substring))
				2123	return start;
				2124	}
				2125
				2126	return -1;
				2127	}
				2128
				2129	int PyUnicode_Find(PyObject *str,
				2130	PyObject *substr,
				2131	int start,
				2132	int end,
				2133	int direction)
				2134	{
				2135	int result;
				2136
				2137	str = PyUnicode_FromObject(str);
				2138	if (str == NULL)
				2139	return -1;
				2140	substr = PyUnicode_FromObject(substr);
				2141	if (substr == NULL) {
				2142	Py_DECREF(substr);
				2143	return -1;
				2144	}
				2145
				2146	result = findstring((PyUnicodeObject *)str,
				2147	(PyUnicodeObject *)substr,
				2148	start, end, direction);
				2149	Py_DECREF(str);
				2150	Py_DECREF(substr);
				2151	return result;
				2152	}
				2153
				2154	static
				2155	int tailmatch(PyUnicodeObject *self,
				2156	PyUnicodeObject *substring,
				2157	int start,
				2158	int end,
				2159	int direction)
				2160	{
				2161	if (start < 0)
				2162	start += self->length;
				2163	if (start < 0)
				2164	start = 0;
				2165
				2166	if (substring->length == 0)
				2167	return 1;
				2168
				2169	if (end > self->length)
				2170	end = self->length;
				2171	if (end < 0)
				2172	end += self->length;
				2173	if (end < 0)
				2174	end = 0;
				2175
				2176	end -= substring->length;
				2177	if (end < start)
				2178	return 0;
				2179
				2180	if (direction > 0) {
				2181	if (Py_UNICODE_MATCH(self, end, substring))
				2182	return 1;
				2183	} else {
				2184	if (Py_UNICODE_MATCH(self, start, substring))
				2185	return 1;
				2186	}
				2187
				2188	return 0;
				2189	}
				2190
				2191	int PyUnicode_Tailmatch(PyObject *str,
				2192	PyObject *substr,
				2193	int start,
				2194	int end,
				2195	int direction)
				2196	{
				2197	int result;
				2198
				2199	str = PyUnicode_FromObject(str);
				2200	if (str == NULL)
				2201	return -1;
				2202	substr = PyUnicode_FromObject(substr);
				2203	if (substr == NULL) {
				2204	Py_DECREF(substr);
				2205	return -1;
				2206	}
				2207
				2208	result = tailmatch((PyUnicodeObject *)str,
				2209	(PyUnicodeObject *)substr,
				2210	start, end, direction);
				2211	Py_DECREF(str);
				2212	Py_DECREF(substr);
				2213	return result;
				2214	}
				2215
				2216	static
				2217	const Py_UNICODE findchar(const Py_UNICODE s,
				2218	int size,
				2219	Py_UNICODE ch)
				2220	{
				2221	/* like wcschr, but doesn't stop at NULL characters */
				2222
				2223	while (size-- > 0) {
				2224	if (*s == ch)
				2225	return s;
				2226	s++;
				2227	}
				2228
				2229	return NULL;
				2230	}
				2231
				2232	/* Apply fixfct filter to the Unicode object self and return a
				2233	reference to the modified object */
				2234
				2235	static
				2236	PyObject fixup(PyUnicodeObject self,
				2237	int (fixfct)(PyUnicodeObject s))
				2238	{
				2239
				2240	PyUnicodeObject *u;
				2241
				2242	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2243	self->length);
				2244	if (u == NULL)
				2245	return NULL;
				2246	if (!fixfct(u)) {
				2247	/* fixfct should return TRUE if it modified the buffer. If
				2248	FALSE, return a reference to the original buffer instead
				2249	(to save space, not time) */
				2250	Py_INCREF(self);
				2251	Py_DECREF(u);
				2252	return (PyObject*) self;
				2253	}
				2254	return (PyObject*) u;
				2255	}
				2256
				2257	static
				2258	int fixupper(PyUnicodeObject *self)
				2259	{
				2260	int len = self->length;
				2261	Py_UNICODE *s = self->str;
				2262	int status = 0;
				2263
				2264	while (len-- > 0) {
				2265	register Py_UNICODE ch;
				2266
				2267	ch = Py_UNICODE_TOUPPER(*s);
				2268	if (ch != *s) {
				2269	status = 1;
				2270	*s = ch;
				2271	}
				2272	s++;
				2273	}
				2274
				2275	return status;
				2276	}
				2277
				2278	static
				2279	int fixlower(PyUnicodeObject *self)
				2280	{
				2281	int len = self->length;
				2282	Py_UNICODE *s = self->str;
				2283	int status = 0;
				2284
				2285	while (len-- > 0) {
				2286	register Py_UNICODE ch;
				2287
				2288	ch = Py_UNICODE_TOLOWER(*s);
				2289	if (ch != *s) {
				2290	status = 1;
				2291	*s = ch;
				2292	}
				2293	s++;
				2294	}
				2295
				2296	return status;
				2297	}
				2298
				2299	static
				2300	int fixswapcase(PyUnicodeObject *self)
				2301	{
				2302	int len = self->length;
				2303	Py_UNICODE *s = self->str;
				2304	int status = 0;
				2305
				2306	while (len-- > 0) {
				2307	if (Py_UNICODE_ISUPPER(*s)) {
				2308	s = Py_UNICODE_TOLOWER(s);
				2309	status = 1;
				2310	} else if (Py_UNICODE_ISLOWER(*s)) {
				2311	s = Py_UNICODE_TOUPPER(s);
				2312	status = 1;
				2313	}
				2314	s++;
				2315	}
				2316
				2317	return status;
				2318	}
				2319
				2320	static
				2321	int fixcapitalize(PyUnicodeObject *self)
				2322	{
				2323	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2324	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2325	return 1;
				2326	}
				2327	return 0;
				2328	}
				2329
				2330	static
				2331	int fixtitle(PyUnicodeObject *self)
				2332	{
				2333	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2334	register Py_UNICODE *e;
				2335	int previous_is_cased;
				2336
				2337	/* Shortcut for single character strings */
				2338	if (PyUnicode_GET_SIZE(self) == 1) {
				2339	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2340	if (*p != ch) {
				2341	*p = ch;
				2342	return 1;
				2343	}
				2344	else
				2345	return 0;
				2346	}
				2347
				2348	e = p + PyUnicode_GET_SIZE(self);
				2349	previous_is_cased = 0;
				2350	for (; p < e; p++) {
				2351	register const Py_UNICODE ch = *p;
				2352
				2353	if (previous_is_cased)
				2354	*p = Py_UNICODE_TOLOWER(ch);
				2355	else
				2356	*p = Py_UNICODE_TOTITLE(ch);
				2357
				2358	if (Py_UNICODE_ISLOWER(ch) \|\|
				2359	Py_UNICODE_ISUPPER(ch) \|\|
				2360	Py_UNICODE_ISTITLE(ch))
				2361	previous_is_cased = 1;
				2362	else
				2363	previous_is_cased = 0;
				2364	}
				2365	return 1;
				2366	}
				2367
				2368	PyObject PyUnicode_Join(PyObject separator,
				2369	PyObject *seq)
				2370	{
				2371	Py_UNICODE *sep;
				2372	int seplen;
				2373	PyUnicodeObject *res = NULL;
				2374	int reslen = 0;
				2375	Py_UNICODE *p;
				2376	int seqlen = 0;
				2377	int sz = 100;
				2378	int i;
				2379
				2380	seqlen = PySequence_Length(seq);
				2381	if (seqlen < 0 && PyErr_Occurred())
				2382	return NULL;
				2383
				2384	if (separator == NULL) {
				2385	Py_UNICODE blank = ' ';
				2386	sep = &blank;
				2387	seplen = 1;
				2388	}
				2389	else {
				2390	separator = PyUnicode_FromObject(separator);
				2391	if (separator == NULL)
				2392	return NULL;
				2393	sep = PyUnicode_AS_UNICODE(separator);
				2394	seplen = PyUnicode_GET_SIZE(separator);
				2395	}
				2396
				2397	res = _PyUnicode_New(sz);
				2398	if (res == NULL)
				2399	goto onError;
				2400	p = PyUnicode_AS_UNICODE(res);
				2401	reslen = 0;
				2402
				2403	for (i = 0; i < seqlen; i++) {
				2404	int itemlen;
				2405	PyObject *item;
				2406
				2407	item = PySequence_GetItem(seq, i);
				2408	if (item == NULL)
				2409	goto onError;
				2410	if (!PyUnicode_Check(item)) {
				2411	PyObject *v;
				2412	v = PyUnicode_FromObject(item);
				2413	Py_DECREF(item);
				2414	item = v;
				2415	if (item == NULL)
				2416	goto onError;
				2417	}
				2418	itemlen = PyUnicode_GET_SIZE(item);
				2419	while (reslen + itemlen + seplen >= sz) {
				2420	if (_PyUnicode_Resize(res, sz*2))
				2421	goto onError;
				2422	sz *= 2;
				2423	p = PyUnicode_AS_UNICODE(res) + reslen;
				2424	}
				2425	if (i > 0) {
				2426	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2427	p += seplen;
				2428	reslen += seplen;
				2429	}
				2430	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2431	p += itemlen;
				2432	reslen += itemlen;
				2433	Py_DECREF(item);
				2434	}
				2435	if (_PyUnicode_Resize(res, reslen))
				2436	goto onError;
				2437
				2438	Py_XDECREF(separator);
				2439	return (PyObject *)res;
				2440
				2441	onError:
				2442	Py_XDECREF(separator);
				2443	Py_DECREF(res);
				2444	return NULL;
				2445	}
				2446
				2447	static
				2448	PyUnicodeObject pad(PyUnicodeObject self,
				2449	int left,
				2450	int right,
				2451	Py_UNICODE fill)
				2452	{
				2453	PyUnicodeObject *u;
				2454
				2455	if (left < 0)
				2456	left = 0;
				2457	if (right < 0)
				2458	right = 0;
				2459
				2460	if (left == 0 && right == 0) {
				2461	Py_INCREF(self);
				2462	return self;
				2463	}
				2464
				2465	u = _PyUnicode_New(left + self->length + right);
				2466	if (u) {
				2467	if (left)
				2468	Py_UNICODE_FILL(u->str, fill, left);
				2469	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2470	if (right)
				2471	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2472	}
				2473
				2474	return u;
				2475	}
				2476
				/* Append the slice data[left:right] to the list being built.

				   Relies on the expansion site providing a `PyObject *str` scratch
				   variable, a `list` object and an `onError` label.  The temporary
				   string reference is always released; on any failure control jumps
				   to onError.  Wrapped in do/while(0) so that the macro behaves as a
				   single statement. */
				#define SPLIT_APPEND(data, left, right)                              \
				    do {                                                             \
				        str = PyUnicode_FromUnicode((data) + (left),                 \
				                                    (right) - (left));               \
				        if (!str)                                                    \
				            goto onError;                                            \
				        if (PyList_Append(list, str)) {                              \
				            Py_DECREF(str);                                          \
				            goto onError;                                            \
				        }                                                            \
				        Py_DECREF(str);                                              \
				    } while (0)
				2487
				2488	static
				2489	PyObject split_whitespace(PyUnicodeObject self,
				2490	PyObject *list,
				2491	int maxcount)
				2492	{
				2493	register int i;
				2494	register int j;
				2495	int len = self->length;
				2496	PyObject *str;
				2497
				2498	for (i = j = 0; i < len; ) {
				2499	/* find a token */
				2500	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2501	i++;
				2502	j = i;
				2503	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2504	i++;
				2505	if (j < i) {
				2506	if (maxcount-- <= 0)
				2507	break;
				2508	SPLIT_APPEND(self->str, j, i);
				2509	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2510	i++;
				2511	j = i;
				2512	}
				2513	}
				2514	if (j < len) {
				2515	SPLIT_APPEND(self->str, j, len);
				2516	}
				2517	return list;
				2518
				2519	onError:
				2520	Py_DECREF(list);
				2521	return NULL;
				2522	}
				2523
				2524	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2525	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2526	{
				2527	register int i;
				2528	register int j;
				2529	int len;
				2530	PyObject *list;
				2531	PyObject *str;
				2532	Py_UNICODE *data;
				2533
				2534	string = PyUnicode_FromObject(string);
				2535	if (string == NULL)
				2536	return NULL;
				2537	data = PyUnicode_AS_UNICODE(string);
				2538	len = PyUnicode_GET_SIZE(string);
				2539
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2540	list = PyList_New(0);
				2541	if (!list)
				2542	goto onError;
				2543
				2544	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2545	int eol;
				2546
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2547	/* Find a line and append it */
				2548	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2549	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2550
				2551	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2552	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2553	if (i < len) {
				2554	if (data[i] == '\r' && i + 1 < len &&
				2555	data[i+1] == '\n')
				2556	i += 2;
				2557	else
				2558	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2559	if (keepends)
				2560	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2561	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2562	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2563	j = i;
				2564	}
				2565	if (j < len) {
				2566	SPLIT_APPEND(data, j, len);
				2567	}
				2568
				2569	Py_DECREF(string);
				2570	return list;
				2571
				2572	onError:
				2573	Py_DECREF(list);
				2574	Py_DECREF(string);
				2575	return NULL;
				2576	}
				2577
				2578	static
				2579	PyObject split_char(PyUnicodeObject self,
				2580	PyObject *list,
				2581	Py_UNICODE ch,
				2582	int maxcount)
				2583	{
				2584	register int i;
				2585	register int j;
				2586	int len = self->length;
				2587	PyObject *str;
				2588
				2589	for (i = j = 0; i < len; ) {
				2590	if (self->str[i] == ch) {
				2591	if (maxcount-- <= 0)
				2592	break;
				2593	SPLIT_APPEND(self->str, j, i);
				2594	i = j = i + 1;
				2595	} else
				2596	i++;
				2597	}
				2598	if (j <= len) {
				2599	SPLIT_APPEND(self->str, j, len);
				2600	}
				2601	return list;
				2602
				2603	onError:
				2604	Py_DECREF(list);
				2605	return NULL;
				2606	}
				2607
				2608	static
				2609	PyObject split_substring(PyUnicodeObject self,
				2610	PyObject *list,
				2611	PyUnicodeObject *substring,
				2612	int maxcount)
				2613	{
				2614	register int i;
				2615	register int j;
				2616	int len = self->length;
				2617	int sublen = substring->length;
				2618	PyObject *str;
				2619
				2620	for (i = j = 0; i < len - sublen; ) {
				2621	if (Py_UNICODE_MATCH(self, i, substring)) {
				2622	if (maxcount-- <= 0)
				2623	break;
				2624	SPLIT_APPEND(self->str, j, i);
				2625	i = j = i + sublen;
				2626	} else
				2627	i++;
				2628	}
				2629	if (j <= len) {
				2630	SPLIT_APPEND(self->str, j, len);
				2631	}
				2632	return list;
				2633
				2634	onError:
				2635	Py_DECREF(list);
				2636	return NULL;
				2637	}
				2638
				2639	#undef SPLIT_APPEND
				2640
				2641	static
				2642	PyObject split(PyUnicodeObject self,
				2643	PyUnicodeObject *substring,
				2644	int maxcount)
				2645	{
				2646	PyObject *list;
				2647
				2648	if (maxcount < 0)
				2649	maxcount = INT_MAX;
				2650
				2651	list = PyList_New(0);
				2652	if (!list)
				2653	return NULL;
				2654
				2655	if (substring == NULL)
				2656	return split_whitespace(self,list,maxcount);
				2657
				2658	else if (substring->length == 1)
				2659	return split_char(self,list,substring->str[0],maxcount);
				2660
				2661	else if (substring->length == 0) {
				2662	Py_DECREF(list);
				2663	PyErr_SetString(PyExc_ValueError, "empty separator");
				2664	return NULL;
				2665	}
				2666	else
				2667	return split_substring(self,list,substring,maxcount);
				2668	}
				2669
				2670	static
				2671	PyObject strip(PyUnicodeObject self,
				2672	int left,
				2673	int right)
				2674	{
				2675	Py_UNICODE *p = self->str;
				2676	int start = 0;
				2677	int end = self->length;
				2678
				2679	if (left)
				2680	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2681	start++;
				2682
				2683	if (right)
				2684	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2685	end--;
				2686
				2687	if (start == 0 && end == self->length) {
				2688	/* couldn't strip anything off, return original string */
				2689	Py_INCREF(self);
				2690	return (PyObject*) self;
				2691	}
				2692
				2693	return (PyObject*) PyUnicode_FromUnicode(
				2694	self->str + start,
				2695	end - start
				2696	);
				2697	}
				2698
				2699	static
				2700	PyObject replace(PyUnicodeObject self,
				2701	PyUnicodeObject *str1,
				2702	PyUnicodeObject *str2,
				2703	int maxcount)
				2704	{
				2705	PyUnicodeObject *u;
				2706
				2707	if (maxcount < 0)
				2708	maxcount = INT_MAX;
				2709
				2710	if (str1->length == 1 && str2->length == 1) {
				2711	int i;
				2712
				2713	/* replace characters */
				2714	if (!findchar(self->str, self->length, str1->str[0])) {
				2715	/* nothing to replace, return original string */
				2716	Py_INCREF(self);
				2717	u = self;
				2718	} else {
				2719	Py_UNICODE u1 = str1->str[0];
				2720	Py_UNICODE u2 = str2->str[0];
				2721
				2722	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2723	self->str,
				2724	self->length
				2725	);
				2726	if (u)
				2727	for (i = 0; i < u->length; i++)
				2728	if (u->str[i] == u1) {
				2729	if (--maxcount < 0)
				2730	break;
				2731	u->str[i] = u2;
				2732	}
				2733	}
				2734
				2735	} else {
				2736	int n, i;
				2737	Py_UNICODE *p;
				2738
				2739	/* replace strings */
				2740	n = count(self, 0, self->length, str1);
				2741	if (n > maxcount)
				2742	n = maxcount;
				2743	if (n == 0) {
				2744	/* nothing to replace, return original string */
				2745	Py_INCREF(self);
				2746	u = self;
				2747	} else {
				2748	u = _PyUnicode_New(
				2749	self->length + n * (str2->length - str1->length));
				2750	if (u) {
				2751	i = 0;
				2752	p = u->str;
				2753	while (i <= self->length - str1->length)
				2754	if (Py_UNICODE_MATCH(self, i, str1)) {
				2755	/* replace string segment */
				2756	Py_UNICODE_COPY(p, str2->str, str2->length);
				2757	p += str2->length;
				2758	i += str1->length;
				2759	if (--n <= 0) {
				2760	/* copy remaining part */
				2761	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2762	break;
				2763	}
				2764	} else
				2765	*p++ = self->str[i++];
				2766	}
				2767	}
				2768	}
				2769
				2770	return (PyObject *) u;
				2771	}
				2772
				2773	/* --- Unicode Object Methods --------------------------------------------- */
				2774
				2775	static char title__doc__[] =
				2776	"S.title() -> unicode\n\
				2777	\n\
				2778	Return a titlecased version of S, i.e. words start with title case\n\
				2779	characters, all remaining cased characters have lower case.";
				2780
				2781	static PyObject*
				2782	unicode_title(PyUnicodeObject self, PyObject args)
				2783	{
				2784	if (!PyArg_NoArgs(args))
				2785	return NULL;
				2786	return fixup(self, fixtitle);
				2787	}
				2788
				2789	static char capitalize__doc__[] =
				2790	"S.capitalize() -> unicode\n\
				2791	\n\
				2792	Return a capitalized version of S, i.e. make the first character\n\
				2793	have upper case.";
				2794
				2795	static PyObject*
				2796	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2797	{
				2798	if (!PyArg_NoArgs(args))
				2799	return NULL;
				2800	return fixup(self, fixcapitalize);
				2801	}
				2802
				#if 0
				/* Disabled: compiled out by the surrounding #if 0. */
				static char capwords__doc__[] =
				    "S.capwords() -> unicode\n"
				    "\n"
				    "Apply .capitalize() to all words in S and return the result with\n"
				    "normalized whitespace (all whitespace strings are replaced by ' ').";

				/* Method S.capwords(): split at whitespace, capitalize every word
				   and join the words again with single blanks.  Returns NULL if any
				   step fails. */
				static PyObject*
				unicode_capwords(PyUnicodeObject *self, PyObject *args)
				{
				    PyObject *list;
				    PyObject *item;
				    int i;

				    if (!PyArg_NoArgs(args))
				        return NULL;

				    /* Split into words */
				    list = split(self, NULL, -1);
				    if (!list)
				        return NULL;

				    /* Capitalize each word */
				    for (i = 0; i < PyList_GET_SIZE(list); i++) {
				        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				                     fixcapitalize);
				        if (item == NULL)
				            goto onError;
				        Py_DECREF(PyList_GET_ITEM(list, i));
				        PyList_SET_ITEM(list, i, item);
				    }

				    /* Join the words to form a new string */
				    item = PyUnicode_Join(NULL, list);

				 onError:
				    /* reached on success too; item is NULL only on failure */
				    Py_DECREF(list);
				    return (PyObject *)item;
				}
				#endif
				2843
				2844	static char center__doc__[] =
				2845	"S.center(width) -> unicode\n\
				2846	\n\
				2847	Return S centered in a Unicode string of length width. Padding is done\n\
				2848	using spaces.";
				2849
				2850	static PyObject *
				2851	unicode_center(PyUnicodeObject self, PyObject args)
				2852	{
				2853	int marg, left;
				2854	int width;
				2855
				2856	if (!PyArg_ParseTuple(args, "i:center", &width))
				2857	return NULL;
				2858
				2859	if (self->length >= width) {
				2860	Py_INCREF(self);
				2861	return (PyObject*) self;
				2862	}
				2863
				2864	marg = width - self->length;
				2865	left = marg / 2 + (marg & width & 1);
				2866
				2867	return (PyObject*) pad(self, left, marg - left, ' ');
				2868	}
				2869
				2870	static int
				2871	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2872	{
				2873	int len1, len2;
				2874	Py_UNICODE *s1 = str1->str;
				2875	Py_UNICODE *s2 = str2->str;
				2876
				2877	len1 = str1->length;
				2878	len2 = str2->length;
				2879
				2880	while (len1 > 0 && len2 > 0) {
				2881	int cmp = (s1++) - (s2++);
				2882	if (cmp)
				2883	/* This should make Christian happy! */
				2884	return (cmp < 0) ? -1 : (cmp != 0);
				2885	len1--, len2--;
				2886	}
				2887
				2888	return (len1 < len2) ? -1 : (len1 != len2);
				2889	}
				2890
				2891	int PyUnicode_Compare(PyObject *left,
				2892	PyObject *right)
				2893	{
				2894	PyUnicodeObject u = NULL, v = NULL;
				2895	int result;
				2896
				2897	/* Coerce the two arguments */
				2898	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2899	if (u == NULL)
				2900	goto onError;
				2901	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2902	if (v == NULL)
				2903	goto onError;
				2904
				2905	/* Shortcut for emtpy or interned objects */
				2906	if (v == u) {
				2907	Py_DECREF(u);
				2908	Py_DECREF(v);
				2909	return 0;
				2910	}
				2911
				2912	result = unicode_compare(u, v);
				2913
				2914	Py_DECREF(u);
				2915	Py_DECREF(v);
				2916	return result;
				2917
				2918	onError:
				2919	Py_XDECREF(u);
				2920	Py_XDECREF(v);
				2921	return -1;
				2922	}
				2923
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2924	int PyUnicode_Contains(PyObject *container,
				2925	PyObject *element)
				2926	{
				2927	PyUnicodeObject u = NULL, v = NULL;
				2928	int result;
				2929	register const Py_UNICODE p, e;
				2930	register Py_UNICODE ch;
				2931
				2932	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2933	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2934	if (v == NULL)
				2935	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2936	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2937	if (u == NULL) {
				2938	Py_DECREF(v);
				2939	goto onError;
				2940	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2941
				2942	/* Check v in u */
				2943	if (PyUnicode_GET_SIZE(v) != 1) {
				2944	PyErr_SetString(PyExc_TypeError,
				2945	"string member test needs char left operand");
				2946	goto onError;
				2947	}
				2948	ch = *PyUnicode_AS_UNICODE(v);
				2949	p = PyUnicode_AS_UNICODE(u);
				2950	e = p + PyUnicode_GET_SIZE(u);
				2951	result = 0;
				2952	while (p < e) {
				2953	if (*p++ == ch) {
				2954	result = 1;
				2955	break;
				2956	}
				2957	}
				2958
				2959	Py_DECREF(u);
				2960	Py_DECREF(v);
				2961	return result;
				2962
				2963	onError:
				2964	Py_XDECREF(u);
				2965	Py_XDECREF(v);
				2966	return -1;
				2967	}
				2968
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2969	/* Concat to string or Unicode object giving a new Unicode object. */
				2970
				2971	PyObject PyUnicode_Concat(PyObject left,
				2972	PyObject *right)
				2973	{
				2974	PyUnicodeObject u = NULL, v = NULL, *w;
				2975
				2976	/* Coerce the two arguments */
				2977	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2978	if (u == NULL)
				2979	goto onError;
				2980	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2981	if (v == NULL)
				2982	goto onError;
				2983
				2984	/* Shortcuts */
				2985	if (v == unicode_empty) {
				2986	Py_DECREF(v);
				2987	return (PyObject *)u;
				2988	}
				2989	if (u == unicode_empty) {
				2990	Py_DECREF(u);
				2991	return (PyObject *)v;
				2992	}
				2993
				2994	/* Concat the two Unicode strings */
				2995	w = _PyUnicode_New(u->length + v->length);
				2996	if (w == NULL)
				2997	goto onError;
				2998	Py_UNICODE_COPY(w->str, u->str, u->length);
				2999	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3000
				3001	Py_DECREF(u);
				3002	Py_DECREF(v);
				3003	return (PyObject *)w;
				3004
				3005	onError:
				3006	Py_XDECREF(u);
				3007	Py_XDECREF(v);
				3008	return NULL;
				3009	}
				3010
				3011	static char count__doc__[] =
				3012	"S.count(sub[, start[, end]]) -> int\n\
				3013	\n\
				3014	Return the number of occurrences of substring sub in Unicode string\n\
				3015	S[start:end]. Optional arguments start and end are\n\
				3016	interpreted as in slice notation.";
				3017
				3018	static PyObject *
				3019	unicode_count(PyUnicodeObject self, PyObject args)
				3020	{
				3021	PyUnicodeObject *substring;
				3022	int start = 0;
				3023	int end = INT_MAX;
				3024	PyObject *result;
				3025
				3026	if (!PyArg_ParseTuple(args, "O\|ii:count", &substring, &start, &end))
				3027	return NULL;
				3028
				3029	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3030	(PyObject *)substring);
				3031	if (substring == NULL)
				3032	return NULL;
				3033
				3034	if (substring->length == 0) {
				3035	Py_DECREF(substring);
				3036	return PyInt_FromLong((long) 0);
				3037	}
				3038
				3039	if (start < 0)
				3040	start += self->length;
				3041	if (start < 0)
				3042	start = 0;
				3043	if (end > self->length)
				3044	end = self->length;
				3045	if (end < 0)
				3046	end += self->length;
				3047	if (end < 0)
				3048	end = 0;
				3049
				3050	result = PyInt_FromLong((long) count(self, start, end, substring));
				3051
				3052	Py_DECREF(substring);
				3053	return result;
				3054	}
				3055
				3056	static char encode__doc__[] =
				3057	"S.encode([encoding[,errors]]) -> string\n\
				3058	\n\
				3059	Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
				3060	errors may be given to set a different error handling scheme. Default\n\
				3061	is 'strict' meaning that encoding errors raise a ValueError. Other\n\
				3062	possible values are 'ignore' and 'replace'.";
				3063
				3064	static PyObject *
				3065	unicode_encode(PyUnicodeObject self, PyObject args)
				3066	{
				3067	char *encoding = NULL;
				3068	char *errors = NULL;
				3069	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3070	return NULL;
				3071	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3072	}
				3073
				3074	static char expandtabs__doc__[] =
				3075	"S.expandtabs([tabsize]) -> unicode\n\
				3076	\n\
				3077	Return a copy of S where all tab characters are expanded using spaces.\n\
				3078	If tabsize is not given, a tab size of 8 characters is assumed.";
				3079
				3080	static PyObject*
				3081	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3082	{
				3083	Py_UNICODE *e;
				3084	Py_UNICODE *p;
				3085	Py_UNICODE *q;
				3086	int i, j;
				3087	PyUnicodeObject *u;
				3088	int tabsize = 8;
				3089
				3090	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3091	return NULL;
				3092
				3093	/* First pass: determine size of ouput string */
				3094	i = j = 0;
				3095	e = self->str + self->length;
				3096	for (p = self->str; p < e; p++)
				3097	if (*p == '\t') {
				3098	if (tabsize > 0)
				3099	j += tabsize - (j % tabsize);
				3100	}
				3101	else {
				3102	j++;
				3103	if (p == '\n' \|\| p == '\r') {
				3104	i += j;
				3105	j = 0;
				3106	}
				3107	}
				3108
				3109	/* Second pass: create output string and fill it */
				3110	u = _PyUnicode_New(i + j);
				3111	if (!u)
				3112	return NULL;
				3113
				3114	j = 0;
				3115	q = u->str;
				3116
				3117	for (p = self->str; p < e; p++)
				3118	if (*p == '\t') {
				3119	if (tabsize > 0) {
				3120	i = tabsize - (j % tabsize);
				3121	j += i;
				3122	while (i--)
				3123	*q++ = ' ';
				3124	}
				3125	}
				3126	else {
				3127	j++;
				3128	q++ = p;
				3129	if (p == '\n' \|\| p == '\r')
				3130	j = 0;
				3131	}
				3132
				3133	return (PyObject*) u;
				3134	}
				3135
				3136	static char find__doc__[] =
				3137	"S.find(sub [,start [,end]]) -> int\n\
				3138	\n\
				3139	Return the lowest index in S where substring sub is found,\n\
				3140	such that sub is contained within s[start,end]. Optional\n\
				3141	arguments start and end are interpreted as in slice notation.\n\
				3142	\n\
				3143	Return -1 on failure.";
				3144
				3145	static PyObject *
				3146	unicode_find(PyUnicodeObject self, PyObject args)
				3147	{
				3148	PyUnicodeObject *substring;
				3149	int start = 0;
				3150	int end = INT_MAX;
				3151	PyObject *result;
				3152
				3153	if (!PyArg_ParseTuple(args, "O\|ii:find", &substring, &start, &end))
				3154	return NULL;
				3155	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3156	(PyObject *)substring);
				3157	if (substring == NULL)
				3158	return NULL;
				3159
				3160	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3161
				3162	Py_DECREF(substring);
				3163	return result;
				3164	}
				3165
				3166	static PyObject *
				3167	unicode_getitem(PyUnicodeObject *self, int index)
				3168	{
				3169	if (index < 0 \|\| index >= self->length) {
				3170	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3171	return NULL;
				3172	}
				3173
				3174	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3175	}
				3176
				3177	static long
				3178	unicode_hash(PyUnicodeObject *self)
				3179	{
				3180	long hash;
				3181	PyObject *utf8;
				3182
				3183	/* Since Unicode objects compare equal to their UTF-8 string
				3184	counterparts, they should also use the UTF-8 strings as basis
				3185	for their hash value. This is needed to assure that strings and
				3186	Unicode objects behave in the same way as dictionary
				3187	keys. Unfortunately, this costs some performance and also some
				3188	memory if the cached UTF-8 representation is not used later
				3189	on. */
				3190	if (self->hash != -1)
				3191	return self->hash;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	3192	utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3193	if (utf8 == NULL)
				3194	return -1;
				3195	hash = PyObject_Hash(utf8);
				3196	if (hash == -1)
				3197	return -1;
				3198	self->hash = hash;
				3199	return hash;
				3200	}
				3201
				3202	static char index__doc__[] =
				3203	"S.index(sub [,start [,end]]) -> int\n\
				3204	\n\
				3205	Like S.find() but raise ValueError when the substring is not found.";
				3206
				3207	static PyObject *
				3208	unicode_index(PyUnicodeObject self, PyObject args)
				3209	{
				3210	int result;
				3211	PyUnicodeObject *substring;
				3212	int start = 0;
				3213	int end = INT_MAX;
				3214
				3215	if (!PyArg_ParseTuple(args, "O\|ii:index", &substring, &start, &end))
				3216	return NULL;
				3217
				3218	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3219	(PyObject *)substring);
				3220	if (substring == NULL)
				3221	return NULL;
				3222
				3223	result = findstring(self, substring, start, end, 1);
				3224
				3225	Py_DECREF(substring);
				3226	if (result < 0) {
				3227	PyErr_SetString(PyExc_ValueError, "substring not found");
				3228	return NULL;
				3229	}
				3230	return PyInt_FromLong(result);
				3231	}
				3232
				3233	static char islower__doc__[] =
				3234	"S.islower() -> int\n\
				3235	\n\
				3236	Return 1 if all cased characters in S are lowercase and there is\n\
				3237	at least one cased character in S, 0 otherwise.";
				3238
				3239	static PyObject*
				3240	unicode_islower(PyUnicodeObject self, PyObject args)
				3241	{
				3242	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3243	register const Py_UNICODE *e;
				3244	int cased;
				3245
				3246	if (!PyArg_NoArgs(args))
				3247	return NULL;
				3248
				3249	/* Shortcut for single character strings */
				3250	if (PyUnicode_GET_SIZE(self) == 1)
				3251	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3252
				3253	e = p + PyUnicode_GET_SIZE(self);
				3254	cased = 0;
				3255	for (; p < e; p++) {
				3256	register const Py_UNICODE ch = *p;
				3257
				3258	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3259	return PyInt_FromLong(0);
				3260	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3261	cased = 1;
				3262	}
				3263	return PyInt_FromLong(cased);
				3264	}
				3265
				3266	static char isupper__doc__[] =
				3267	"S.isupper() -> int\n\
				3268	\n\
				3269	Return 1 if all cased characters in S are uppercase and there is\n\
				3270	at least one cased character in S, 0 otherwise.";
				3271
				3272	static PyObject*
				3273	unicode_isupper(PyUnicodeObject self, PyObject args)
				3274	{
				3275	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3276	register const Py_UNICODE *e;
				3277	int cased;
				3278
				3279	if (!PyArg_NoArgs(args))
				3280	return NULL;
				3281
				3282	/* Shortcut for single character strings */
				3283	if (PyUnicode_GET_SIZE(self) == 1)
				3284	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3285
				3286	e = p + PyUnicode_GET_SIZE(self);
				3287	cased = 0;
				3288	for (; p < e; p++) {
				3289	register const Py_UNICODE ch = *p;
				3290
				3291	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3292	return PyInt_FromLong(0);
				3293	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3294	cased = 1;
				3295	}
				3296	return PyInt_FromLong(cased);
				3297	}
				3298
				3299	static char istitle__doc__[] =
				3300	"S.istitle() -> int\n\
				3301	\n\
				3302	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3303	may only follow uncased characters and lowercase characters only cased\n\
				3304	ones. Return 0 otherwise.";
				3305
				3306	static PyObject*
				3307	unicode_istitle(PyUnicodeObject self, PyObject args)
				3308	{
				3309	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3310	register const Py_UNICODE *e;
				3311	int cased, previous_is_cased;
				3312
				3313	if (!PyArg_NoArgs(args))
				3314	return NULL;
				3315
				3316	/* Shortcut for single character strings */
				3317	if (PyUnicode_GET_SIZE(self) == 1)
				3318	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3319	(Py_UNICODE_ISUPPER(*p) != 0));
				3320
				3321	e = p + PyUnicode_GET_SIZE(self);
				3322	cased = 0;
				3323	previous_is_cased = 0;
				3324	for (; p < e; p++) {
				3325	register const Py_UNICODE ch = *p;
				3326
				3327	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3328	if (previous_is_cased)
				3329	return PyInt_FromLong(0);
				3330	previous_is_cased = 1;
				3331	cased = 1;
				3332	}
				3333	else if (Py_UNICODE_ISLOWER(ch)) {
				3334	if (!previous_is_cased)
				3335	return PyInt_FromLong(0);
				3336	previous_is_cased = 1;
				3337	cased = 1;
				3338	}
				3339	else
				3340	previous_is_cased = 0;
				3341	}
				3342	return PyInt_FromLong(cased);
				3343	}
				3344
				3345	static char isspace__doc__[] =
				3346	"S.isspace() -> int\n\
				3347	\n\
				3348	Return 1 if there are only whitespace characters in S,\n\
				3349	0 otherwise.";
				3350
				3351	static PyObject*
				3352	unicode_isspace(PyUnicodeObject self, PyObject args)
				3353	{
				3354	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3355	register const Py_UNICODE *e;
				3356
				3357	if (!PyArg_NoArgs(args))
				3358	return NULL;
				3359
				3360	/* Shortcut for single character strings */
				3361	if (PyUnicode_GET_SIZE(self) == 1 &&
				3362	Py_UNICODE_ISSPACE(*p))
				3363	return PyInt_FromLong(1);
				3364
				3365	e = p + PyUnicode_GET_SIZE(self);
				3366	for (; p < e; p++) {
				3367	if (!Py_UNICODE_ISSPACE(*p))
				3368	return PyInt_FromLong(0);
				3369	}
				3370	return PyInt_FromLong(1);
				3371	}
				3372
				3373	static char isdecimal__doc__[] =
				3374	"S.isdecimal() -> int\n\
				3375	\n\
				3376	Return 1 if there are only decimal characters in S,\n\
				3377	0 otherwise.";
				3378
				3379	static PyObject*
				3380	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3381	{
				3382	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3383	register const Py_UNICODE *e;
				3384
				3385	if (!PyArg_NoArgs(args))
				3386	return NULL;
				3387
				3388	/* Shortcut for single character strings */
				3389	if (PyUnicode_GET_SIZE(self) == 1 &&
				3390	Py_UNICODE_ISDECIMAL(*p))
				3391	return PyInt_FromLong(1);
				3392
				3393	e = p + PyUnicode_GET_SIZE(self);
				3394	for (; p < e; p++) {
				3395	if (!Py_UNICODE_ISDECIMAL(*p))
				3396	return PyInt_FromLong(0);
				3397	}
				3398	return PyInt_FromLong(1);
				3399	}
				3400
				3401	static char isdigit__doc__[] =
				3402	"S.isdigit() -> int\n\
				3403	\n\
				3404	Return 1 if there are only digit characters in S,\n\
				3405	0 otherwise.";
				3406
				3407	static PyObject*
				3408	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3409	{
				3410	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3411	register const Py_UNICODE *e;
				3412
				3413	if (!PyArg_NoArgs(args))
				3414	return NULL;
				3415
				3416	/* Shortcut for single character strings */
				3417	if (PyUnicode_GET_SIZE(self) == 1 &&
				3418	Py_UNICODE_ISDIGIT(*p))
				3419	return PyInt_FromLong(1);
				3420
				3421	e = p + PyUnicode_GET_SIZE(self);
				3422	for (; p < e; p++) {
				3423	if (!Py_UNICODE_ISDIGIT(*p))
				3424	return PyInt_FromLong(0);
				3425	}
				3426	return PyInt_FromLong(1);
				3427	}
				3428
				3429	static char isnumeric__doc__[] =
				3430	"S.isnumeric() -> int\n\
				3431	\n\
				3432	Return 1 if there are only numeric characters in S,\n\
				3433	0 otherwise.";
				3434
				3435	static PyObject*
				3436	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3437	{
				3438	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3439	register const Py_UNICODE *e;
				3440
				3441	if (!PyArg_NoArgs(args))
				3442	return NULL;
				3443
				3444	/* Shortcut for single character strings */
				3445	if (PyUnicode_GET_SIZE(self) == 1 &&
				3446	Py_UNICODE_ISNUMERIC(*p))
				3447	return PyInt_FromLong(1);
				3448
				3449	e = p + PyUnicode_GET_SIZE(self);
				3450	for (; p < e; p++) {
				3451	if (!Py_UNICODE_ISNUMERIC(*p))
				3452	return PyInt_FromLong(0);
				3453	}
				3454	return PyInt_FromLong(1);
				3455	}
				3456
				3457	static char join__doc__[] =
				3458	"S.join(sequence) -> unicode\n\
				3459	\n\
				3460	Return a string which is the concatenation of the strings in the\n\
				3461	sequence. The separator between elements is S.";
				3462
				3463	static PyObject*
				3464	unicode_join(PyUnicodeObject self, PyObject args)
				3465	{
				3466	PyObject *data;
				3467	if (!PyArg_ParseTuple(args, "O:join", &data))
				3468	return NULL;
				3469
				3470	return PyUnicode_Join((PyObject *)self, data);
				3471	}
				3472
				3473	static int
				3474	unicode_length(PyUnicodeObject *self)
				3475	{
				3476	return self->length;
				3477	}
				3478
				3479	static char ljust__doc__[] =
				3480	"S.ljust(width) -> unicode\n\
				3481	\n\
				3482	Return S left justified in a Unicode string of length width. Padding is\n\
				3483	done using spaces.";
				3484
				3485	static PyObject *
				3486	unicode_ljust(PyUnicodeObject self, PyObject args)
				3487	{
				3488	int width;
				3489	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3490	return NULL;
				3491
				3492	if (self->length >= width) {
				3493	Py_INCREF(self);
				3494	return (PyObject*) self;
				3495	}
				3496
				3497	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3498	}
				3499
				3500	static char lower__doc__[] =
				3501	"S.lower() -> unicode\n\
				3502	\n\
				3503	Return a copy of the string S converted to lowercase.";
				3504
				3505	static PyObject*
				3506	unicode_lower(PyUnicodeObject self, PyObject args)
				3507	{
				3508	if (!PyArg_NoArgs(args))
				3509	return NULL;
				3510	return fixup(self, fixlower);
				3511	}
				3512
				3513	static char lstrip__doc__[] =
				3514	"S.lstrip() -> unicode\n\
				3515	\n\
				3516	Return a copy of the string S with leading whitespace removed.";
				3517
				3518	static PyObject *
				3519	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3520	{
				3521	if (!PyArg_NoArgs(args))
				3522	return NULL;
				3523	return strip(self, 1, 0);
				3524	}
				3525
				3526	static PyObject*
				3527	unicode_repeat(PyUnicodeObject *str, int len)
				3528	{
				3529	PyUnicodeObject *u;
				3530	Py_UNICODE *p;
				3531
				3532	if (len < 0)
				3533	len = 0;
				3534
				3535	if (len == 1) {
				3536	/* no repeat, return original string */
				3537	Py_INCREF(str);
				3538	return (PyObject*) str;
				3539	}
				3540
				3541	u = _PyUnicode_New(len * str->length);
				3542	if (!u)
				3543	return NULL;
				3544
				3545	p = u->str;
				3546
				3547	while (len-- > 0) {
				3548	Py_UNICODE_COPY(p, str->str, str->length);
				3549	p += str->length;
				3550	}
				3551
				3552	return (PyObject*) u;
				3553	}
				3554
				3555	PyObject PyUnicode_Replace(PyObject obj,
				3556	PyObject *subobj,
				3557	PyObject *replobj,
				3558	int maxcount)
				3559	{
				3560	PyObject *self;
				3561	PyObject *str1;
				3562	PyObject *str2;
				3563	PyObject *result;
				3564
				3565	self = PyUnicode_FromObject(obj);
				3566	if (self == NULL)
				3567	return NULL;
				3568	str1 = PyUnicode_FromObject(subobj);
				3569	if (str1 == NULL) {
				3570	Py_DECREF(self);
				3571	return NULL;
				3572	}
				3573	str2 = PyUnicode_FromObject(replobj);
				3574	if (str2 == NULL) {
				3575	Py_DECREF(self);
				3576	Py_DECREF(str1);
				3577	return NULL;
				3578	}
				3579	result = replace((PyUnicodeObject *)self,
				3580	(PyUnicodeObject *)str1,
				3581	(PyUnicodeObject *)str2,
				3582	maxcount);
				3583	Py_DECREF(self);
				3584	Py_DECREF(str1);
				3585	Py_DECREF(str2);
				3586	return result;
				3587	}
				3588
				3589	static char replace__doc__[] =
				3590	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3591	\n\
				3592	Return a copy of S with all occurrences of substring\n\
				3593	old replaced by new. If the optional argument maxsplit is\n\
				3594	given, only the first maxsplit occurrences are replaced.";
				3595
				3596	static PyObject*
				3597	unicode_replace(PyUnicodeObject self, PyObject args)
				3598	{
				3599	PyUnicodeObject *str1;
				3600	PyUnicodeObject *str2;
				3601	int maxcount = -1;
				3602	PyObject *result;
				3603
				3604	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3605	return NULL;
				3606	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3607	if (str1 == NULL)
				3608	return NULL;
				3609	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3610	if (str2 == NULL)
				3611	return NULL;
				3612
				3613	result = replace(self, str1, str2, maxcount);
				3614
				3615	Py_DECREF(str1);
				3616	Py_DECREF(str2);
				3617	return result;
				3618	}
				3619
				3620	static
				3621	PyObject unicode_repr(PyObject unicode)
				3622	{
				3623	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3624	PyUnicode_GET_SIZE(unicode),
				3625	1);
				3626	}
				3627
				3628	static char rfind__doc__[] =
				3629	"S.rfind(sub [,start [,end]]) -> int\n\
				3630	\n\
				3631	Return the highest index in S where substring sub is found,\n\
				3632	such that sub is contained within s[start,end]. Optional\n\
				3633	arguments start and end are interpreted as in slice notation.\n\
				3634	\n\
				3635	Return -1 on failure.";
				3636
				3637	static PyObject *
				3638	unicode_rfind(PyUnicodeObject self, PyObject args)
				3639	{
				3640	PyUnicodeObject *substring;
				3641	int start = 0;
				3642	int end = INT_MAX;
				3643	PyObject *result;
				3644
				3645	if (!PyArg_ParseTuple(args, "O\|ii:rfind", &substring, &start, &end))
				3646	return NULL;
				3647	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3648	(PyObject *)substring);
				3649	if (substring == NULL)
				3650	return NULL;
				3651
				3652	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3653
				3654	Py_DECREF(substring);
				3655	return result;
				3656	}
				3657
				3658	static char rindex__doc__[] =
				3659	"S.rindex(sub [,start [,end]]) -> int\n\
				3660	\n\
				3661	Like S.rfind() but raise ValueError when the substring is not found.";
				3662
				3663	static PyObject *
				3664	unicode_rindex(PyUnicodeObject self, PyObject args)
				3665	{
				3666	int result;
				3667	PyUnicodeObject *substring;
				3668	int start = 0;
				3669	int end = INT_MAX;
				3670
				3671	if (!PyArg_ParseTuple(args, "O\|ii:rindex", &substring, &start, &end))
				3672	return NULL;
				3673	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3674	(PyObject *)substring);
				3675	if (substring == NULL)
				3676	return NULL;
				3677
				3678	result = findstring(self, substring, start, end, -1);
				3679
				3680	Py_DECREF(substring);
				3681	if (result < 0) {
				3682	PyErr_SetString(PyExc_ValueError, "substring not found");
				3683	return NULL;
				3684	}
				3685	return PyInt_FromLong(result);
				3686	}
				3687
				3688	static char rjust__doc__[] =
				3689	"S.rjust(width) -> unicode\n\
				3690	\n\
				3691	Return S right justified in a Unicode string of length width. Padding is\n\
				3692	done using spaces.";
				3693
				3694	static PyObject *
				3695	unicode_rjust(PyUnicodeObject self, PyObject args)
				3696	{
				3697	int width;
				3698	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3699	return NULL;
				3700
				3701	if (self->length >= width) {
				3702	Py_INCREF(self);
				3703	return (PyObject*) self;
				3704	}
				3705
				3706	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3707	}
				3708
				3709	static char rstrip__doc__[] =
				3710	"S.rstrip() -> unicode\n\
				3711	\n\
				3712	Return a copy of the string S with trailing whitespace removed.";
				3713
				3714	static PyObject *
				3715	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3716	{
				3717	if (!PyArg_NoArgs(args))
				3718	return NULL;
				3719	return strip(self, 0, 1);
				3720	}
				3721
				3722	static PyObject*
				3723	unicode_slice(PyUnicodeObject *self, int start, int end)
				3724	{
				3725	/* standard clamping */
				3726	if (start < 0)
				3727	start = 0;
				3728	if (end < 0)
				3729	end = 0;
				3730	if (end > self->length)
				3731	end = self->length;
				3732	if (start == 0 && end == self->length) {
				3733	/* full slice, return original string */
				3734	Py_INCREF(self);
				3735	return (PyObject*) self;
				3736	}
				3737	if (start > end)
				3738	start = end;
				3739	/* copy slice */
				3740	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3741	end - start);
				3742	}
				3743
				3744	PyObject PyUnicode_Split(PyObject s,
				3745	PyObject *sep,
				3746	int maxsplit)
				3747	{
				3748	PyObject *result;
				3749
				3750	s = PyUnicode_FromObject(s);
				3751	if (s == NULL)
				3752	return NULL;
				3753	if (sep != NULL) {
				3754	sep = PyUnicode_FromObject(sep);
				3755	if (sep == NULL) {
				3756	Py_DECREF(s);
				3757	return NULL;
				3758	}
				3759	}
				3760
				3761	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3762
				3763	Py_DECREF(s);
				3764	Py_XDECREF(sep);
				3765	return result;
				3766	}
				3767
				3768	static char split__doc__[] =
				3769	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3770	\n\
				3771	Return a list of the words in S, using sep as the\n\
				3772	delimiter string. If maxsplit is given, at most maxsplit\n\
				3773	splits are done. If sep is not specified, any whitespace string\n\
				3774	is a separator.";
				3775
				3776	static PyObject*
				3777	unicode_split(PyUnicodeObject self, PyObject args)
				3778	{
				3779	PyObject *substring = Py_None;
				3780	int maxcount = -1;
				3781
				3782	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3783	return NULL;
				3784
				3785	if (substring == Py_None)
				3786	return split(self, NULL, maxcount);
				3787	else if (PyUnicode_Check(substring))
				3788	return split(self, (PyUnicodeObject *)substring, maxcount);
				3789	else
				3790	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3791	}
				3792
				3793	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3794	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3795	\n\
				3796	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3797	Line breaks are not included in the resulting list unless keepends\n\
				3798	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3799
				3800	static PyObject*
				3801	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3802	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3803	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3804
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3805	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3806	return NULL;
				3807
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3808	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3809	}
				3810
				3811	static
				3812	PyObject unicode_str(PyUnicodeObject self)
				3813	{
				3814	return PyUnicode_AsUTF8String((PyObject *)self);
				3815	}
				3816
				3817	static char strip__doc__[] =
				3818	"S.strip() -> unicode\n\
				3819	\n\
				3820	Return a copy of S with leading and trailing whitespace removed.";
				3821
				3822	static PyObject *
				3823	unicode_strip(PyUnicodeObject self, PyObject args)
				3824	{
				3825	if (!PyArg_NoArgs(args))
				3826	return NULL;
				3827	return strip(self, 1, 1);
				3828	}
				3829
				3830	static char swapcase__doc__[] =
				3831	"S.swapcase() -> unicode\n\
				3832	\n\
				3833	Return a copy of S with uppercase characters converted to lowercase\n\
				3834	and vice versa.";
				3835
				3836	static PyObject*
				3837	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3838	{
				3839	if (!PyArg_NoArgs(args))
				3840	return NULL;
				3841	return fixup(self, fixswapcase);
				3842	}
				3843
				3844	static char translate__doc__[] =
				3845	"S.translate(table) -> unicode\n\
				3846	\n\
				3847	Return a copy of the string S, where all characters have been mapped\n\
				3848	through the given translation table, which must be a mapping of\n\
				3849	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3850	are left untouched. Characters mapped to None are deleted.";
				3851
				3852	static PyObject*
				3853	unicode_translate(PyUnicodeObject self, PyObject args)
				3854	{
				3855	PyObject *table;
				3856
				3857	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3858	return NULL;
				3859	return PyUnicode_TranslateCharmap(self->str,
				3860	self->length,
				3861	table,
				3862	"ignore");
				3863	}
				3864
				3865	static char upper__doc__[] =
				3866	"S.upper() -> unicode\n\
				3867	\n\
				3868	Return a copy of S converted to uppercase.";
				3869
				3870	static PyObject*
				3871	unicode_upper(PyUnicodeObject self, PyObject args)
				3872	{
				3873	if (!PyArg_NoArgs(args))
				3874	return NULL;
				3875	return fixup(self, fixupper);
				3876	}
				3877
				3878	#if 0
				3879	static char zfill__doc__[] =
				3880	"S.zfill(width) -> unicode\n\
				3881	\n\
				3882	Pad a numeric string x with zeros on the left, to fill a field\n\
				3883	of the specified width. The string x is never truncated.";
				3884
				3885	static PyObject *
				3886	unicode_zfill(PyUnicodeObject self, PyObject args)
				3887	{
				3888	int fill;
				3889	PyUnicodeObject *u;
				3890
				3891	int width;
				3892	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				3893	return NULL;
				3894
				3895	if (self->length >= width) {
				3896	Py_INCREF(self);
				3897	return (PyObject*) self;
				3898	}
				3899
				3900	fill = width - self->length;
				3901
				3902	u = pad(self, fill, 0, '0');
				3903
				3904	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				3905	/* move sign to beginning of string */
				3906	u->str[0] = u->str[fill];
				3907	u->str[fill] = '0';
				3908	}
				3909
				3910	return (PyObject*) u;
				3911	}
				3912	#endif
				3913
				3914	#if 0
				3915	static PyObject*
				3916	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				3917	{
				3918	if (!PyArg_NoArgs(args))
				3919	return NULL;
				3920	return PyInt_FromLong(unicode_freelist_size);
				3921	}
				3922	#endif
				3923
				3924	static char startswith__doc__[] =
				3925	"S.startswith(prefix[, start[, end]]) -> int\n\
				3926	\n\
				3927	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3928	optional start, test S beginning at that position. With optional end, stop\n\
				3929	comparing S at that position.";
				3930
				3931	static PyObject *
				3932	unicode_startswith(PyUnicodeObject *self,
				3933	PyObject *args)
				3934	{
				3935	PyUnicodeObject *substring;
				3936	int start = 0;
				3937	int end = INT_MAX;
				3938	PyObject *result;
				3939
				3940	if (!PyArg_ParseTuple(args, "O\|ii:startswith", &substring, &start, &end))
				3941	return NULL;
				3942	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3943	(PyObject *)substring);
				3944	if (substring == NULL)
				3945	return NULL;
				3946
				3947	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				3948
				3949	Py_DECREF(substring);
				3950	return result;
				3951	}
				3952
				3953
				3954	static char endswith__doc__[] =
				3955	"S.endswith(suffix[, start[, end]]) -> int\n\
				3956	\n\
				3957	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				3958	optional start, test S beginning at that position. With optional end, stop\n\
				3959	comparing S at that position.";
				3960
				3961	static PyObject *
				3962	unicode_endswith(PyUnicodeObject *self,
				3963	PyObject *args)
				3964	{
				3965	PyUnicodeObject *substring;
				3966	int start = 0;
				3967	int end = INT_MAX;
				3968	PyObject *result;
				3969
				3970	if (!PyArg_ParseTuple(args, "O\|ii:endswith", &substring, &start, &end))
				3971	return NULL;
				3972	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3973	(PyObject *)substring);
				3974	if (substring == NULL)
				3975	return NULL;
				3976
				3977	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				3978
				3979	Py_DECREF(substring);
				3980	return result;
				3981	}
				3982
				3983
				3984	static PyMethodDef unicode_methods[] = {
				3985
				3986	/* Order is according to common usage: often used methods should
				3987	appear first, since lookup is done sequentially. */
				3988
				3989	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				3990	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				3991	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				3992	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				3993	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				3994	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				3995	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				3996	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				3997	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				3998	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				3999	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4000	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4001	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4002	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4003	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4004	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4005	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4006	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4007	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4008	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4009	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4010	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4011	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4012	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4013	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4014	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4015	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4016	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4017	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4018	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4019	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4020	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4021	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				4022	#if 0
				4023	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4024	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4025	#endif
				4026
				4027	#if 0
				4028	/* This one is just used for debugging the implementation. */
				4029	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4030	#endif
				4031
				4032	{NULL, NULL}
				4033	};
				4034
				4035	static PyObject *
				4036	unicode_getattr(PyUnicodeObject self, char name)
				4037	{
				4038	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4039	}
				4040
				4041	static PySequenceMethods unicode_as_sequence = {
				4042	(inquiry) unicode_length, /* sq_length */
				4043	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4044	(intargfunc) unicode_repeat, /* sq_repeat */
				4045	(intargfunc) unicode_getitem, /* sq_item */
				4046	(intintargfunc) unicode_slice, /* sq_slice */
				4047	0, /* sq_ass_item */
				4048	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4049	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4050	};
				4051
				4052	static int
				4053	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4054	int index,
				4055	const void **ptr)
				4056	{
				4057	if (index != 0) {
				4058	PyErr_SetString(PyExc_SystemError,
				4059	"accessing non-existent unicode segment");
				4060	return -1;
				4061	}
				4062	ptr = (void ) self->str;
				4063	return PyUnicode_GET_DATA_SIZE(self);
				4064	}
				4065
				4066	static int
				4067	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4068	const void **ptr)
				4069	{
				4070	PyErr_SetString(PyExc_TypeError,
				4071	"cannot use unicode as modifyable buffer");
				4072	return -1;
				4073	}
				4074
				4075	static int
				4076	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4077	int *lenp)
				4078	{
				4079	if (lenp)
				4080	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4081	return 1;
				4082	}
				4083
				4084	static int
				4085	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4086	int index,
				4087	const void **ptr)
				4088	{
				4089	PyObject *str;
				4090
				4091	if (index != 0) {
				4092	PyErr_SetString(PyExc_SystemError,
				4093	"accessing non-existent unicode segment");
				4094	return -1;
				4095	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4096	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4097	if (str == NULL)
				4098	return -1;
				4099	ptr = (void ) PyString_AS_STRING(str);
				4100	return PyString_GET_SIZE(str);
				4101	}
				4102
				4103	/* Helpers for PyUnicode_Format() */
				4104
				4105	static PyObject *
				4106	getnextarg(args, arglen, p_argidx)
				4107	PyObject *args;
				4108	int arglen;
				4109	int *p_argidx;
				4110	{
				4111	int argidx = *p_argidx;
				4112	if (argidx < arglen) {
				4113	(*p_argidx)++;
				4114	if (arglen < 0)
				4115	return args;
				4116	else
				4117	return PyTuple_GetItem(args, argidx);
				4118	}
				4119	PyErr_SetString(PyExc_TypeError,
				4120	"not enough arguments for format string");
				4121	return NULL;
				4122	}
				4123
				/* Bit flags recording which modifier characters appeared in a format
				   specifier; PyUnicode_Format() sets them while scanning. */
				#define F_LJUST (1 << 0)        /* '-' seen (also set for a negative '*' width) */
				#define F_SIGN  (1 << 1)        /* '+' seen */
				#define F_BLANK (1 << 2)        /* ' ' seen */
				#define F_ALT   (1 << 3)        /* '#' seen */
				#define F_ZERO  (1 << 4)        /* '0' seen */
				4129
				4130	static
				4131	#ifdef HAVE_STDARG_PROTOTYPES
				4132	int usprintf(register Py_UNICODE buffer, char format, ...)
				4133	#else
				4134	int usprintf(va_alist) va_dcl
				4135	#endif
				4136	{
				4137	register int i;
				4138	int len;
				4139	va_list va;
				4140	char *charbuffer;
				4141	#ifdef HAVE_STDARG_PROTOTYPES
				4142	va_start(va, format);
				4143	#else
				4144	Py_UNICODE *args;
				4145	char *format;
				4146
				4147	va_start(va);
				4148	buffer = va_arg(va, Py_UNICODE *);
				4149	format = va_arg(va, char *);
				4150	#endif
				4151
				4152	/* First, format the string as char array, then expand to Py_UNICODE
				4153	array. */
				4154	charbuffer = (char *)buffer;
				4155	len = vsprintf(charbuffer, format, va);
				4156	for (i = len - 1; i >= 0; i--)
				4157	buffer[i] = (Py_UNICODE) charbuffer[i];
				4158
				4159	va_end(va);
				4160	return len;
				4161	}
				4162
				4163	static int
				4164	formatfloat(Py_UNICODE *buf,
				4165	int flags,
				4166	int prec,
				4167	int type,
				4168	PyObject *v)
				4169	{
				4170	char fmt[20];
				4171	double x;
				4172
				4173	x = PyFloat_AsDouble(v);
				4174	if (x == -1.0 && PyErr_Occurred())
				4175	return -1;
				4176	if (prec < 0)
				4177	prec = 6;
				4178	if (prec > 50)
				4179	prec = 50; /* Arbitrary limitation */
				4180	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4181	type = 'g';
				4182	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4183	return usprintf(buf, fmt, x);
				4184	}
				4185
				4186	static int
				4187	formatint(Py_UNICODE *buf,
				4188	int flags,
				4189	int prec,
				4190	int type,
				4191	PyObject *v)
				4192	{
				4193	char fmt[20];
				4194	long x;
				4195
				4196	x = PyInt_AsLong(v);
				4197	if (x == -1 && PyErr_Occurred())
				4198	return -1;
				4199	if (prec < 0)
				4200	prec = 1;
				4201	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4202	return usprintf(buf, fmt, x);
				4203	}
				4204
				4205	static int
				4206	formatchar(Py_UNICODE *buf,
				4207	PyObject *v)
				4208	{
				4209	if (PyUnicode_Check(v))
				4210	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4211
				4212	else if (PyString_Check(v))
				4213	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4214
				4215	else {
				4216	/* Integer input truncated to a character */
				4217	long x;
				4218	x = PyInt_AsLong(v);
				4219	if (x == -1 && PyErr_Occurred())
				4220	return -1;
				4221	buf[0] = (char) x;
				4222	}
				4223	buf[1] = '\0';
				4224	return 1;
				4225	}
				4226
				4227	PyObject PyUnicode_Format(PyObject format,
				4228	PyObject *args)
				4229	{
				4230	Py_UNICODE fmt, res;
				4231	int fmtcnt, rescnt, reslen, arglen, argidx;
				4232	int args_owned = 0;
				4233	PyUnicodeObject *result = NULL;
				4234	PyObject *dict = NULL;
				4235	PyObject *uformat;
				4236
				4237	if (format == NULL \|\| args == NULL) {
				4238	PyErr_BadInternalCall();
				4239	return NULL;
				4240	}
				4241	uformat = PyUnicode_FromObject(format);
				4242	fmt = PyUnicode_AS_UNICODE(uformat);
				4243	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4244
				4245	reslen = rescnt = fmtcnt + 100;
				4246	result = _PyUnicode_New(reslen);
				4247	if (result == NULL)
				4248	goto onError;
				4249	res = PyUnicode_AS_UNICODE(result);
				4250
				4251	if (PyTuple_Check(args)) {
				4252	arglen = PyTuple_Size(args);
				4253	argidx = 0;
				4254	}
				4255	else {
				4256	arglen = -1;
				4257	argidx = -2;
				4258	}
				4259	if (args->ob_type->tp_as_mapping)
				4260	dict = args;
				4261
				4262	while (--fmtcnt >= 0) {
				4263	if (*fmt != '%') {
				4264	if (--rescnt < 0) {
				4265	rescnt = fmtcnt + 100;
				4266	reslen += rescnt;
				4267	if (_PyUnicode_Resize(result, reslen) < 0)
				4268	return NULL;
				4269	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4270	--rescnt;
				4271	}
				4272	res++ = fmt++;
				4273	}
				4274	else {
				4275	/* Got a format specifier */
				4276	int flags = 0;
				4277	int width = -1;
				4278	int prec = -1;
				4279	int size = 0;
				4280	Py_UNICODE c = '\0';
				4281	Py_UNICODE fill;
				4282	PyObject *v = NULL;
				4283	PyObject *temp = NULL;
				4284	Py_UNICODE *buf;
				4285	Py_UNICODE sign;
				4286	int len;
				4287	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4288
				4289	fmt++;
				4290	if (*fmt == '(') {
				4291	Py_UNICODE *keystart;
				4292	int keylen;
				4293	PyObject *key;
				4294	int pcount = 1;
				4295
				4296	if (dict == NULL) {
				4297	PyErr_SetString(PyExc_TypeError,
				4298	"format requires a mapping");
				4299	goto onError;
				4300	}
				4301	++fmt;
				4302	--fmtcnt;
				4303	keystart = fmt;
				4304	/* Skip over balanced parentheses */
				4305	while (pcount > 0 && --fmtcnt >= 0) {
				4306	if (*fmt == ')')
				4307	--pcount;
				4308	else if (*fmt == '(')
				4309	++pcount;
				4310	fmt++;
				4311	}
				4312	keylen = fmt - keystart - 1;
				4313	if (fmtcnt < 0 \|\| pcount > 0) {
				4314	PyErr_SetString(PyExc_ValueError,
				4315	"incomplete format key");
				4316	goto onError;
				4317	}
				4318	/* keys are converted to strings (using UTF-8) and
				4319	then looked up since Python uses strings to hold
				4320	variables names etc. in its namespaces and we
				4321	wouldn't want to break common idioms. The
				4322	alternative would be using Unicode objects for the
				4323	lookup but u"abc" and "abc" have different hash
				4324	values (on purpose). */
				4325	key = PyUnicode_EncodeUTF8(keystart,
				4326	keylen,
				4327	NULL);
				4328	if (key == NULL)
				4329	goto onError;
				4330	if (args_owned) {
				4331	Py_DECREF(args);
				4332	args_owned = 0;
				4333	}
				4334	args = PyObject_GetItem(dict, key);
				4335	Py_DECREF(key);
				4336	if (args == NULL) {
				4337	goto onError;
				4338	}
				4339	args_owned = 1;
				4340	arglen = -1;
				4341	argidx = -2;
				4342	}
				4343	while (--fmtcnt >= 0) {
				4344	switch (c = *fmt++) {
				4345	case '-': flags \|= F_LJUST; continue;
				4346	case '+': flags \|= F_SIGN; continue;
				4347	case ' ': flags \|= F_BLANK; continue;
				4348	case '#': flags \|= F_ALT; continue;
				4349	case '0': flags \|= F_ZERO; continue;
				4350	}
				4351	break;
				4352	}
				4353	if (c == '*') {
				4354	v = getnextarg(args, arglen, &argidx);
				4355	if (v == NULL)
				4356	goto onError;
				4357	if (!PyInt_Check(v)) {
				4358	PyErr_SetString(PyExc_TypeError,
				4359	"* wants int");
				4360	goto onError;
				4361	}
				4362	width = PyInt_AsLong(v);
				4363	if (width < 0) {
				4364	flags \|= F_LJUST;
				4365	width = -width;
				4366	}
				4367	if (--fmtcnt >= 0)
				4368	c = *fmt++;
				4369	}
				4370	else if (c >= '0' && c <= '9') {
				4371	width = c - '0';
				4372	while (--fmtcnt >= 0) {
				4373	c = *fmt++;
				4374	if (c < '0' \|\| c > '9')
				4375	break;
				4376	if ((width*10) / 10 != width) {
				4377	PyErr_SetString(PyExc_ValueError,
				4378	"width too big");
				4379	goto onError;
				4380	}
				4381	width = width*10 + (c - '0');
				4382	}
				4383	}
				4384	if (c == '.') {
				4385	prec = 0;
				4386	if (--fmtcnt >= 0)
				4387	c = *fmt++;
				4388	if (c == '*') {
				4389	v = getnextarg(args, arglen, &argidx);
				4390	if (v == NULL)
				4391	goto onError;
				4392	if (!PyInt_Check(v)) {
				4393	PyErr_SetString(PyExc_TypeError,
				4394	"* wants int");
				4395	goto onError;
				4396	}
				4397	prec = PyInt_AsLong(v);
				4398	if (prec < 0)
				4399	prec = 0;
				4400	if (--fmtcnt >= 0)
				4401	c = *fmt++;
				4402	}
				4403	else if (c >= '0' && c <= '9') {
				4404	prec = c - '0';
				4405	while (--fmtcnt >= 0) {
				4406	c = Py_CHARMASK(*fmt++);
				4407	if (c < '0' \|\| c > '9')
				4408	break;
				4409	if ((prec*10) / 10 != prec) {
				4410	PyErr_SetString(PyExc_ValueError,
				4411	"prec too big");
				4412	goto onError;
				4413	}
				4414	prec = prec*10 + (c - '0');
				4415	}
				4416	}
				4417	} /* prec */
				4418	if (fmtcnt >= 0) {
				4419	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4420	size = c;
				4421	if (--fmtcnt >= 0)
				4422	c = *fmt++;
				4423	}
				4424	}
				4425	if (fmtcnt < 0) {
				4426	PyErr_SetString(PyExc_ValueError,
				4427	"incomplete format");
				4428	goto onError;
				4429	}
				4430	if (c != '%') {
				4431	v = getnextarg(args, arglen, &argidx);
				4432	if (v == NULL)
				4433	goto onError;
				4434	}
				4435	sign = 0;
				4436	fill = ' ';
				4437	switch (c) {
				4438
				4439	case '%':
				4440	buf = tmpbuf;
				4441	buf[0] = '%';
				4442	len = 1;
				4443	break;
				4444
				4445	case 's':
				4446	case 'r':
				4447	if (PyUnicode_Check(v) && c == 's') {
				4448	temp = v;
				4449	Py_INCREF(temp);
				4450	}
				4451	else {
				4452	PyObject *unicode;
				4453	if (c == 's')
				4454	temp = PyObject_Str(v);
				4455	else
				4456	temp = PyObject_Repr(v);
				4457	if (temp == NULL)
				4458	goto onError;
				4459	if (!PyString_Check(temp)) {
				4460	/* XXX Note: this should never happen, since
				4461	PyObject_Repr() and PyObject_Str() assure
				4462	this */
				4463	Py_DECREF(temp);
				4464	PyErr_SetString(PyExc_TypeError,
				4465	"%s argument has non-string str()");
				4466	goto onError;
				4467	}
				4468	unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
				4469	PyString_GET_SIZE(temp),
				4470	"strict");
				4471	Py_DECREF(temp);
				4472	temp = unicode;
				4473	if (temp == NULL)
				4474	goto onError;
				4475	}
				4476	buf = PyUnicode_AS_UNICODE(temp);
				4477	len = PyUnicode_GET_SIZE(temp);
				4478	if (prec >= 0 && len > prec)
				4479	len = prec;
				4480	break;
				4481
				4482	case 'i':
				4483	case 'd':
				4484	case 'u':
				4485	case 'o':
				4486	case 'x':
				4487	case 'X':
				4488	if (c == 'i')
				4489	c = 'd';
				4490	buf = tmpbuf;
				4491	len = formatint(buf, flags, prec, c, v);
				4492	if (len < 0)
				4493	goto onError;
				4494	sign = (c == 'd');
				4495	if (flags & F_ZERO) {
				4496	fill = '0';
				4497	if ((flags&F_ALT) &&
				4498	(c == 'x' \|\| c == 'X') &&
				4499	buf[0] == '0' && buf[1] == c) {
				4500	res++ = buf++;
				4501	res++ = buf++;
				4502	rescnt -= 2;
				4503	len -= 2;
				4504	width -= 2;
				4505	if (width < 0)
				4506	width = 0;
				4507	}
				4508	}
				4509	break;
				4510
				4511	case 'e':
				4512	case 'E':
				4513	case 'f':
				4514	case 'g':
				4515	case 'G':
				4516	buf = tmpbuf;
				4517	len = formatfloat(buf, flags, prec, c, v);
				4518	if (len < 0)
				4519	goto onError;
				4520	sign = 1;
				4521	if (flags&F_ZERO)
				4522	fill = '0';
				4523	break;
				4524
				4525	case 'c':
				4526	buf = tmpbuf;
				4527	len = formatchar(buf, v);
				4528	if (len < 0)
				4529	goto onError;
				4530	break;
				4531
				4532	default:
				4533	PyErr_Format(PyExc_ValueError,
				4534	"unsupported format character '%c' (0x%x)",
				4535	c, c);
				4536	goto onError;
				4537	}
				4538	if (sign) {
				4539	if (buf == '-' \|\| buf == '+') {
				4540	sign = *buf++;
				4541	len--;
				4542	}
				4543	else if (flags & F_SIGN)
				4544	sign = '+';
				4545	else if (flags & F_BLANK)
				4546	sign = ' ';
				4547	else
				4548	sign = 0;
				4549	}
				4550	if (width < len)
				4551	width = len;
				4552	if (rescnt < width + (sign != 0)) {
				4553	reslen -= rescnt;
				4554	rescnt = width + fmtcnt + 100;
				4555	reslen += rescnt;
				4556	if (_PyUnicode_Resize(result, reslen) < 0)
				4557	return NULL;
				4558	res = PyUnicode_AS_UNICODE(result)
				4559	+ reslen - rescnt;
				4560	}
				4561	if (sign) {
				4562	if (fill != ' ')
				4563	*res++ = sign;
				4564	rescnt--;
				4565	if (width > len)
				4566	width--;
				4567	}
				4568	if (width > len && !(flags & F_LJUST)) {
				4569	do {
				4570	--rescnt;
				4571	*res++ = fill;
				4572	} while (--width > len);
				4573	}
				4574	if (sign && fill == ' ')
				4575	*res++ = sign;
				4576	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4577	res += len;
				4578	rescnt -= len;
				4579	while (--width >= len) {
				4580	--rescnt;
				4581	*res++ = ' ';
				4582	}
				4583	if (dict && (argidx < arglen) && c != '%') {
				4584	PyErr_SetString(PyExc_TypeError,
				4585	"not all arguments converted");
				4586	goto onError;
				4587	}
				4588	Py_XDECREF(temp);
				4589	} /* '%' */
				4590	} /* until end */
				4591	if (argidx < arglen && !dict) {
				4592	PyErr_SetString(PyExc_TypeError,
				4593	"not all arguments converted");
				4594	goto onError;
				4595	}
				4596
				4597	if (args_owned) {
				4598	Py_DECREF(args);
				4599	}
				4600	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4601	if (_PyUnicode_Resize(result, reslen - rescnt))
				4602	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4603	return (PyObject *)result;
				4604
				4605	onError:
				4606	Py_XDECREF(result);
				4607	Py_DECREF(uformat);
				4608	if (args_owned) {
				4609	Py_DECREF(args);
				4610	}
				4611	return NULL;
				4612	}
				4613
				4614	static PyBufferProcs unicode_as_buffer = {
				4615	(getreadbufferproc) unicode_buffer_getreadbuf,
				4616	(getwritebufferproc) unicode_buffer_getwritebuf,
				4617	(getsegcountproc) unicode_buffer_getsegcount,
				4618	(getcharbufferproc) unicode_buffer_getcharbuf,
				4619	};
				4620
				4621	PyTypeObject PyUnicode_Type = {
				4622	PyObject_HEAD_INIT(&PyType_Type)
				4623	0, /* ob_size */
				4624	"unicode", /* tp_name */
				4625	sizeof(PyUnicodeObject), /* tp_size */
				4626	0, /* tp_itemsize */
				4627	/* Slots */
				4628	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4629	0, /* tp_print */
				4630	(getattrfunc)unicode_getattr, /* tp_getattr */
				4631	0, /* tp_setattr */
				4632	(cmpfunc) unicode_compare, /* tp_compare */
				4633	(reprfunc) unicode_repr, /* tp_repr */
				4634	0, /* tp_as_number */
				4635	&unicode_as_sequence, /* tp_as_sequence */
				4636	0, /* tp_as_mapping */
				4637	(hashfunc) unicode_hash, /* tp_hash*/
				4638	0, /* tp_call*/
				4639	(reprfunc) unicode_str, /* tp_str */
				4640	(getattrofunc) NULL, /* tp_getattro */
				4641	(setattrofunc) NULL, /* tp_setattro */
				4642	&unicode_as_buffer, /* tp_as_buffer */
				4643	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4644	};
				4645
				4646	/* Initialize the Unicode implementation */
				4647
				4648	void _PyUnicode_Init()
				4649	{
				4650	/* Doublecheck the configuration... */
				4651	if (sizeof(Py_UNICODE) != 2)
				4652	Py_FatalError("Unicode configuration error: "
				4653	"sizeof(Py_UNICODE) != 2 bytes");
				4654
				4655	unicode_empty = _PyUnicode_New(0);
				4656	}
				4657
				4658	/* Finalize the Unicode implementation */
				4659
				4660	void
				4661	_PyUnicode_Fini()
				4662	{
				4663	PyUnicodeObject *u = unicode_freelist;
				4664
				4665	while (u != NULL) {
				4666	PyUnicodeObject *v = u;
				4667	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4668	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4669	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4670	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4671	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4672	}
				4673	Py_XDECREF(unicode_empty);
				4674	}