Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: da12da264481ab30cd62d7ae9ac4ffacfbd7e2a0 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
				4	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
				76	/* Limit for the Unicode object free list */
				77
				78	#define MAX_UNICODE_FREELIST_SIZE 1024
				79
				80	/* Limit for the Unicode object free list stay alive optimization.
				81
				82	The implementation will keep allocated Unicode memory intact for
				83	all objects on the free list having a size less than this
				84	limit. This reduces malloc() overhead for small Unicode objects.
				85
				86	At worse this will result in MAX_UNICODE_FREELIST_SIZE *
				87	(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
				88	malloc()-overhead) bytes of unused garbage.
				89
				90	Setting the limit to 0 effectively turns the feature off.
				91
				92	XXX The feature is currently turned off because there are
				93	apparently some lingering bugs in its implementation which I
				94	haven't yet been able to sort out.
				95
				96	*/
				97
				98	#define STAYALIVE_SIZE_LIMIT 0
				99
				100	/* Endianness switches; defaults to little endian */
				101
				102	#ifdef WORDS_BIGENDIAN
				103	# define BYTEORDER_IS_BIG_ENDIAN
				104	#else
				105	# define BYTEORDER_IS_LITTLE_ENDIAN
				106	#endif
				107
				108	/* --- Globals ------------------------------------------------------------ */
				109
				110	/* The empty Unicode object */
				111	static PyUnicodeObject *unicode_empty = NULL;
				112
				113	/* Free list for Unicode objects */
				114	static PyUnicodeObject *unicode_freelist = NULL;
				115	static int unicode_freelist_size = 0;
				116
				117	/* --- Unicode Object ----------------------------------------------------- */
				118
				119	static
				120	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				121	int length)
				122	{
				123	void *oldstr;
				124
				125	/* Shortcut if there's nothing to do. */
				126	if (unicode->length == length)
				127	return 0;
				128
				129	/* Resizing unicode_empty is not allowed. */
				130	if (unicode == unicode_empty) {
				131	PyErr_SetString(PyExc_SystemError,
				132	"can't resize empty unicode object");
				133	return -1;
				134	}
				135
				136	/* We allocate one more byte to make sure the string is
				137	Ux0000 terminated -- XXX is this needed ? */
				138	oldstr = unicode->str;
				139	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				140	if (!unicode->str) {
				141	unicode->str = oldstr;
				142	PyErr_NoMemory();
				143	return -1;
				144	}
				145	unicode->str[length] = 0;
				146	unicode->length = length;
				147
				148	/* Reset the object caches */
				149	if (unicode->utf8str) {
				150	Py_DECREF(unicode->utf8str);
				151	unicode->utf8str = NULL;
				152	}
				153	unicode->hash = -1;
				154
				155	return 0;
				156	}
				157
				158	/* We allocate one more byte to make sure the string is
				159	Ux0000 terminated -- XXX is this needed ?
				160
				161	XXX This allocator could further be enhanced by assuring that the
				162	free list never reduces its size below 1.
				163
				164	*/
				165
				166	static
				167	PyUnicodeObject *_PyUnicode_New(int length)
				168	{
				169	register PyUnicodeObject *unicode;
				170
				171	/* Optimization for empty strings */
				172	if (length == 0 && unicode_empty != NULL) {
				173	Py_INCREF(unicode_empty);
				174	return unicode_empty;
				175	}
				176
				177	/* Unicode freelist & memory allocation */
				178	if (unicode_freelist) {
				179	unicode = unicode_freelist;
				180	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				181	unicode_freelist_size--;
				182	unicode->ob_type = &PyUnicode_Type;
				183	_Py_NewReference(unicode);
				184	if (unicode->str) {
				185	if (unicode->length < length &&
				186	_PyUnicode_Resize(unicode, length)) {
				187	free(unicode->str);
				188	PyMem_DEL(unicode);
				189	return NULL;
				190	}
				191	}
				192	else
				193	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				194	}
				195	else {
				196	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				197	if (unicode == NULL)
				198	return NULL;
				199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				200	}
				201
				202	if (!unicode->str) {
				203	PyMem_DEL(unicode);
				204	PyErr_NoMemory();
				205	return NULL;
				206	}
				207	unicode->str[length] = 0;
				208	unicode->length = length;
				209	unicode->hash = -1;
				210	unicode->utf8str = NULL;
				211	return unicode;
				212	}
				213
				214	static
				215	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				216	{
				217	Py_XDECREF(unicode->utf8str);
				218	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
				219	if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
				220	free(unicode->str);
				221	unicode->str = NULL;
				222	unicode->length = 0;
				223	}
				224	(PyUnicodeObject *)unicode = unicode_freelist;
				225	unicode_freelist = unicode;
				226	unicode_freelist_size++;
				227	_Py_ForgetReference(unicode);
				228	}
				229	else {
				230	free(unicode->str);
				231	PyMem_DEL(unicode);
				232	}
				233	}
				234
				235	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				236	int size)
				237	{
				238	PyUnicodeObject *unicode;
				239
				240	unicode = _PyUnicode_New(size);
				241	if (!unicode)
				242	return NULL;
				243
				244	/* Copy the Unicode data into the new object */
				245	if (u != NULL)
				246	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				247
				248	return (PyObject *)unicode;
				249	}
				250
				251	#ifdef HAVE_WCHAR_H
				252
				253	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				254	int size)
				255	{
				256	PyUnicodeObject *unicode;
				257
				258	if (w == NULL) {
				259	PyErr_BadInternalCall();
				260	return NULL;
				261	}
				262
				263	unicode = _PyUnicode_New(size);
				264	if (!unicode)
				265	return NULL;
				266
				267	/* Copy the wchar_t data into the new object */
				268	#ifdef HAVE_USABLE_WCHAR_T
				269	memcpy(unicode->str, w, size * sizeof(wchar_t));
				270	#else
				271	{
				272	register Py_UNICODE *u;
				273	register int i;
				274	u = PyUnicode_AS_UNICODE(unicode);
				275	for (i = size; i >= 0; i--)
				276	u++ = w++;
				277	}
				278	#endif
				279
				280	return (PyObject *)unicode;
				281	}
				282
				283	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				284	register wchar_t *w,
				285	int size)
				286	{
				287	if (unicode == NULL) {
				288	PyErr_BadInternalCall();
				289	return -1;
				290	}
				291	if (size > PyUnicode_GET_SIZE(unicode))
				292	size = PyUnicode_GET_SIZE(unicode);
				293	#ifdef HAVE_USABLE_WCHAR_T
				294	memcpy(w, unicode->str, size * sizeof(wchar_t));
				295	#else
				296	{
				297	register Py_UNICODE *u;
				298	register int i;
				299	u = PyUnicode_AS_UNICODE(unicode);
				300	for (i = size; i >= 0; i--)
				301	w++ = u++;
				302	}
				303	#endif
				304
				305	return size;
				306	}
				307
				308	#endif
				309
				310	PyObject PyUnicode_FromObject(register PyObject obj)
				311	{
				312	const char *s;
				313	int len;
				314
				315	if (obj == NULL) {
				316	PyErr_BadInternalCall();
				317	return NULL;
				318	}
				319	else if (PyUnicode_Check(obj)) {
				320	Py_INCREF(obj);
				321	return obj;
				322	}
				323	else if (PyString_Check(obj)) {
				324	s = PyString_AS_STRING(obj);
				325	len = PyString_GET_SIZE(obj);
				326	}
				327	else if (PyObject_AsCharBuffer(obj, &s, &len))
				328	return NULL;
				329	if (len == 0) {
				330	Py_INCREF(unicode_empty);
				331	return (PyObject *)unicode_empty;
				332	}
				333	return PyUnicode_DecodeUTF8(s, len, "strict");
				334	}
				335
				336	PyObject PyUnicode_Decode(const char s,
				337	int size,
				338	const char *encoding,
				339	const char *errors)
				340	{
				341	PyObject buffer = NULL, unicode;
				342
				343	/* Shortcut for the default encoding UTF-8 */
				344	if (encoding == NULL \|\|
				345	(strcmp(encoding, "utf-8") == 0))
				346	return PyUnicode_DecodeUTF8(s, size, errors);
				347
				348	/* Decode via the codec registry */
				349	buffer = PyBuffer_FromMemory((void *)s, size);
				350	if (buffer == NULL)
				351	goto onError;
				352	unicode = PyCodec_Decode(buffer, encoding, errors);
				353	if (unicode == NULL)
				354	goto onError;
				355	if (!PyUnicode_Check(unicode)) {
				356	PyErr_Format(PyExc_TypeError,
				357	"decoder did not return an unicode object (type=%s)",
				358	unicode->ob_type->tp_name);
				359	Py_DECREF(unicode);
				360	goto onError;
				361	}
				362	Py_DECREF(buffer);
				363	return unicode;
				364
				365	onError:
				366	Py_XDECREF(buffer);
				367	return NULL;
				368	}
				369
				370	PyObject PyUnicode_Encode(const Py_UNICODE s,
				371	int size,
				372	const char *encoding,
				373	const char *errors)
				374	{
				375	PyObject v, unicode;
				376
				377	unicode = PyUnicode_FromUnicode(s, size);
				378	if (unicode == NULL)
				379	return NULL;
				380	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				381	Py_DECREF(unicode);
				382	return v;
				383	}
				384
				385	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				386	const char *encoding,
				387	const char *errors)
				388	{
				389	PyObject *v;
				390
				391	if (!PyUnicode_Check(unicode)) {
				392	PyErr_BadArgument();
				393	goto onError;
				394	}
				395	/* Shortcut for the default encoding UTF-8 */
				396	if ((encoding == NULL \|\|
				397	(strcmp(encoding, "utf-8") == 0)) &&
				398	errors == NULL)
				399	return PyUnicode_AsUTF8String(unicode);
				400
				401	/* Encode via the codec registry */
				402	v = PyCodec_Encode(unicode, encoding, errors);
				403	if (v == NULL)
				404	goto onError;
				405	/* XXX Should we really enforce this ? */
				406	if (!PyString_Check(v)) {
				407	PyErr_Format(PyExc_TypeError,
				408	"encoder did not return a string object (type=%s)",
				409	v->ob_type->tp_name);
				410	Py_DECREF(v);
				411	goto onError;
				412	}
				413	return v;
				414
				415	onError:
				416	return NULL;
				417	}
				418
				419	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				420	{
				421	if (!PyUnicode_Check(unicode)) {
				422	PyErr_BadArgument();
				423	goto onError;
				424	}
				425	return PyUnicode_AS_UNICODE(unicode);
				426
				427	onError:
				428	return NULL;
				429	}
				430
				431	int PyUnicode_GetSize(PyObject *unicode)
				432	{
				433	if (!PyUnicode_Check(unicode)) {
				434	PyErr_BadArgument();
				435	goto onError;
				436	}
				437	return PyUnicode_GET_SIZE(unicode);
				438
				439	onError:
				440	return -1;
				441	}
				442
				443	/* --- UTF-8 Codec -------------------------------------------------------- */
				444
				445	static
				446	char utf8_code_length[256] = {
				447	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				448	illegal prefix. see RFC 2279 for details */
				449	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				450	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				451	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				452	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				453	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				454	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				455	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				456	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				457	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				458	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				459	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				460	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				461	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				462	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				463	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				464	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				465	};
				466
				467	static
				468	int utf8_decoding_error(const char **source,
				469	Py_UNICODE **dest,
				470	const char *errors,
				471	const char *details)
				472	{
				473	if ((errors == NULL) \|\|
				474	(strcmp(errors,"strict") == 0)) {
				475	PyErr_Format(PyExc_UnicodeError,
				476	"UTF-8 decoding error: %s",
				477	details);
				478	return -1;
				479	}
				480	else if (strcmp(errors,"ignore") == 0) {
				481	(*source)++;
				482	return 0;
				483	}
				484	else if (strcmp(errors,"replace") == 0) {
				485	(*source)++;
				486	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				487	(*dest)++;
				488	return 0;
				489	}
				490	else {
				491	PyErr_Format(PyExc_ValueError,
				492	"UTF-8 decoding error; unkown error handling code: %s",
				493	errors);
				494	return -1;
				495	}
				496	}
				497
				498	#define UTF8_ERROR(details) do { \
				499	if (utf8_decoding_error(&s, &p, errors, details)) \
				500	goto onError; \
				501	continue; \
				502	} while (0)
				503
				504	PyObject PyUnicode_DecodeUTF8(const char s,
				505	int size,
				506	const char *errors)
				507	{
				508	int n;
				509	const char *e;
				510	PyUnicodeObject *unicode;
				511	Py_UNICODE *p;
				512
				513	/* Note: size will always be longer than the resulting Unicode
				514	character count */
				515	unicode = _PyUnicode_New(size);
				516	if (!unicode)
				517	return NULL;
				518	if (size == 0)
				519	return (PyObject *)unicode;
				520
				521	/* Unpack UTF-8 encoded data */
				522	p = unicode->str;
				523	e = s + size;
				524
				525	while (s < e) {
				526	register Py_UNICODE ch = (unsigned char)*s;
				527
				528	if (ch < 0x80) {
				529	*p++ = ch;
				530	s++;
				531	continue;
				532	}
				533
				534	n = utf8_code_length[ch];
				535
				536	if (s + n > e)
				537	UTF8_ERROR("unexpected end of data");
				538
				539	switch (n) {
				540
				541	case 0:
				542	UTF8_ERROR("unexpected code byte");
				543	break;
				544
				545	case 1:
				546	UTF8_ERROR("internal error");
				547	break;
				548
				549	case 2:
				550	if ((s[1] & 0xc0) != 0x80)
				551	UTF8_ERROR("invalid data");
				552	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				553	if (ch < 0x80)
				554	UTF8_ERROR("illegal encoding");
				555	else
				556	*p++ = ch;
				557	break;
				558
				559	case 3:
				560	if ((s[1] & 0xc0) != 0x80 \|\|
				561	(s[2] & 0xc0) != 0x80)
				562	UTF8_ERROR("invalid data");
				563	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				564	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				565	UTF8_ERROR("illegal encoding");
				566	else
				567	*p++ = ch;
				568	break;
				569
				570	default:
				571	/* Other sizes are only needed for UCS-4 */
				572	UTF8_ERROR("unsupported Unicode code range");
				573	}
				574	s += n;
				575	}
				576
				577	/* Adjust length */
				578	if (_PyUnicode_Resize(unicode, p - unicode->str))
				579	goto onError;
				580
				581	return (PyObject *)unicode;
				582
				583	onError:
				584	Py_DECREF(unicode);
				585	return NULL;
				586	}
				587
				588	#undef UTF8_ERROR
				589
				590	static
				591	int utf8_encoding_error(const Py_UNICODE **source,
				592	char **dest,
				593	const char *errors,
				594	const char *details)
				595	{
				596	if ((errors == NULL) \|\|
				597	(strcmp(errors,"strict") == 0)) {
				598	PyErr_Format(PyExc_UnicodeError,
				599	"UTF-8 encoding error: %s",
				600	details);
				601	return -1;
				602	}
				603	else if (strcmp(errors,"ignore") == 0) {
				604	return 0;
				605	}
				606	else if (strcmp(errors,"replace") == 0) {
				607	**dest = '?';
				608	(*dest)++;
				609	return 0;
				610	}
				611	else {
				612	PyErr_Format(PyExc_ValueError,
				613	"UTF-8 encoding error; "
				614	"unkown error handling code: %s",
				615	errors);
				616	return -1;
				617	}
				618	}
				619
				620	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				621	int size,
				622	const char *errors)
				623	{
				624	PyObject *v;
				625	char *p;
				626	char *q;
				627
				628	v = PyString_FromStringAndSize(NULL, 3 * size);
				629	if (v == NULL)
				630	return NULL;
				631	if (size == 0)
				632	goto done;
				633
				634	p = q = PyString_AS_STRING(v);
				635	while (size-- > 0) {
				636	Py_UNICODE ch = *s++;
				637	if (ch < 0x80)
				638	*p++ = (char) ch;
				639	else if (ch < 0x0800) {
				640	*p++ = 0xc0 \| (ch >> 6);
				641	*p++ = 0x80 \| (ch & 0x3f);
				642	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				643	/* These byte ranges are reserved for UTF-16 surrogate
				644	bytes which the Python implementation currently does
				645	not support. */
				646	printf("code range problem: U+%04x\n", ch);
				647	if (utf8_encoding_error(&s, &p, errors,
				648	"unsupported code range"))
				649	goto onError;
				650	} else {
				651	*p++ = 0xe0 \| (ch >> 12);
				652	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				653	*p++ = 0x80 \| (ch & 0x3f);
				654	}
				655	}
				656	*p = '\0';
				657	_PyString_Resize(&v, p - q);
				658
				659	done:
				660	return v;
				661
				662	onError:
				663	Py_DECREF(v);
				664	return NULL;
				665	}
				666
				667	/* Return a Python string holding the UTF-8 encoded value of the
				668	Unicode object.
				669
				670	The resulting string is cached in the Unicode object for subsequent
				671	usage by this function. The cached version is needed to implement
				672	the character buffer interface.
				673
				674	The refcount of the string is not incremented.
				675
				676	*/
				677
				678	static
				679	PyObject utf8_string(PyUnicodeObject self,
				680	const char *errors)
				681	{
				682	PyObject *v = self->utf8str;
				683
				684	if (v)
				685	return v;
				686	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
				687	PyUnicode_GET_SIZE(self),
				688	errors);
				689	if (v && errors == NULL)
				690	self->utf8str = v;
				691	return v;
				692	}
				693
				694	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				695	{
				696	PyObject *str;
				697
				698	if (!PyUnicode_Check(unicode)) {
				699	PyErr_BadArgument();
				700	return NULL;
				701	}
				702	str = utf8_string((PyUnicodeObject *)unicode, NULL);
				703	if (str == NULL)
				704	return NULL;
				705	Py_INCREF(str);
				706	return str;
				707	}
				708
				709	/* --- UTF-16 Codec ------------------------------------------------------- */
				710
				711	static
				712	int utf16_decoding_error(const Py_UNICODE **source,
				713	Py_UNICODE **dest,
				714	const char *errors,
				715	const char *details)
				716	{
				717	if ((errors == NULL) \|\|
				718	(strcmp(errors,"strict") == 0)) {
				719	PyErr_Format(PyExc_UnicodeError,
				720	"UTF-16 decoding error: %s",
				721	details);
				722	return -1;
				723	}
				724	else if (strcmp(errors,"ignore") == 0) {
				725	return 0;
				726	}
				727	else if (strcmp(errors,"replace") == 0) {
				728	if (dest) {
				729	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				730	(*dest)++;
				731	}
				732	return 0;
				733	}
				734	else {
				735	PyErr_Format(PyExc_ValueError,
				736	"UTF-16 decoding error; unkown error handling code: %s",
				737	errors);
				738	return -1;
				739	}
				740	}
				741
				742	#define UTF16_ERROR(details) do { \
				743	if (utf16_decoding_error(&q, &p, errors, details)) \
				744	goto onError; \
				745	continue; \
				746	} while(0)
				747
				748	PyObject PyUnicode_DecodeUTF16(const char s,
				749	int size,
				750	const char *errors,
				751	int *byteorder)
				752	{
				753	PyUnicodeObject *unicode;
				754	Py_UNICODE *p;
				755	const Py_UNICODE q, e;
				756	int bo = 0;
				757
				758	/* size should be an even number */
				759	if (size % sizeof(Py_UNICODE) != 0) {
				760	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				761	return NULL;
				762	/* The remaining input chars are ignored if we fall through
				763	here... */
				764	}
				765
				766	/* Note: size will always be longer than the resulting Unicode
				767	character count */
				768	unicode = _PyUnicode_New(size);
				769	if (!unicode)
				770	return NULL;
				771	if (size == 0)
				772	return (PyObject *)unicode;
				773
				774	/* Unpack UTF-16 encoded data */
				775	p = unicode->str;
				776	q = (Py_UNICODE *)s;
				777	e = q + (size / sizeof(Py_UNICODE));
				778
				779	if (byteorder)
				780	bo = *byteorder;
				781
				782	while (q < e) {
				783	register Py_UNICODE ch = *q++;
				784
				785	/* Check for BOM marks (U+FEFF) in the input and adjust
				786	current byte order setting accordingly. Swap input
				787	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				788	!) */
				789	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				790	if (ch == 0xFEFF) {
				791	bo = -1;
				792	continue;
				793	} else if (ch == 0xFFFE) {
				794	bo = 1;
				795	continue;
				796	}
				797	if (bo == 1)
				798	ch = (ch >> 8) \| (ch << 8);
				799	#else
				800	if (ch == 0xFEFF) {
				801	bo = 1;
				802	continue;
				803	} else if (ch == 0xFFFE) {
				804	bo = -1;
				805	continue;
				806	}
				807	if (bo == -1)
				808	ch = (ch >> 8) \| (ch << 8);
				809	#endif
				810	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				811	*p++ = ch;
				812	continue;
				813	}
				814
				815	/* UTF-16 code pair: */
				816	if (q >= e)
				817	UTF16_ERROR("unexpected end of data");
				818	if (0xDC00 <= q && q <= 0xDFFF) {
				819	q++;
				820	if (0xD800 <= q && q <= 0xDBFF)
				821	/* This is valid data (a UTF-16 surrogate pair), but
				822	we are not able to store this information since our
				823	Py_UNICODE type only has 16 bits... this might
				824	change someday, even though it's unlikely. */
				825	UTF16_ERROR("code pairs are not supported");
				826	else
				827	continue;
				828	}
				829	UTF16_ERROR("illegal encoding");
				830	}
				831
				832	if (byteorder)
				833	*byteorder = bo;
				834
				835	/* Adjust length */
				836	if (_PyUnicode_Resize(unicode, p - unicode->str))
				837	goto onError;
				838
				839	return (PyObject *)unicode;
				840
				841	onError:
				842	Py_DECREF(unicode);
				843	return NULL;
				844	}
				845
				846	#undef UTF16_ERROR
				847
				848	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				849	int size,
				850	const char *errors,
				851	int byteorder)
				852	{
				853	PyObject *v;
				854	Py_UNICODE *p;
				855	char *q;
				856
				857	/* We don't create UTF-16 pairs... */
				858	v = PyString_FromStringAndSize(NULL,
				859	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				860	if (v == NULL)
				861	return NULL;
				862	if (size == 0)
				863	goto done;
				864
				865	q = PyString_AS_STRING(v);
				866	p = (Py_UNICODE *)q;
				867
				868	if (byteorder == 0)
				869	*p++ = 0xFEFF;
				870	if (byteorder == 0 \|\|
				871	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				872	byteorder == -1
				873	#else
				874	byteorder == 1
				875	#endif
				876	)
				877	memcpy(p, s, size * sizeof(Py_UNICODE));
				878	else
				879	while (size-- > 0) {
				880	Py_UNICODE ch = *s++;
				881	*p++ = (ch >> 8) \| (ch << 8);
				882	}
				883	done:
				884	return v;
				885	}
				886
				887	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				888	{
				889	if (!PyUnicode_Check(unicode)) {
				890	PyErr_BadArgument();
				891	return NULL;
				892	}
				893	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				894	PyUnicode_GET_SIZE(unicode),
				895	NULL,
				896	0);
				897	}
				898
				899	/* --- Unicode Escape Codec ----------------------------------------------- */
				900
				901	static
				902	int unicodeescape_decoding_error(const char **source,
				903	unsigned int *x,
				904	const char *errors,
				905	const char *details)
				906	{
				907	if ((errors == NULL) \|\|
				908	(strcmp(errors,"strict") == 0)) {
				909	PyErr_Format(PyExc_UnicodeError,
				910	"Unicode-Escape decoding error: %s",
				911	details);
				912	return -1;
				913	}
				914	else if (strcmp(errors,"ignore") == 0) {
				915	return 0;
				916	}
				917	else if (strcmp(errors,"replace") == 0) {
				918	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				919	return 0;
				920	}
				921	else {
				922	PyErr_Format(PyExc_ValueError,
				923	"Unicode-Escape decoding error; "
				924	"unkown error handling code: %s",
				925	errors);
				926	return -1;
				927	}
				928	}
				929
				930	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				931	int size,
				932	const char *errors)
				933	{
				934	PyUnicodeObject *v;
				935	Py_UNICODE p = NULL, buf = NULL;
				936	const char *end;
				937
				938	/* Escaped strings will always be longer than the resulting
				939	Unicode string, so we start with size here and then reduce the
				940	length after conversion to the true value. */
				941	v = _PyUnicode_New(size);
				942	if (v == NULL)
				943	goto onError;
				944	if (size == 0)
				945	return (PyObject *)v;
				946	p = buf = PyUnicode_AS_UNICODE(v);
				947	end = s + size;
				948	while (s < end) {
				949	unsigned char c;
				950	unsigned int x;
				951	int i;
				952
				953	/* Non-escape characters are interpreted as Unicode ordinals */
				954	if (*s != '\\') {
				955	p++ = (unsigned char)s++;
				956	continue;
				957	}
				958
				959	/* \ - Escapes */
				960	s++;
				961	switch (*s++) {
				962
				963	/* \x escapes */
				964	case '\n': break;
				965	case '\\': *p++ = '\\'; break;
				966	case '\'': *p++ = '\''; break;
				967	case '\"': *p++ = '\"'; break;
				968	case 'b': *p++ = '\b'; break;
				969	case 'f': p++ = '\014'; break; / FF */
				970	case 't': *p++ = '\t'; break;
				971	case 'n': *p++ = '\n'; break;
				972	case 'r': *p++ = '\r'; break;
				973	case 'v': p++ = '\013'; break; / VT */
				974	case 'a': p++ = '\007'; break; / BEL, not classic C */
				975
				976	/* \OOO (octal) escapes */
				977	case '0': case '1': case '2': case '3':
				978	case '4': case '5': case '6': case '7':
				979	c = s[-1] - '0';
				980	if ('0' <= s && s <= '7') {
				981	c = (c<<3) + *s++ - '0';
				982	if ('0' <= s && s <= '7')
				983	c = (c<<3) + *s++ - '0';
				984	}
				985	*p++ = c;
				986	break;
				987
				988	/* \xXXXX escape with 0-4 hex digits */
				989	case 'x':
				990	x = 0;
				991	c = (unsigned char)*s;
				992	if (isxdigit(c)) {
				993	do {
				994	x = (x<<4) & ~0xF;
				995	if ('0' <= c && c <= '9')
				996	x += c - '0';
				997	else if ('a' <= c && c <= 'f')
				998	x += 10 + c - 'a';
				999	else
				1000	x += 10 + c - 'A';
				1001	c = (unsigned char)*++s;
				1002	} while (isxdigit(c));
				1003	*p++ = x;
				1004	} else {
				1005	*p++ = '\\';
				1006	*p++ = (unsigned char)s[-1];
				1007	}
				1008	break;
				1009
				1010	/* \uXXXX with 4 hex digits */
				1011	case 'u':
				1012	for (x = 0, i = 0; i < 4; i++) {
				1013	c = (unsigned char)s[i];
				1014	if (!isxdigit(c)) {
				1015	if (unicodeescape_decoding_error(&s, &x, errors,
				1016	"truncated \\uXXXX"))
				1017	goto onError;
				1018	i++;
				1019	break;
				1020	}
				1021	x = (x<<4) & ~0xF;
				1022	if (c >= '0' && c <= '9')
				1023	x += c - '0';
				1024	else if (c >= 'a' && c <= 'f')
				1025	x += 10 + c - 'a';
				1026	else
				1027	x += 10 + c - 'A';
				1028	}
				1029	s += i;
				1030	*p++ = x;
				1031	break;
				1032
				1033	default:
				1034	*p++ = '\\';
				1035	*p++ = (unsigned char)s[-1];
				1036	break;
				1037	}
				1038	}
				1039	_PyUnicode_Resize(v, (int)(p - buf));
				1040	return (PyObject *)v;
				1041
				1042	onError:
				1043	Py_XDECREF(v);
				1044	return NULL;
				1045	}
				1046
				1047	/* Return a Unicode-Escape string version of the Unicode object.
				1048
				1049	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1050	appropriate.
				1051
				1052	*/
				1053
				1054	static
				1055	PyObject unicodeescape_string(const Py_UNICODE s,
				1056	int size,
				1057	int quotes)
				1058	{
				1059	PyObject *repr;
				1060	char *p;
				1061	char *q;
				1062
				1063	static const char *hexdigit = "0123456789ABCDEF";
				1064
				1065	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1066	if (repr == NULL)
				1067	return NULL;
				1068
				1069	p = q = PyString_AS_STRING(repr);
				1070
				1071	if (quotes) {
				1072	static const Py_UNICODE findchar(const Py_UNICODE s,
				1073	int size,
				1074	Py_UNICODE ch);
				1075	*p++ = 'u';
				1076	*p++ = (findchar(s, size, '\'') &&
				1077	!findchar(s, size, '"')) ? '"' : '\'';
				1078	}
				1079	while (size-- > 0) {
				1080	Py_UNICODE ch = *s++;
				1081	/* Escape quotes */
				1082	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1083	*p++ = '\\';
				1084	*p++ = (char) ch;
				1085	}
				1086	/* Map 16-bit characters to '\uxxxx' */
				1087	else if (ch >= 256) {
				1088	*p++ = '\\';
				1089	*p++ = 'u';
				1090	*p++ = hexdigit[(ch >> 12) & 0xf];
				1091	*p++ = hexdigit[(ch >> 8) & 0xf];
				1092	*p++ = hexdigit[(ch >> 4) & 0xf];
				1093	*p++ = hexdigit[ch & 15];
				1094	}
				1095	/* Map non-printable US ASCII to '\ooo' */
				1096	else if (ch < ' ' \|\| ch >= 128) {
				1097	*p++ = '\\';
				1098	*p++ = hexdigit[(ch >> 6) & 7];
				1099	*p++ = hexdigit[(ch >> 3) & 7];
				1100	*p++ = hexdigit[ch & 7];
				1101	}
				1102	/* Copy everything else as-is */
				1103	else
				1104	*p++ = (char) ch;
				1105	}
				1106	if (quotes)
				1107	*p++ = q[1];
				1108
				1109	*p = '\0';
				1110	_PyString_Resize(&repr, p - q);
				1111
				1112	return repr;
				1113	}
				1114
				1115	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1116	int size)
				1117	{
				1118	return unicodeescape_string(s, size, 0);
				1119	}
				1120
				1121	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1122	{
				1123	if (!PyUnicode_Check(unicode)) {
				1124	PyErr_BadArgument();
				1125	return NULL;
				1126	}
				1127	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1128	PyUnicode_GET_SIZE(unicode));
				1129	}
				1130
				1131	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1132
				1133	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1134	int size,
				1135	const char *errors)
				1136	{
				1137	PyUnicodeObject *v;
				1138	Py_UNICODE p, buf;
				1139	const char *end;
				1140	const char *bs;
				1141
				1142	/* Escaped strings will always be longer than the resulting
				1143	Unicode string, so we start with size here and then reduce the
				1144	length after conversion to the true value. */
				1145	v = _PyUnicode_New(size);
				1146	if (v == NULL)
				1147	goto onError;
				1148	if (size == 0)
				1149	return (PyObject *)v;
				1150	p = buf = PyUnicode_AS_UNICODE(v);
				1151	end = s + size;
				1152	while (s < end) {
				1153	unsigned char c;
				1154	unsigned int x;
				1155	int i;
				1156
				1157	/* Non-escape characters are interpreted as Unicode ordinals */
				1158	if (*s != '\\') {
				1159	p++ = (unsigned char)s++;
				1160	continue;
				1161	}
				1162
				1163	/* \u-escapes are only interpreted iff the number of leading
				1164	backslashes if odd */
				1165	bs = s;
				1166	for (;s < end;) {
				1167	if (*s != '\\')
				1168	break;
				1169	p++ = (unsigned char)s++;
				1170	}
				1171	if (((s - bs) & 1) == 0 \|\|
				1172	s >= end \|\|
				1173	*s != 'u') {
				1174	continue;
				1175	}
				1176	p--;
				1177	s++;
				1178
				1179	/* \uXXXX with 4 hex digits */
				1180	for (x = 0, i = 0; i < 4; i++) {
				1181	c = (unsigned char)s[i];
				1182	if (!isxdigit(c)) {
				1183	if (unicodeescape_decoding_error(&s, &x, errors,
				1184	"truncated \\uXXXX"))
				1185	goto onError;
				1186	i++;
				1187	break;
				1188	}
				1189	x = (x<<4) & ~0xF;
				1190	if (c >= '0' && c <= '9')
				1191	x += c - '0';
				1192	else if (c >= 'a' && c <= 'f')
				1193	x += 10 + c - 'a';
				1194	else
				1195	x += 10 + c - 'A';
				1196	}
				1197	s += i;
				1198	*p++ = x;
				1199	}
				1200	_PyUnicode_Resize(v, (int)(p - buf));
				1201	return (PyObject *)v;
				1202
				1203	onError:
				1204	Py_XDECREF(v);
				1205	return NULL;
				1206	}
				1207
				1208	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1209	int size)
				1210	{
				1211	PyObject *repr;
				1212	char *p;
				1213	char *q;
				1214
				1215	static const char *hexdigit = "0123456789ABCDEF";
				1216
				1217	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1218	if (repr == NULL)
				1219	return NULL;
				1220
				1221	p = q = PyString_AS_STRING(repr);
				1222	while (size-- > 0) {
				1223	Py_UNICODE ch = *s++;
				1224	/* Map 16-bit characters to '\uxxxx' */
				1225	if (ch >= 256) {
				1226	*p++ = '\\';
				1227	*p++ = 'u';
				1228	*p++ = hexdigit[(ch >> 12) & 0xf];
				1229	*p++ = hexdigit[(ch >> 8) & 0xf];
				1230	*p++ = hexdigit[(ch >> 4) & 0xf];
				1231	*p++ = hexdigit[ch & 15];
				1232	}
				1233	/* Copy everything else as-is */
				1234	else
				1235	*p++ = (char) ch;
				1236	}
				1237	*p = '\0';
				1238	_PyString_Resize(&repr, p - q);
				1239
				1240	return repr;
				1241	}
				1242
				1243	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1244	{
				1245	if (!PyUnicode_Check(unicode)) {
				1246	PyErr_BadArgument();
				1247	return NULL;
				1248	}
				1249	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1250	PyUnicode_GET_SIZE(unicode));
				1251	}
				1252
				1253	/* --- Latin-1 Codec ------------------------------------------------------ */
				1254
				1255	PyObject PyUnicode_DecodeLatin1(const char s,
				1256	int size,
				1257	const char *errors)
				1258	{
				1259	PyUnicodeObject *v;
				1260	Py_UNICODE *p;
				1261
				1262	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1263	v = _PyUnicode_New(size);
				1264	if (v == NULL)
				1265	goto onError;
				1266	if (size == 0)
				1267	return (PyObject *)v;
				1268	p = PyUnicode_AS_UNICODE(v);
				1269	while (size-- > 0)
				1270	p++ = (unsigned char)s++;
				1271	return (PyObject *)v;
				1272
				1273	onError:
				1274	Py_XDECREF(v);
				1275	return NULL;
				1276	}
				1277
				1278	static
				1279	int latin1_encoding_error(const Py_UNICODE **source,
				1280	char **dest,
				1281	const char *errors,
				1282	const char *details)
				1283	{
				1284	if ((errors == NULL) \|\|
				1285	(strcmp(errors,"strict") == 0)) {
				1286	PyErr_Format(PyExc_UnicodeError,
				1287	"Latin-1 encoding error: %s",
				1288	details);
				1289	return -1;
				1290	}
				1291	else if (strcmp(errors,"ignore") == 0) {
				1292	return 0;
				1293	}
				1294	else if (strcmp(errors,"replace") == 0) {
				1295	**dest = '?';
				1296	return 0;
				1297	}
				1298	else {
				1299	PyErr_Format(PyExc_ValueError,
				1300	"Latin-1 encoding error; "
				1301	"unkown error handling code: %s",
				1302	errors);
				1303	return -1;
				1304	}
				1305	}
				1306
				1307	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1308	int size,
				1309	const char *errors)
				1310	{
				1311	PyObject *repr;
				1312	char *s;
				1313	repr = PyString_FromStringAndSize(NULL, size);
				1314	if (repr == NULL)
				1315	return NULL;
				1316
				1317	s = PyString_AS_STRING(repr);
				1318	while (size-- > 0) {
				1319	Py_UNICODE ch = *p++;
				1320	if (ch >= 256) {
				1321	if (latin1_encoding_error(&p, &s, errors,
				1322	"ordinal not in range(256)"))
				1323	goto onError;
				1324	}
				1325	else
				1326	*s++ = (char)ch;
				1327	}
				1328	return repr;
				1329
				1330	onError:
				1331	Py_DECREF(repr);
				1332	return NULL;
				1333	}
				1334
				1335	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1336	{
				1337	if (!PyUnicode_Check(unicode)) {
				1338	PyErr_BadArgument();
				1339	return NULL;
				1340	}
				1341	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1342	PyUnicode_GET_SIZE(unicode),
				1343	NULL);
				1344	}
				1345
				1346	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1347
				1348	static
				1349	int ascii_decoding_error(const char **source,
				1350	Py_UNICODE **dest,
				1351	const char *errors,
				1352	const char *details)
				1353	{
				1354	if ((errors == NULL) \|\|
				1355	(strcmp(errors,"strict") == 0)) {
				1356	PyErr_Format(PyExc_UnicodeError,
				1357	"ASCII decoding error: %s",
				1358	details);
				1359	return -1;
				1360	}
				1361	else if (strcmp(errors,"ignore") == 0) {
				1362	return 0;
				1363	}
				1364	else if (strcmp(errors,"replace") == 0) {
				1365	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1366	(*dest)++;
				1367	return 0;
				1368	}
				1369	else {
				1370	PyErr_Format(PyExc_ValueError,
				1371	"ASCII decoding error; "
				1372	"unkown error handling code: %s",
				1373	errors);
				1374	return -1;
				1375	}
				1376	}
				1377
				1378	PyObject PyUnicode_DecodeASCII(const char s,
				1379	int size,
				1380	const char *errors)
				1381	{
				1382	PyUnicodeObject *v;
				1383	Py_UNICODE *p;
				1384
				1385	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1386	v = _PyUnicode_New(size);
				1387	if (v == NULL)
				1388	goto onError;
				1389	if (size == 0)
				1390	return (PyObject *)v;
				1391	p = PyUnicode_AS_UNICODE(v);
				1392	while (size-- > 0) {
				1393	register unsigned char c;
				1394
				1395	c = (unsigned char)*s++;
				1396	if (c < 128)
				1397	*p++ = c;
				1398	else if (ascii_decoding_error(&s, &p, errors,
				1399	"ordinal not in range(128)"))
				1400	goto onError;
				1401	}
				1402	if (p - PyUnicode_AS_UNICODE(v) < size)
				1403	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1404	return (PyObject *)v;
				1405
				1406	onError:
				1407	Py_XDECREF(v);
				1408	return NULL;
				1409	}
				1410
				1411	static
				1412	int ascii_encoding_error(const Py_UNICODE **source,
				1413	char **dest,
				1414	const char *errors,
				1415	const char *details)
				1416	{
				1417	if ((errors == NULL) \|\|
				1418	(strcmp(errors,"strict") == 0)) {
				1419	PyErr_Format(PyExc_UnicodeError,
				1420	"ASCII encoding error: %s",
				1421	details);
				1422	return -1;
				1423	}
				1424	else if (strcmp(errors,"ignore") == 0) {
				1425	return 0;
				1426	}
				1427	else if (strcmp(errors,"replace") == 0) {
				1428	**dest = '?';
				1429	return 0;
				1430	}
				1431	else {
				1432	PyErr_Format(PyExc_ValueError,
				1433	"ASCII encoding error; "
				1434	"unkown error handling code: %s",
				1435	errors);
				1436	return -1;
				1437	}
				1438	}
				1439
				1440	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1441	int size,
				1442	const char *errors)
				1443	{
				1444	PyObject *repr;
				1445	char *s;
				1446	repr = PyString_FromStringAndSize(NULL, size);
				1447	if (repr == NULL)
				1448	return NULL;
				1449
				1450	s = PyString_AS_STRING(repr);
				1451	while (size-- > 0) {
				1452	Py_UNICODE ch = *p++;
				1453	if (ch >= 128) {
				1454	if (ascii_encoding_error(&p, &s, errors,
				1455	"ordinal not in range(128)"))
				1456	goto onError;
				1457	}
				1458	else
				1459	*s++ = (char)ch;
				1460	}
				1461	return repr;
				1462
				1463	onError:
				1464	Py_DECREF(repr);
				1465	return NULL;
				1466	}
				1467
				1468	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1469	{
				1470	if (!PyUnicode_Check(unicode)) {
				1471	PyErr_BadArgument();
				1472	return NULL;
				1473	}
				1474	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1475	PyUnicode_GET_SIZE(unicode),
				1476	NULL);
				1477	}
				1478
				1479	/* --- Character Mapping Codec -------------------------------------------- */
				1480
				1481	static
				1482	int charmap_decoding_error(const char **source,
				1483	Py_UNICODE **dest,
				1484	const char *errors,
				1485	const char *details)
				1486	{
				1487	if ((errors == NULL) \|\|
				1488	(strcmp(errors,"strict") == 0)) {
				1489	PyErr_Format(PyExc_UnicodeError,
				1490	"charmap decoding error: %s",
				1491	details);
				1492	return -1;
				1493	}
				1494	else if (strcmp(errors,"ignore") == 0) {
				1495	return 0;
				1496	}
				1497	else if (strcmp(errors,"replace") == 0) {
				1498	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1499	(*dest)++;
				1500	return 0;
				1501	}
				1502	else {
				1503	PyErr_Format(PyExc_ValueError,
				1504	"charmap decoding error; "
				1505	"unkown error handling code: %s",
				1506	errors);
				1507	return -1;
				1508	}
				1509	}
				1510
				1511	PyObject PyUnicode_DecodeCharmap(const char s,
				1512	int size,
				1513	PyObject *mapping,
				1514	const char *errors)
				1515	{
				1516	PyUnicodeObject *v;
				1517	Py_UNICODE *p;
				1518
				1519	/* Default to Latin-1 */
				1520	if (mapping == NULL)
				1521	return PyUnicode_DecodeLatin1(s, size, errors);
				1522
				1523	v = _PyUnicode_New(size);
				1524	if (v == NULL)
				1525	goto onError;
				1526	if (size == 0)
				1527	return (PyObject *)v;
				1528	p = PyUnicode_AS_UNICODE(v);
				1529	while (size-- > 0) {
				1530	unsigned char ch = *s++;
				1531	PyObject w, x;
				1532
				1533	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1534	w = PyInt_FromLong((long)ch);
				1535	if (w == NULL)
				1536	goto onError;
				1537	x = PyObject_GetItem(mapping, w);
				1538	Py_DECREF(w);
				1539	if (x == NULL) {
				1540	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1541	/* No mapping found: default to Latin-1 mapping */
				1542	PyErr_Clear();
				1543	*p++ = (Py_UNICODE)ch;
				1544	continue;
				1545	}
				1546	goto onError;
				1547	}
				1548
				1549	/* Apply mapping */
				1550	if (PyInt_Check(x)) {
				1551	int value = PyInt_AS_LONG(x);
				1552	if (value < 0 \|\| value > 65535) {
				1553	PyErr_SetString(PyExc_TypeError,
				1554	"character mapping must be in range(65336)");
				1555	Py_DECREF(x);
				1556	goto onError;
				1557	}
				1558	*p++ = (Py_UNICODE)value;
				1559	}
				1560	else if (x == Py_None) {
				1561	/* undefined mapping */
				1562	if (charmap_decoding_error(&s, &p, errors,
				1563	"character maps to <undefined>")) {
				1564	Py_DECREF(x);
				1565	goto onError;
				1566	}
				1567	}
				1568	else if (PyUnicode_Check(x)) {
				1569	if (PyUnicode_GET_SIZE(x) != 1) {
				1570	/* 1-n mapping */
				1571	PyErr_SetString(PyExc_NotImplementedError,
				1572	"1-n mappings are currently not implemented");
				1573	Py_DECREF(x);
				1574	goto onError;
				1575	}
				1576	p++ = PyUnicode_AS_UNICODE(x);
				1577	}
				1578	else {
				1579	/* wrong return value */
				1580	PyErr_SetString(PyExc_TypeError,
				1581	"character mapping must return integer, None or unicode");
				1582	Py_DECREF(x);
				1583	goto onError;
				1584	}
				1585	Py_DECREF(x);
				1586	}
				1587	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1588	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1589	goto onError;
				1590	return (PyObject *)v;
				1591
				1592	onError:
				1593	Py_XDECREF(v);
				1594	return NULL;
				1595	}
				1596
				1597	static
				1598	int charmap_encoding_error(const Py_UNICODE **source,
				1599	char **dest,
				1600	const char *errors,
				1601	const char *details)
				1602	{
				1603	if ((errors == NULL) \|\|
				1604	(strcmp(errors,"strict") == 0)) {
				1605	PyErr_Format(PyExc_UnicodeError,
				1606	"charmap encoding error: %s",
				1607	details);
				1608	return -1;
				1609	}
				1610	else if (strcmp(errors,"ignore") == 0) {
				1611	return 0;
				1612	}
				1613	else if (strcmp(errors,"replace") == 0) {
				1614	**dest = '?';
				1615	(*dest)++;
				1616	return 0;
				1617	}
				1618	else {
				1619	PyErr_Format(PyExc_ValueError,
				1620	"charmap encoding error; "
				1621	"unkown error handling code: %s",
				1622	errors);
				1623	return -1;
				1624	}
				1625	}
				1626
				1627	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1628	int size,
				1629	PyObject *mapping,
				1630	const char *errors)
				1631	{
				1632	PyObject *v;
				1633	char *s;
				1634
				1635	/* Default to Latin-1 */
				1636	if (mapping == NULL)
				1637	return PyUnicode_EncodeLatin1(p, size, errors);
				1638
				1639	v = PyString_FromStringAndSize(NULL, size);
				1640	if (v == NULL)
				1641	return NULL;
				1642	s = PyString_AS_STRING(v);
				1643	while (size-- > 0) {
				1644	Py_UNICODE ch = *p++;
				1645	PyObject w, x;
				1646
				1647	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1648	w = PyInt_FromLong((long)ch);
				1649	if (w == NULL)
				1650	goto onError;
				1651	x = PyObject_GetItem(mapping, w);
				1652	Py_DECREF(w);
				1653	if (x == NULL) {
				1654	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1655	/* No mapping found: default to Latin-1 mapping if possible */
				1656	PyErr_Clear();
				1657	if (ch < 256) {
				1658	*s++ = (char)ch;
				1659	continue;
				1660	}
				1661	else if (!charmap_encoding_error(&p, &s, errors,
				1662	"missing character mapping"))
				1663	continue;
				1664	}
				1665	goto onError;
				1666	}
				1667
				1668	/* Apply mapping */
				1669	if (PyInt_Check(x)) {
				1670	int value = PyInt_AS_LONG(x);
				1671	if (value < 0 \|\| value > 255) {
				1672	PyErr_SetString(PyExc_TypeError,
				1673	"character mapping must be in range(256)");
				1674	Py_DECREF(x);
				1675	goto onError;
				1676	}
				1677	*s++ = (char)value;
				1678	}
				1679	else if (x == Py_None) {
				1680	/* undefined mapping */
				1681	if (charmap_encoding_error(&p, &s, errors,
				1682	"character maps to <undefined>")) {
				1683	Py_DECREF(x);
				1684	goto onError;
				1685	}
				1686	}
				1687	else if (PyString_Check(x)) {
				1688	if (PyString_GET_SIZE(x) != 1) {
				1689	/* 1-n mapping */
				1690	PyErr_SetString(PyExc_NotImplementedError,
				1691	"1-n mappings are currently not implemented");
				1692	Py_DECREF(x);
				1693	goto onError;
				1694	}
				1695	s++ = PyString_AS_STRING(x);
				1696	}
				1697	else {
				1698	/* wrong return value */
				1699	PyErr_SetString(PyExc_TypeError,
				1700	"character mapping must return integer, None or unicode");
				1701	Py_DECREF(x);
				1702	goto onError;
				1703	}
				1704	Py_DECREF(x);
				1705	}
				1706	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1707	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1708	goto onError;
				1709	return v;
				1710
				1711	onError:
				1712	Py_DECREF(v);
				1713	return NULL;
				1714	}
				1715
				1716	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1717	PyObject *mapping)
				1718	{
				1719	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1720	PyErr_BadArgument();
				1721	return NULL;
				1722	}
				1723	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1724	PyUnicode_GET_SIZE(unicode),
				1725	mapping,
				1726	NULL);
				1727	}
				1728
				1729	static
				1730	int translate_error(const Py_UNICODE **source,
				1731	Py_UNICODE **dest,
				1732	const char *errors,
				1733	const char *details)
				1734	{
				1735	if ((errors == NULL) \|\|
				1736	(strcmp(errors,"strict") == 0)) {
				1737	PyErr_Format(PyExc_UnicodeError,
				1738	"translate error: %s",
				1739	details);
				1740	return -1;
				1741	}
				1742	else if (strcmp(errors,"ignore") == 0) {
				1743	return 0;
				1744	}
				1745	else if (strcmp(errors,"replace") == 0) {
				1746	**dest = '?';
				1747	(*dest)++;
				1748	return 0;
				1749	}
				1750	else {
				1751	PyErr_Format(PyExc_ValueError,
				1752	"translate error; "
				1753	"unkown error handling code: %s",
				1754	errors);
				1755	return -1;
				1756	}
				1757	}
				1758
				1759	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1760	int size,
				1761	PyObject *mapping,
				1762	const char *errors)
				1763	{
				1764	PyUnicodeObject *v;
				1765	Py_UNICODE *p;
				1766
				1767	if (mapping == NULL) {
				1768	PyErr_BadArgument();
				1769	return NULL;
				1770	}
				1771
				1772	/* Output will never be longer than input */
				1773	v = _PyUnicode_New(size);
				1774	if (v == NULL)
				1775	goto onError;
				1776	if (size == 0)
				1777	goto done;
				1778	p = PyUnicode_AS_UNICODE(v);
				1779	while (size-- > 0) {
				1780	Py_UNICODE ch = *s++;
				1781	PyObject w, x;
				1782
				1783	/* Get mapping */
				1784	w = PyInt_FromLong(ch);
				1785	if (w == NULL)
				1786	goto onError;
				1787	x = PyObject_GetItem(mapping, w);
				1788	Py_DECREF(w);
				1789	if (x == NULL) {
				1790	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1791	/* No mapping found: default to 1-1 mapping */
				1792	PyErr_Clear();
				1793	*p++ = ch;
				1794	continue;
				1795	}
				1796	goto onError;
				1797	}
				1798
				1799	/* Apply mapping */
				1800	if (PyInt_Check(x))
				1801	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1802	else if (x == Py_None) {
				1803	/* undefined mapping */
				1804	if (translate_error(&s, &p, errors,
				1805	"character maps to <undefined>")) {
				1806	Py_DECREF(x);
				1807	goto onError;
				1808	}
				1809	}
				1810	else if (PyUnicode_Check(x)) {
				1811	if (PyUnicode_GET_SIZE(x) != 1) {
				1812	/* 1-n mapping */
				1813	PyErr_SetString(PyExc_NotImplementedError,
				1814	"1-n mappings are currently not implemented");
				1815	Py_DECREF(x);
				1816	goto onError;
				1817	}
				1818	p++ = PyUnicode_AS_UNICODE(x);
				1819	}
				1820	else {
				1821	/* wrong return value */
				1822	PyErr_SetString(PyExc_TypeError,
				1823	"translate mapping must return integer, None or unicode");
				1824	Py_DECREF(x);
				1825	goto onError;
				1826	}
				1827	Py_DECREF(x);
				1828	}
				1829	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1830	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1831
				1832	done:
				1833	return (PyObject *)v;
				1834
				1835	onError:
				1836	Py_XDECREF(v);
				1837	return NULL;
				1838	}
				1839
				1840	PyObject PyUnicode_Translate(PyObject str,
				1841	PyObject *mapping,
				1842	const char *errors)
				1843	{
				1844	PyObject *result;
				1845
				1846	str = PyUnicode_FromObject(str);
				1847	if (str == NULL)
				1848	goto onError;
				1849	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				1850	PyUnicode_GET_SIZE(str),
				1851	mapping,
				1852	errors);
				1853	Py_DECREF(str);
				1854	return result;
				1855
				1856	onError:
				1857	Py_XDECREF(str);
				1858	return NULL;
				1859	}
				1860
				1861	/* --- Helpers ------------------------------------------------------------ */
				1862
				1863	static
				1864	int count(PyUnicodeObject *self,
				1865	int start,
				1866	int end,
				1867	PyUnicodeObject *substring)
				1868	{
				1869	int count = 0;
				1870
				1871	end -= substring->length;
				1872
				1873	while (start <= end)
				1874	if (Py_UNICODE_MATCH(self, start, substring)) {
				1875	count++;
				1876	start += substring->length;
				1877	} else
				1878	start++;
				1879
				1880	return count;
				1881	}
				1882
				1883	int PyUnicode_Count(PyObject *str,
				1884	PyObject *substr,
				1885	int start,
				1886	int end)
				1887	{
				1888	int result;
				1889
				1890	str = PyUnicode_FromObject(str);
				1891	if (str == NULL)
				1892	return -1;
				1893	substr = PyUnicode_FromObject(substr);
				1894	if (substr == NULL) {
				1895	Py_DECREF(substr);
				1896	return -1;
				1897	}
				1898
				1899	result = count((PyUnicodeObject *)str,
				1900	start, end,
				1901	(PyUnicodeObject *)substr);
				1902
				1903	Py_DECREF(str);
				1904	Py_DECREF(substr);
				1905	return result;
				1906	}
				1907
				1908	static
				1909	int findstring(PyUnicodeObject *self,
				1910	PyUnicodeObject *substring,
				1911	int start,
				1912	int end,
				1913	int direction)
				1914	{
				1915	if (start < 0)
				1916	start += self->length;
				1917	if (start < 0)
				1918	start = 0;
				1919
				1920	if (substring->length == 0)
				1921	return start;
				1922
				1923	if (end > self->length)
				1924	end = self->length;
				1925	if (end < 0)
				1926	end += self->length;
				1927	if (end < 0)
				1928	end = 0;
				1929
				1930	end -= substring->length;
				1931
				1932	if (direction < 0) {
				1933	for (; end >= start; end--)
				1934	if (Py_UNICODE_MATCH(self, end, substring))
				1935	return end;
				1936	} else {
				1937	for (; start <= end; start++)
				1938	if (Py_UNICODE_MATCH(self, start, substring))
				1939	return start;
				1940	}
				1941
				1942	return -1;
				1943	}
				1944
				1945	int PyUnicode_Find(PyObject *str,
				1946	PyObject *substr,
				1947	int start,
				1948	int end,
				1949	int direction)
				1950	{
				1951	int result;
				1952
				1953	str = PyUnicode_FromObject(str);
				1954	if (str == NULL)
				1955	return -1;
				1956	substr = PyUnicode_FromObject(substr);
				1957	if (substr == NULL) {
				1958	Py_DECREF(substr);
				1959	return -1;
				1960	}
				1961
				1962	result = findstring((PyUnicodeObject *)str,
				1963	(PyUnicodeObject *)substr,
				1964	start, end, direction);
				1965	Py_DECREF(str);
				1966	Py_DECREF(substr);
				1967	return result;
				1968	}
				1969
				1970	static
				1971	int tailmatch(PyUnicodeObject *self,
				1972	PyUnicodeObject *substring,
				1973	int start,
				1974	int end,
				1975	int direction)
				1976	{
				1977	if (start < 0)
				1978	start += self->length;
				1979	if (start < 0)
				1980	start = 0;
				1981
				1982	if (substring->length == 0)
				1983	return 1;
				1984
				1985	if (end > self->length)
				1986	end = self->length;
				1987	if (end < 0)
				1988	end += self->length;
				1989	if (end < 0)
				1990	end = 0;
				1991
				1992	end -= substring->length;
				1993	if (end < start)
				1994	return 0;
				1995
				1996	if (direction > 0) {
				1997	if (Py_UNICODE_MATCH(self, end, substring))
				1998	return 1;
				1999	} else {
				2000	if (Py_UNICODE_MATCH(self, start, substring))
				2001	return 1;
				2002	}
				2003
				2004	return 0;
				2005	}
				2006
				2007	int PyUnicode_Tailmatch(PyObject *str,
				2008	PyObject *substr,
				2009	int start,
				2010	int end,
				2011	int direction)
				2012	{
				2013	int result;
				2014
				2015	str = PyUnicode_FromObject(str);
				2016	if (str == NULL)
				2017	return -1;
				2018	substr = PyUnicode_FromObject(substr);
				2019	if (substr == NULL) {
				2020	Py_DECREF(substr);
				2021	return -1;
				2022	}
				2023
				2024	result = tailmatch((PyUnicodeObject *)str,
				2025	(PyUnicodeObject *)substr,
				2026	start, end, direction);
				2027	Py_DECREF(str);
				2028	Py_DECREF(substr);
				2029	return result;
				2030	}
				2031
				2032	static
				2033	const Py_UNICODE findchar(const Py_UNICODE s,
				2034	int size,
				2035	Py_UNICODE ch)
				2036	{
				2037	/* like wcschr, but doesn't stop at NULL characters */
				2038
				2039	while (size-- > 0) {
				2040	if (*s == ch)
				2041	return s;
				2042	s++;
				2043	}
				2044
				2045	return NULL;
				2046	}
				2047
				2048	/* Apply fixfct filter to the Unicode object self and return a
				2049	reference to the modified object */
				2050
				2051	static
				2052	PyObject fixup(PyUnicodeObject self,
				2053	int (fixfct)(PyUnicodeObject s))
				2054	{
				2055
				2056	PyUnicodeObject *u;
				2057
				2058	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2059	self->length);
				2060	if (u == NULL)
				2061	return NULL;
				2062	if (!fixfct(u)) {
				2063	/* fixfct should return TRUE if it modified the buffer. If
				2064	FALSE, return a reference to the original buffer instead
				2065	(to save space, not time) */
				2066	Py_INCREF(self);
				2067	Py_DECREF(u);
				2068	return (PyObject*) self;
				2069	}
				2070	return (PyObject*) u;
				2071	}
				2072
				2073	static
				2074	int fixupper(PyUnicodeObject *self)
				2075	{
				2076	int len = self->length;
				2077	Py_UNICODE *s = self->str;
				2078	int status = 0;
				2079
				2080	while (len-- > 0) {
				2081	register Py_UNICODE ch;
				2082
				2083	ch = Py_UNICODE_TOUPPER(*s);
				2084	if (ch != *s) {
				2085	status = 1;
				2086	*s = ch;
				2087	}
				2088	s++;
				2089	}
				2090
				2091	return status;
				2092	}
				2093
				2094	static
				2095	int fixlower(PyUnicodeObject *self)
				2096	{
				2097	int len = self->length;
				2098	Py_UNICODE *s = self->str;
				2099	int status = 0;
				2100
				2101	while (len-- > 0) {
				2102	register Py_UNICODE ch;
				2103
				2104	ch = Py_UNICODE_TOLOWER(*s);
				2105	if (ch != *s) {
				2106	status = 1;
				2107	*s = ch;
				2108	}
				2109	s++;
				2110	}
				2111
				2112	return status;
				2113	}
				2114
				2115	static
				2116	int fixswapcase(PyUnicodeObject *self)
				2117	{
				2118	int len = self->length;
				2119	Py_UNICODE *s = self->str;
				2120	int status = 0;
				2121
				2122	while (len-- > 0) {
				2123	if (Py_UNICODE_ISUPPER(*s)) {
				2124	s = Py_UNICODE_TOLOWER(s);
				2125	status = 1;
				2126	} else if (Py_UNICODE_ISLOWER(*s)) {
				2127	s = Py_UNICODE_TOUPPER(s);
				2128	status = 1;
				2129	}
				2130	s++;
				2131	}
				2132
				2133	return status;
				2134	}
				2135
				2136	static
				2137	int fixcapitalize(PyUnicodeObject *self)
				2138	{
				2139	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2140	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2141	return 1;
				2142	}
				2143	return 0;
				2144	}
				2145
				2146	static
				2147	int fixtitle(PyUnicodeObject *self)
				2148	{
				2149	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2150	register Py_UNICODE *e;
				2151	int previous_is_cased;
				2152
				2153	/* Shortcut for single character strings */
				2154	if (PyUnicode_GET_SIZE(self) == 1) {
				2155	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2156	if (*p != ch) {
				2157	*p = ch;
				2158	return 1;
				2159	}
				2160	else
				2161	return 0;
				2162	}
				2163
				2164	e = p + PyUnicode_GET_SIZE(self);
				2165	previous_is_cased = 0;
				2166	for (; p < e; p++) {
				2167	register const Py_UNICODE ch = *p;
				2168
				2169	if (previous_is_cased)
				2170	*p = Py_UNICODE_TOLOWER(ch);
				2171	else
				2172	*p = Py_UNICODE_TOTITLE(ch);
				2173
				2174	if (Py_UNICODE_ISLOWER(ch) \|\|
				2175	Py_UNICODE_ISUPPER(ch) \|\|
				2176	Py_UNICODE_ISTITLE(ch))
				2177	previous_is_cased = 1;
				2178	else
				2179	previous_is_cased = 0;
				2180	}
				2181	return 1;
				2182	}
				2183
				2184	PyObject PyUnicode_Join(PyObject separator,
				2185	PyObject *seq)
				2186	{
				2187	Py_UNICODE *sep;
				2188	int seplen;
				2189	PyUnicodeObject *res = NULL;
				2190	int reslen = 0;
				2191	Py_UNICODE *p;
				2192	int seqlen = 0;
				2193	int sz = 100;
				2194	int i;
				2195
				2196	seqlen = PySequence_Length(seq);
				2197	if (seqlen < 0 && PyErr_Occurred())
				2198	return NULL;
				2199
				2200	if (separator == NULL) {
				2201	Py_UNICODE blank = ' ';
				2202	sep = &blank;
				2203	seplen = 1;
				2204	}
				2205	else {
				2206	separator = PyUnicode_FromObject(separator);
				2207	if (separator == NULL)
				2208	return NULL;
				2209	sep = PyUnicode_AS_UNICODE(separator);
				2210	seplen = PyUnicode_GET_SIZE(separator);
				2211	}
				2212
				2213	res = _PyUnicode_New(sz);
				2214	if (res == NULL)
				2215	goto onError;
				2216	p = PyUnicode_AS_UNICODE(res);
				2217	reslen = 0;
				2218
				2219	for (i = 0; i < seqlen; i++) {
				2220	int itemlen;
				2221	PyObject *item;
				2222
				2223	item = PySequence_GetItem(seq, i);
				2224	if (item == NULL)
				2225	goto onError;
				2226	if (!PyUnicode_Check(item)) {
				2227	PyObject *v;
				2228	v = PyUnicode_FromObject(item);
				2229	Py_DECREF(item);
				2230	item = v;
				2231	if (item == NULL)
				2232	goto onError;
				2233	}
				2234	itemlen = PyUnicode_GET_SIZE(item);
				2235	while (reslen + itemlen + seplen >= sz) {
				2236	if (_PyUnicode_Resize(res, sz*2))
				2237	goto onError;
				2238	sz *= 2;
				2239	p = PyUnicode_AS_UNICODE(res) + reslen;
				2240	}
				2241	if (i > 0) {
				2242	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2243	p += seplen;
				2244	reslen += seplen;
				2245	}
				2246	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2247	p += itemlen;
				2248	reslen += itemlen;
				2249	Py_DECREF(item);
				2250	}
				2251	if (_PyUnicode_Resize(res, reslen))
				2252	goto onError;
				2253
				2254	Py_XDECREF(separator);
				2255	return (PyObject *)res;
				2256
				2257	onError:
				2258	Py_XDECREF(separator);
				2259	Py_DECREF(res);
				2260	return NULL;
				2261	}
				2262
				2263	static
				2264	PyUnicodeObject pad(PyUnicodeObject self,
				2265	int left,
				2266	int right,
				2267	Py_UNICODE fill)
				2268	{
				2269	PyUnicodeObject *u;
				2270
				2271	if (left < 0)
				2272	left = 0;
				2273	if (right < 0)
				2274	right = 0;
				2275
				2276	if (left == 0 && right == 0) {
				2277	Py_INCREF(self);
				2278	return self;
				2279	}
				2280
				2281	u = _PyUnicode_New(left + self->length + right);
				2282	if (u) {
				2283	if (left)
				2284	Py_UNICODE_FILL(u->str, fill, left);
				2285	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2286	if (right)
				2287	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2288	}
				2289
				2290	return u;
				2291	}
				2292
				/* Append the slice data[left:right] to `list` as a new Unicode object.
				   Expands in a scope that provides the variables `str` and `list` and an
				   `onError` label, which is jumped to on failure.  Wrapped in
				   do { } while (0) so it behaves as a single statement. */
				#define SPLIT_APPEND(data, left, right)                                 \
				    do {                                                                \
				        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
				        if (!str)                                                       \
				            goto onError;                                               \
				        if (PyList_Append(list, str)) {                                 \
				            Py_DECREF(str);                                             \
				            goto onError;                                               \
				        }                                                               \
				        Py_DECREF(str);                                                 \
				    } while (0)
				2303
				2304	static
				2305	PyObject split_whitespace(PyUnicodeObject self,
				2306	PyObject *list,
				2307	int maxcount)
				2308	{
				2309	register int i;
				2310	register int j;
				2311	int len = self->length;
				2312	PyObject *str;
				2313
				2314	for (i = j = 0; i < len; ) {
				2315	/* find a token */
				2316	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2317	i++;
				2318	j = i;
				2319	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2320	i++;
				2321	if (j < i) {
				2322	if (maxcount-- <= 0)
				2323	break;
				2324	SPLIT_APPEND(self->str, j, i);
				2325	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2326	i++;
				2327	j = i;
				2328	}
				2329	}
				2330	if (j < len) {
				2331	SPLIT_APPEND(self->str, j, len);
				2332	}
				2333	return list;
				2334
				2335	onError:
				2336	Py_DECREF(list);
				2337	return NULL;
				2338	}
				2339
				2340	PyObject PyUnicode_Splitlines(PyObject string,
				2341	int maxcount)
				2342	{
				2343	register int i;
				2344	register int j;
				2345	int len;
				2346	PyObject *list;
				2347	PyObject *str;
				2348	Py_UNICODE *data;
				2349
				2350	string = PyUnicode_FromObject(string);
				2351	if (string == NULL)
				2352	return NULL;
				2353	data = PyUnicode_AS_UNICODE(string);
				2354	len = PyUnicode_GET_SIZE(string);
				2355
				2356	if (maxcount < 0)
				2357	maxcount = INT_MAX;
				2358
				2359	list = PyList_New(0);
				2360	if (!list)
				2361	goto onError;
				2362
				2363	for (i = j = 0; i < len; ) {
				2364	/* Find a line and append it */
				2365	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2366	i++;
				2367	if (maxcount-- <= 0)
				2368	break;
				2369	SPLIT_APPEND(data, j, i);
				2370
				2371	/* Skip the line break reading CRLF as one line break */
				2372	if (i < len) {
				2373	if (data[i] == '\r' && i + 1 < len &&
				2374	data[i+1] == '\n')
				2375	i += 2;
				2376	else
				2377	i++;
				2378	}
				2379	j = i;
				2380	}
				2381	if (j < len) {
				2382	SPLIT_APPEND(data, j, len);
				2383	}
				2384
				2385	Py_DECREF(string);
				2386	return list;
				2387
				2388	onError:
				2389	Py_DECREF(list);
				2390	Py_DECREF(string);
				2391	return NULL;
				2392	}
				2393
				2394	static
				2395	PyObject split_char(PyUnicodeObject self,
				2396	PyObject *list,
				2397	Py_UNICODE ch,
				2398	int maxcount)
				2399	{
				2400	register int i;
				2401	register int j;
				2402	int len = self->length;
				2403	PyObject *str;
				2404
				2405	for (i = j = 0; i < len; ) {
				2406	if (self->str[i] == ch) {
				2407	if (maxcount-- <= 0)
				2408	break;
				2409	SPLIT_APPEND(self->str, j, i);
				2410	i = j = i + 1;
				2411	} else
				2412	i++;
				2413	}
				2414	if (j <= len) {
				2415	SPLIT_APPEND(self->str, j, len);
				2416	}
				2417	return list;
				2418
				2419	onError:
				2420	Py_DECREF(list);
				2421	return NULL;
				2422	}
				2423
				2424	static
				2425	PyObject split_substring(PyUnicodeObject self,
				2426	PyObject *list,
				2427	PyUnicodeObject *substring,
				2428	int maxcount)
				2429	{
				2430	register int i;
				2431	register int j;
				2432	int len = self->length;
				2433	int sublen = substring->length;
				2434	PyObject *str;
				2435
				2436	for (i = j = 0; i < len - sublen; ) {
				2437	if (Py_UNICODE_MATCH(self, i, substring)) {
				2438	if (maxcount-- <= 0)
				2439	break;
				2440	SPLIT_APPEND(self->str, j, i);
				2441	i = j = i + sublen;
				2442	} else
				2443	i++;
				2444	}
				2445	if (j <= len) {
				2446	SPLIT_APPEND(self->str, j, len);
				2447	}
				2448	return list;
				2449
				2450	onError:
				2451	Py_DECREF(list);
				2452	return NULL;
				2453	}
				2454
				2455	#undef SPLIT_APPEND
				2456
				2457	static
				2458	PyObject split(PyUnicodeObject self,
				2459	PyUnicodeObject *substring,
				2460	int maxcount)
				2461	{
				2462	PyObject *list;
				2463
				2464	if (maxcount < 0)
				2465	maxcount = INT_MAX;
				2466
				2467	list = PyList_New(0);
				2468	if (!list)
				2469	return NULL;
				2470
				2471	if (substring == NULL)
				2472	return split_whitespace(self,list,maxcount);
				2473
				2474	else if (substring->length == 1)
				2475	return split_char(self,list,substring->str[0],maxcount);
				2476
				2477	else if (substring->length == 0) {
				2478	Py_DECREF(list);
				2479	PyErr_SetString(PyExc_ValueError, "empty separator");
				2480	return NULL;
				2481	}
				2482	else
				2483	return split_substring(self,list,substring,maxcount);
				2484	}
				2485
				2486	static
				2487	PyObject strip(PyUnicodeObject self,
				2488	int left,
				2489	int right)
				2490	{
				2491	Py_UNICODE *p = self->str;
				2492	int start = 0;
				2493	int end = self->length;
				2494
				2495	if (left)
				2496	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2497	start++;
				2498
				2499	if (right)
				2500	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2501	end--;
				2502
				2503	if (start == 0 && end == self->length) {
				2504	/* couldn't strip anything off, return original string */
				2505	Py_INCREF(self);
				2506	return (PyObject*) self;
				2507	}
				2508
				2509	return (PyObject*) PyUnicode_FromUnicode(
				2510	self->str + start,
				2511	end - start
				2512	);
				2513	}
				2514
				2515	static
				2516	PyObject replace(PyUnicodeObject self,
				2517	PyUnicodeObject *str1,
				2518	PyUnicodeObject *str2,
				2519	int maxcount)
				2520	{
				2521	PyUnicodeObject *u;
				2522
				2523	if (maxcount < 0)
				2524	maxcount = INT_MAX;
				2525
				2526	if (str1->length == 1 && str2->length == 1) {
				2527	int i;
				2528
				2529	/* replace characters */
				2530	if (!findchar(self->str, self->length, str1->str[0])) {
				2531	/* nothing to replace, return original string */
				2532	Py_INCREF(self);
				2533	u = self;
				2534	} else {
				2535	Py_UNICODE u1 = str1->str[0];
				2536	Py_UNICODE u2 = str2->str[0];
				2537
				2538	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2539	self->str,
				2540	self->length
				2541	);
				2542	if (u)
				2543	for (i = 0; i < u->length; i++)
				2544	if (u->str[i] == u1) {
				2545	if (--maxcount < 0)
				2546	break;
				2547	u->str[i] = u2;
				2548	}
				2549	}
				2550
				2551	} else {
				2552	int n, i;
				2553	Py_UNICODE *p;
				2554
				2555	/* replace strings */
				2556	n = count(self, 0, self->length, str1);
				2557	if (n > maxcount)
				2558	n = maxcount;
				2559	if (n == 0) {
				2560	/* nothing to replace, return original string */
				2561	Py_INCREF(self);
				2562	u = self;
				2563	} else {
				2564	u = _PyUnicode_New(
				2565	self->length + n * (str2->length - str1->length));
				2566	if (u) {
				2567	i = 0;
				2568	p = u->str;
				2569	while (i <= self->length - str1->length)
				2570	if (Py_UNICODE_MATCH(self, i, str1)) {
				2571	/* replace string segment */
				2572	Py_UNICODE_COPY(p, str2->str, str2->length);
				2573	p += str2->length;
				2574	i += str1->length;
				2575	if (--n <= 0) {
				2576	/* copy remaining part */
				2577	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2578	break;
				2579	}
				2580	} else
				2581	*p++ = self->str[i++];
				2582	}
				2583	}
				2584	}
				2585
				2586	return (PyObject *) u;
				2587	}
				2588
				2589	/* --- Unicode Object Methods --------------------------------------------- */
				2590
				2591	static char title__doc__[] =
				2592	"S.title() -> unicode\n\
				2593	\n\
				2594	Return a titlecased version of S, i.e. words start with title case\n\
				2595	characters, all remaining cased characters have lower case.";
				2596
				2597	static PyObject*
				2598	unicode_title(PyUnicodeObject self, PyObject args)
				2599	{
				2600	if (!PyArg_NoArgs(args))
				2601	return NULL;
				2602	return fixup(self, fixtitle);
				2603	}
				2604
				2605	static char capitalize__doc__[] =
				2606	"S.capitalize() -> unicode\n\
				2607	\n\
				2608	Return a capitalized version of S, i.e. make the first character\n\
				2609	have upper case.";
				2610
				2611	static PyObject*
				2612	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2613	{
				2614	if (!PyArg_NoArgs(args))
				2615	return NULL;
				2616	return fixup(self, fixcapitalize);
				2617	}
				2618
				#if 0
				static char capwords__doc__[] =
				"S.capwords() -> unicode\n"
				"\n"
				"Apply .capitalize() to all words in S and return the result with\n"
				"normalized whitespace (all whitespace strings are replaced by ' ').";

				/* Disabled (compiled out by #if 0).  Splits S on whitespace,
				   capitalizes every word via fixup()/fixcapitalize and joins the words
				   again with PyUnicode_Join(NULL, list). */
				static PyObject*
				unicode_capwords(PyUnicodeObject *self, PyObject *args)
				{
				    PyObject *list;
				    PyObject *item;
				    int i;

				    if (!PyArg_NoArgs(args))
				        return NULL;

				    /* Split into words */
				    list = split(self, NULL, -1);
				    if (!list)
				        return NULL;

				    /* Capitalize each word */
				    for (i = 0; i < PyList_GET_SIZE(list); i++) {
				        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				                     fixcapitalize);
				        if (item == NULL)
				            goto onError;
				        /* PyList_SET_ITEM does not release the old entry itself */
				        Py_DECREF(PyList_GET_ITEM(list, i));
				        PyList_SET_ITEM(list, i, item);
				    }

				    /* Join the words to form a new string */
				    item = PyUnicode_Join(NULL, list);

				 onError:
				    /* item is NULL here when fixup() or the join failed */
				    Py_DECREF(list);
				    return (PyObject *)item;
				}
				#endif
				2659
				2660	static char center__doc__[] =
				2661	"S.center(width) -> unicode\n\
				2662	\n\
				2663	Return S centered in a Unicode string of length width. Padding is done\n\
				2664	using spaces.";
				2665
				2666	static PyObject *
				2667	unicode_center(PyUnicodeObject self, PyObject args)
				2668	{
				2669	int marg, left;
				2670	int width;
				2671
				2672	if (!PyArg_ParseTuple(args, "i:center", &width))
				2673	return NULL;
				2674
				2675	if (self->length >= width) {
				2676	Py_INCREF(self);
				2677	return (PyObject*) self;
				2678	}
				2679
				2680	marg = width - self->length;
				2681	left = marg / 2 + (marg & width & 1);
				2682
				2683	return (PyObject*) pad(self, left, marg - left, ' ');
				2684	}
				2685
				2686	static int
				2687	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2688	{
				2689	int len1, len2;
				2690	Py_UNICODE *s1 = str1->str;
				2691	Py_UNICODE *s2 = str2->str;
				2692
				2693	len1 = str1->length;
				2694	len2 = str2->length;
				2695
				2696	while (len1 > 0 && len2 > 0) {
				2697	int cmp = (s1++) - (s2++);
				2698	if (cmp)
				2699	/* This should make Christian happy! */
				2700	return (cmp < 0) ? -1 : (cmp != 0);
				2701	len1--, len2--;
				2702	}
				2703
				2704	return (len1 < len2) ? -1 : (len1 != len2);
				2705	}
				2706
				2707	int PyUnicode_Compare(PyObject *left,
				2708	PyObject *right)
				2709	{
				2710	PyUnicodeObject u = NULL, v = NULL;
				2711	int result;
				2712
				2713	/* Coerce the two arguments */
				2714	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2715	if (u == NULL)
				2716	goto onError;
				2717	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2718	if (v == NULL)
				2719	goto onError;
				2720
				2721	/* Shortcut for emtpy or interned objects */
				2722	if (v == u) {
				2723	Py_DECREF(u);
				2724	Py_DECREF(v);
				2725	return 0;
				2726	}
				2727
				2728	result = unicode_compare(u, v);
				2729
				2730	Py_DECREF(u);
				2731	Py_DECREF(v);
				2732	return result;
				2733
				2734	onError:
				2735	Py_XDECREF(u);
				2736	Py_XDECREF(v);
				2737	return -1;
				2738	}
				2739
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2740	int PyUnicode_Contains(PyObject *container,
				2741	PyObject *element)
				2742	{
				2743	PyUnicodeObject u = NULL, v = NULL;
				2744	int result;
				2745	register const Py_UNICODE p, e;
				2746	register Py_UNICODE ch;
				2747
				2748	/* Coerce the two arguments */
				2749	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2750	if (u == NULL)
				2751	goto onError;
				2752	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2753	if (v == NULL)
				2754	goto onError;
				2755
				2756	/* Check v in u */
				2757	if (PyUnicode_GET_SIZE(v) != 1) {
				2758	PyErr_SetString(PyExc_TypeError,
				2759	"string member test needs char left operand");
				2760	goto onError;
				2761	}
				2762	ch = *PyUnicode_AS_UNICODE(v);
				2763	p = PyUnicode_AS_UNICODE(u);
				2764	e = p + PyUnicode_GET_SIZE(u);
				2765	result = 0;
				2766	while (p < e) {
				2767	if (*p++ == ch) {
				2768	result = 1;
				2769	break;
				2770	}
				2771	}
				2772
				2773	Py_DECREF(u);
				2774	Py_DECREF(v);
				2775	return result;
				2776
				2777	onError:
				2778	Py_XDECREF(u);
				2779	Py_XDECREF(v);
				2780	return -1;
				2781	}
				2782
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2783	/* Concat to string or Unicode object giving a new Unicode object. */
				2784
				2785	PyObject PyUnicode_Concat(PyObject left,
				2786	PyObject *right)
				2787	{
				2788	PyUnicodeObject u = NULL, v = NULL, *w;
				2789
				2790	/* Coerce the two arguments */
				2791	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2792	if (u == NULL)
				2793	goto onError;
				2794	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2795	if (v == NULL)
				2796	goto onError;
				2797
				2798	/* Shortcuts */
				2799	if (v == unicode_empty) {
				2800	Py_DECREF(v);
				2801	return (PyObject *)u;
				2802	}
				2803	if (u == unicode_empty) {
				2804	Py_DECREF(u);
				2805	return (PyObject *)v;
				2806	}
				2807
				2808	/* Concat the two Unicode strings */
				2809	w = _PyUnicode_New(u->length + v->length);
				2810	if (w == NULL)
				2811	goto onError;
				2812	Py_UNICODE_COPY(w->str, u->str, u->length);
				2813	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				2814
				2815	Py_DECREF(u);
				2816	Py_DECREF(v);
				2817	return (PyObject *)w;
				2818
				2819	onError:
				2820	Py_XDECREF(u);
				2821	Py_XDECREF(v);
				2822	return NULL;
				2823	}
				2824
				2825	static char count__doc__[] =
				2826	"S.count(sub[, start[, end]]) -> int\n\
				2827	\n\
				2828	Return the number of occurrences of substring sub in Unicode string\n\
				2829	S[start:end]. Optional arguments start and end are\n\
				2830	interpreted as in slice notation.";
				2831
				2832	static PyObject *
				2833	unicode_count(PyUnicodeObject self, PyObject args)
				2834	{
				2835	PyUnicodeObject *substring;
				2836	int start = 0;
				2837	int end = INT_MAX;
				2838	PyObject *result;
				2839
				2840	if (!PyArg_ParseTuple(args, "O\|ii:count", &substring, &start, &end))
				2841	return NULL;
				2842
				2843	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				2844	(PyObject *)substring);
				2845	if (substring == NULL)
				2846	return NULL;
				2847
				2848	if (substring->length == 0) {
				2849	Py_DECREF(substring);
				2850	return PyInt_FromLong((long) 0);
				2851	}
				2852
				2853	if (start < 0)
				2854	start += self->length;
				2855	if (start < 0)
				2856	start = 0;
				2857	if (end > self->length)
				2858	end = self->length;
				2859	if (end < 0)
				2860	end += self->length;
				2861	if (end < 0)
				2862	end = 0;
				2863
				2864	result = PyInt_FromLong((long) count(self, start, end, substring));
				2865
				2866	Py_DECREF(substring);
				2867	return result;
				2868	}
				2869
				2870	static char encode__doc__[] =
				2871	"S.encode([encoding[,errors]]) -> string\n\
				2872	\n\
				2873	Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
				2874	errors may be given to set a different error handling scheme. Default\n\
				2875	is 'strict' meaning that encoding errors raise a ValueError. Other\n\
				2876	possible values are 'ignore' and 'replace'.";
				2877
				2878	static PyObject *
				2879	unicode_encode(PyUnicodeObject self, PyObject args)
				2880	{
				2881	char *encoding = NULL;
				2882	char *errors = NULL;
				2883	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				2884	return NULL;
				2885	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				2886	}
				2887
				2888	static char expandtabs__doc__[] =
				2889	"S.expandtabs([tabsize]) -> unicode\n\
				2890	\n\
				2891	Return a copy of S where all tab characters are expanded using spaces.\n\
				2892	If tabsize is not given, a tab size of 8 characters is assumed.";
				2893
				2894	static PyObject*
				2895	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				2896	{
				2897	Py_UNICODE *e;
				2898	Py_UNICODE *p;
				2899	Py_UNICODE *q;
				2900	int i, j;
				2901	PyUnicodeObject *u;
				2902	int tabsize = 8;
				2903
				2904	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				2905	return NULL;
				2906
				2907	/* First pass: determine size of ouput string */
				2908	i = j = 0;
				2909	e = self->str + self->length;
				2910	for (p = self->str; p < e; p++)
				2911	if (*p == '\t') {
				2912	if (tabsize > 0)
				2913	j += tabsize - (j % tabsize);
				2914	}
				2915	else {
				2916	j++;
				2917	if (p == '\n' \|\| p == '\r') {
				2918	i += j;
				2919	j = 0;
				2920	}
				2921	}
				2922
				2923	/* Second pass: create output string and fill it */
				2924	u = _PyUnicode_New(i + j);
				2925	if (!u)
				2926	return NULL;
				2927
				2928	j = 0;
				2929	q = u->str;
				2930
				2931	for (p = self->str; p < e; p++)
				2932	if (*p == '\t') {
				2933	if (tabsize > 0) {
				2934	i = tabsize - (j % tabsize);
				2935	j += i;
				2936	while (i--)
				2937	*q++ = ' ';
				2938	}
				2939	}
				2940	else {
				2941	j++;
				2942	q++ = p;
				2943	if (p == '\n' \|\| p == '\r')
				2944	j = 0;
				2945	}
				2946
				2947	return (PyObject*) u;
				2948	}
				2949
				2950	static char find__doc__[] =
				2951	"S.find(sub [,start [,end]]) -> int\n\
				2952	\n\
				2953	Return the lowest index in S where substring sub is found,\n\
				2954	such that sub is contained within s[start,end]. Optional\n\
				2955	arguments start and end are interpreted as in slice notation.\n\
				2956	\n\
				2957	Return -1 on failure.";
				2958
				2959	static PyObject *
				2960	unicode_find(PyUnicodeObject self, PyObject args)
				2961	{
				2962	PyUnicodeObject *substring;
				2963	int start = 0;
				2964	int end = INT_MAX;
				2965	PyObject *result;
				2966
				2967	if (!PyArg_ParseTuple(args, "O\|ii:find", &substring, &start, &end))
				2968	return NULL;
				2969	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				2970	(PyObject *)substring);
				2971	if (substring == NULL)
				2972	return NULL;
				2973
				2974	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				2975
				2976	Py_DECREF(substring);
				2977	return result;
				2978	}
				2979
				2980	static PyObject *
				2981	unicode_getitem(PyUnicodeObject *self, int index)
				2982	{
				2983	if (index < 0 \|\| index >= self->length) {
				2984	PyErr_SetString(PyExc_IndexError, "string index out of range");
				2985	return NULL;
				2986	}
				2987
				2988	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				2989	}
				2990
				2991	static long
				2992	unicode_hash(PyUnicodeObject *self)
				2993	{
				2994	long hash;
				2995	PyObject *utf8;
				2996
				2997	/* Since Unicode objects compare equal to their UTF-8 string
				2998	counterparts, they should also use the UTF-8 strings as basis
				2999	for their hash value. This is needed to assure that strings and
				3000	Unicode objects behave in the same way as dictionary
				3001	keys. Unfortunately, this costs some performance and also some
				3002	memory if the cached UTF-8 representation is not used later
				3003	on. */
				3004	if (self->hash != -1)
				3005	return self->hash;
				3006	utf8 = utf8_string(self, NULL);
				3007	if (utf8 == NULL)
				3008	return -1;
				3009	hash = PyObject_Hash(utf8);
				3010	if (hash == -1)
				3011	return -1;
				3012	self->hash = hash;
				3013	return hash;
				3014	}
				3015
				3016	static char index__doc__[] =
				3017	"S.index(sub [,start [,end]]) -> int\n\
				3018	\n\
				3019	Like S.find() but raise ValueError when the substring is not found.";
				3020
				3021	static PyObject *
				3022	unicode_index(PyUnicodeObject self, PyObject args)
				3023	{
				3024	int result;
				3025	PyUnicodeObject *substring;
				3026	int start = 0;
				3027	int end = INT_MAX;
				3028
				3029	if (!PyArg_ParseTuple(args, "O\|ii:index", &substring, &start, &end))
				3030	return NULL;
				3031
				3032	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3033	(PyObject *)substring);
				3034	if (substring == NULL)
				3035	return NULL;
				3036
				3037	result = findstring(self, substring, start, end, 1);
				3038
				3039	Py_DECREF(substring);
				3040	if (result < 0) {
				3041	PyErr_SetString(PyExc_ValueError, "substring not found");
				3042	return NULL;
				3043	}
				3044	return PyInt_FromLong(result);
				3045	}
				3046
				3047	static char islower__doc__[] =
				3048	"S.islower() -> int\n\
				3049	\n\
				3050	Return 1 if all cased characters in S are lowercase and there is\n\
				3051	at least one cased character in S, 0 otherwise.";
				3052
				3053	static PyObject*
				3054	unicode_islower(PyUnicodeObject self, PyObject args)
				3055	{
				3056	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3057	register const Py_UNICODE *e;
				3058	int cased;
				3059
				3060	if (!PyArg_NoArgs(args))
				3061	return NULL;
				3062
				3063	/* Shortcut for single character strings */
				3064	if (PyUnicode_GET_SIZE(self) == 1)
				3065	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3066
				3067	e = p + PyUnicode_GET_SIZE(self);
				3068	cased = 0;
				3069	for (; p < e; p++) {
				3070	register const Py_UNICODE ch = *p;
				3071
				3072	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3073	return PyInt_FromLong(0);
				3074	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3075	cased = 1;
				3076	}
				3077	return PyInt_FromLong(cased);
				3078	}
				3079
				3080	static char isupper__doc__[] =
				3081	"S.isupper() -> int\n\
				3082	\n\
				3083	Return 1 if all cased characters in S are uppercase and there is\n\
				3084	at least one cased character in S, 0 otherwise.";
				3085
				3086	static PyObject*
				3087	unicode_isupper(PyUnicodeObject self, PyObject args)
				3088	{
				3089	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3090	register const Py_UNICODE *e;
				3091	int cased;
				3092
				3093	if (!PyArg_NoArgs(args))
				3094	return NULL;
				3095
				3096	/* Shortcut for single character strings */
				3097	if (PyUnicode_GET_SIZE(self) == 1)
				3098	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3099
				3100	e = p + PyUnicode_GET_SIZE(self);
				3101	cased = 0;
				3102	for (; p < e; p++) {
				3103	register const Py_UNICODE ch = *p;
				3104
				3105	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3106	return PyInt_FromLong(0);
				3107	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3108	cased = 1;
				3109	}
				3110	return PyInt_FromLong(cased);
				3111	}
				3112
				3113	static char istitle__doc__[] =
				3114	"S.istitle() -> int\n\
				3115	\n\
				3116	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3117	may only follow uncased characters and lowercase characters only cased\n\
				3118	ones. Return 0 otherwise.";
				3119
				3120	static PyObject*
				3121	unicode_istitle(PyUnicodeObject self, PyObject args)
				3122	{
				3123	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3124	register const Py_UNICODE *e;
				3125	int cased, previous_is_cased;
				3126
				3127	if (!PyArg_NoArgs(args))
				3128	return NULL;
				3129
				3130	/* Shortcut for single character strings */
				3131	if (PyUnicode_GET_SIZE(self) == 1)
				3132	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3133	(Py_UNICODE_ISUPPER(*p) != 0));
				3134
				3135	e = p + PyUnicode_GET_SIZE(self);
				3136	cased = 0;
				3137	previous_is_cased = 0;
				3138	for (; p < e; p++) {
				3139	register const Py_UNICODE ch = *p;
				3140
				3141	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3142	if (previous_is_cased)
				3143	return PyInt_FromLong(0);
				3144	previous_is_cased = 1;
				3145	cased = 1;
				3146	}
				3147	else if (Py_UNICODE_ISLOWER(ch)) {
				3148	if (!previous_is_cased)
				3149	return PyInt_FromLong(0);
				3150	previous_is_cased = 1;
				3151	cased = 1;
				3152	}
				3153	else
				3154	previous_is_cased = 0;
				3155	}
				3156	return PyInt_FromLong(cased);
				3157	}
				3158
				3159	static char isspace__doc__[] =
				3160	"S.isspace() -> int\n\
				3161	\n\
				3162	Return 1 if there are only whitespace characters in S,\n\
				3163	0 otherwise.";
				3164
				3165	static PyObject*
				3166	unicode_isspace(PyUnicodeObject self, PyObject args)
				3167	{
				3168	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3169	register const Py_UNICODE *e;
				3170
				3171	if (!PyArg_NoArgs(args))
				3172	return NULL;
				3173
				3174	/* Shortcut for single character strings */
				3175	if (PyUnicode_GET_SIZE(self) == 1 &&
				3176	Py_UNICODE_ISSPACE(*p))
				3177	return PyInt_FromLong(1);
				3178
				3179	e = p + PyUnicode_GET_SIZE(self);
				3180	for (; p < e; p++) {
				3181	if (!Py_UNICODE_ISSPACE(*p))
				3182	return PyInt_FromLong(0);
				3183	}
				3184	return PyInt_FromLong(1);
				3185	}
				3186
				3187	static char isdecimal__doc__[] =
				3188	"S.isdecimal() -> int\n\
				3189	\n\
				3190	Return 1 if there are only decimal characters in S,\n\
				3191	0 otherwise.";
				3192
				3193	static PyObject*
				3194	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3195	{
				3196	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3197	register const Py_UNICODE *e;
				3198
				3199	if (!PyArg_NoArgs(args))
				3200	return NULL;
				3201
				3202	/* Shortcut for single character strings */
				3203	if (PyUnicode_GET_SIZE(self) == 1 &&
				3204	Py_UNICODE_ISDECIMAL(*p))
				3205	return PyInt_FromLong(1);
				3206
				3207	e = p + PyUnicode_GET_SIZE(self);
				3208	for (; p < e; p++) {
				3209	if (!Py_UNICODE_ISDECIMAL(*p))
				3210	return PyInt_FromLong(0);
				3211	}
				3212	return PyInt_FromLong(1);
				3213	}
				3214
				3215	static char isdigit__doc__[] =
				3216	"S.isdigit() -> int\n\
				3217	\n\
				3218	Return 1 if there are only digit characters in S,\n\
				3219	0 otherwise.";
				3220
				3221	static PyObject*
				3222	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3223	{
				3224	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3225	register const Py_UNICODE *e;
				3226
				3227	if (!PyArg_NoArgs(args))
				3228	return NULL;
				3229
				3230	/* Shortcut for single character strings */
				3231	if (PyUnicode_GET_SIZE(self) == 1 &&
				3232	Py_UNICODE_ISDIGIT(*p))
				3233	return PyInt_FromLong(1);
				3234
				3235	e = p + PyUnicode_GET_SIZE(self);
				3236	for (; p < e; p++) {
				3237	if (!Py_UNICODE_ISDIGIT(*p))
				3238	return PyInt_FromLong(0);
				3239	}
				3240	return PyInt_FromLong(1);
				3241	}
				3242
				3243	static char isnumeric__doc__[] =
				3244	"S.isnumeric() -> int\n\
				3245	\n\
				3246	Return 1 if there are only numeric characters in S,\n\
				3247	0 otherwise.";
				3248
				3249	static PyObject*
				3250	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3251	{
				3252	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3253	register const Py_UNICODE *e;
				3254
				3255	if (!PyArg_NoArgs(args))
				3256	return NULL;
				3257
				3258	/* Shortcut for single character strings */
				3259	if (PyUnicode_GET_SIZE(self) == 1 &&
				3260	Py_UNICODE_ISNUMERIC(*p))
				3261	return PyInt_FromLong(1);
				3262
				3263	e = p + PyUnicode_GET_SIZE(self);
				3264	for (; p < e; p++) {
				3265	if (!Py_UNICODE_ISNUMERIC(*p))
				3266	return PyInt_FromLong(0);
				3267	}
				3268	return PyInt_FromLong(1);
				3269	}
				3270
				3271	static char join__doc__[] =
				3272	"S.join(sequence) -> unicode\n\
				3273	\n\
				3274	Return a string which is the concatenation of the strings in the\n\
				3275	sequence. The separator between elements is S.";
				3276
				3277	static PyObject*
				3278	unicode_join(PyUnicodeObject self, PyObject args)
				3279	{
				3280	PyObject *data;
				3281	if (!PyArg_ParseTuple(args, "O:join", &data))
				3282	return NULL;
				3283
				3284	return PyUnicode_Join((PyObject *)self, data);
				3285	}
				3286
				3287	static int
				3288	unicode_length(PyUnicodeObject *self)
				3289	{
				3290	return self->length;
				3291	}
				3292
				3293	static char ljust__doc__[] =
				3294	"S.ljust(width) -> unicode\n\
				3295	\n\
				3296	Return S left justified in a Unicode string of length width. Padding is\n\
				3297	done using spaces.";
				3298
				3299	static PyObject *
				3300	unicode_ljust(PyUnicodeObject self, PyObject args)
				3301	{
				3302	int width;
				3303	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3304	return NULL;
				3305
				3306	if (self->length >= width) {
				3307	Py_INCREF(self);
				3308	return (PyObject*) self;
				3309	}
				3310
				3311	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3312	}
				3313
				3314	static char lower__doc__[] =
				3315	"S.lower() -> unicode\n\
				3316	\n\
				3317	Return a copy of the string S converted to lowercase.";
				3318
				3319	static PyObject*
				3320	unicode_lower(PyUnicodeObject self, PyObject args)
				3321	{
				3322	if (!PyArg_NoArgs(args))
				3323	return NULL;
				3324	return fixup(self, fixlower);
				3325	}
				3326
				3327	static char lstrip__doc__[] =
				3328	"S.lstrip() -> unicode\n\
				3329	\n\
				3330	Return a copy of the string S with leading whitespace removed.";
				3331
				3332	static PyObject *
				3333	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3334	{
				3335	if (!PyArg_NoArgs(args))
				3336	return NULL;
				3337	return strip(self, 1, 0);
				3338	}
				3339
				3340	static PyObject*
				3341	unicode_repeat(PyUnicodeObject *str, int len)
				3342	{
				3343	PyUnicodeObject *u;
				3344	Py_UNICODE *p;
				3345
				3346	if (len < 0)
				3347	len = 0;
				3348
				3349	if (len == 1) {
				3350	/* no repeat, return original string */
				3351	Py_INCREF(str);
				3352	return (PyObject*) str;
				3353	}
				3354
				3355	u = _PyUnicode_New(len * str->length);
				3356	if (!u)
				3357	return NULL;
				3358
				3359	p = u->str;
				3360
				3361	while (len-- > 0) {
				3362	Py_UNICODE_COPY(p, str->str, str->length);
				3363	p += str->length;
				3364	}
				3365
				3366	return (PyObject*) u;
				3367	}
				3368
				3369	PyObject PyUnicode_Replace(PyObject obj,
				3370	PyObject *subobj,
				3371	PyObject *replobj,
				3372	int maxcount)
				3373	{
				3374	PyObject *self;
				3375	PyObject *str1;
				3376	PyObject *str2;
				3377	PyObject *result;
				3378
				3379	self = PyUnicode_FromObject(obj);
				3380	if (self == NULL)
				3381	return NULL;
				3382	str1 = PyUnicode_FromObject(subobj);
				3383	if (str1 == NULL) {
				3384	Py_DECREF(self);
				3385	return NULL;
				3386	}
				3387	str2 = PyUnicode_FromObject(replobj);
				3388	if (str2 == NULL) {
				3389	Py_DECREF(self);
				3390	Py_DECREF(str1);
				3391	return NULL;
				3392	}
				3393	result = replace((PyUnicodeObject *)self,
				3394	(PyUnicodeObject *)str1,
				3395	(PyUnicodeObject *)str2,
				3396	maxcount);
				3397	Py_DECREF(self);
				3398	Py_DECREF(str1);
				3399	Py_DECREF(str2);
				3400	return result;
				3401	}
				3402
				3403	static char replace__doc__[] =
				3404	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3405	\n\
				3406	Return a copy of S with all occurrences of substring\n\
				3407	old replaced by new. If the optional argument maxsplit is\n\
				3408	given, only the first maxsplit occurrences are replaced.";
				3409
				3410	static PyObject*
				3411	unicode_replace(PyUnicodeObject self, PyObject args)
				3412	{
				3413	PyUnicodeObject *str1;
				3414	PyUnicodeObject *str2;
				3415	int maxcount = -1;
				3416	PyObject *result;
				3417
				3418	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3419	return NULL;
				3420	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3421	if (str1 == NULL)
				3422	return NULL;
				3423	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3424	if (str2 == NULL)
				3425	return NULL;
				3426
				3427	result = replace(self, str1, str2, maxcount);
				3428
				3429	Py_DECREF(str1);
				3430	Py_DECREF(str2);
				3431	return result;
				3432	}
				3433
				3434	static
				3435	PyObject unicode_repr(PyObject unicode)
				3436	{
				3437	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3438	PyUnicode_GET_SIZE(unicode),
				3439	1);
				3440	}
				3441
				3442	static char rfind__doc__[] =
				3443	"S.rfind(sub [,start [,end]]) -> int\n\
				3444	\n\
				3445	Return the highest index in S where substring sub is found,\n\
				3446	such that sub is contained within s[start,end]. Optional\n\
				3447	arguments start and end are interpreted as in slice notation.\n\
				3448	\n\
				3449	Return -1 on failure.";
				3450
				3451	static PyObject *
				3452	unicode_rfind(PyUnicodeObject self, PyObject args)
				3453	{
				3454	PyUnicodeObject *substring;
				3455	int start = 0;
				3456	int end = INT_MAX;
				3457	PyObject *result;
				3458
				3459	if (!PyArg_ParseTuple(args, "O\|ii:rfind", &substring, &start, &end))
				3460	return NULL;
				3461	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3462	(PyObject *)substring);
				3463	if (substring == NULL)
				3464	return NULL;
				3465
				3466	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3467
				3468	Py_DECREF(substring);
				3469	return result;
				3470	}
				3471
				3472	static char rindex__doc__[] =
				3473	"S.rindex(sub [,start [,end]]) -> int\n\
				3474	\n\
				3475	Like S.rfind() but raise ValueError when the substring is not found.";
				3476
				3477	static PyObject *
				3478	unicode_rindex(PyUnicodeObject self, PyObject args)
				3479	{
				3480	int result;
				3481	PyUnicodeObject *substring;
				3482	int start = 0;
				3483	int end = INT_MAX;
				3484
				3485	if (!PyArg_ParseTuple(args, "O\|ii:rindex", &substring, &start, &end))
				3486	return NULL;
				3487	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3488	(PyObject *)substring);
				3489	if (substring == NULL)
				3490	return NULL;
				3491
				3492	result = findstring(self, substring, start, end, -1);
				3493
				3494	Py_DECREF(substring);
				3495	if (result < 0) {
				3496	PyErr_SetString(PyExc_ValueError, "substring not found");
				3497	return NULL;
				3498	}
				3499	return PyInt_FromLong(result);
				3500	}
				3501
				3502	static char rjust__doc__[] =
				3503	"S.rjust(width) -> unicode\n\
				3504	\n\
				3505	Return S right justified in a Unicode string of length width. Padding is\n\
				3506	done using spaces.";
				3507
				3508	static PyObject *
				3509	unicode_rjust(PyUnicodeObject self, PyObject args)
				3510	{
				3511	int width;
				3512	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3513	return NULL;
				3514
				3515	if (self->length >= width) {
				3516	Py_INCREF(self);
				3517	return (PyObject*) self;
				3518	}
				3519
				3520	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3521	}
				3522
				3523	static char rstrip__doc__[] =
				3524	"S.rstrip() -> unicode\n\
				3525	\n\
				3526	Return a copy of the string S with trailing whitespace removed.";
				3527
				3528	static PyObject *
				3529	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3530	{
				3531	if (!PyArg_NoArgs(args))
				3532	return NULL;
				3533	return strip(self, 0, 1);
				3534	}
				3535
				3536	static PyObject*
				3537	unicode_slice(PyUnicodeObject *self, int start, int end)
				3538	{
				3539	/* standard clamping */
				3540	if (start < 0)
				3541	start = 0;
				3542	if (end < 0)
				3543	end = 0;
				3544	if (end > self->length)
				3545	end = self->length;
				3546	if (start == 0 && end == self->length) {
				3547	/* full slice, return original string */
				3548	Py_INCREF(self);
				3549	return (PyObject*) self;
				3550	}
				3551	if (start > end)
				3552	start = end;
				3553	/* copy slice */
				3554	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3555	end - start);
				3556	}
				3557
				3558	PyObject PyUnicode_Split(PyObject s,
				3559	PyObject *sep,
				3560	int maxsplit)
				3561	{
				3562	PyObject *result;
				3563
				3564	s = PyUnicode_FromObject(s);
				3565	if (s == NULL)
				3566	return NULL;
				3567	if (sep != NULL) {
				3568	sep = PyUnicode_FromObject(sep);
				3569	if (sep == NULL) {
				3570	Py_DECREF(s);
				3571	return NULL;
				3572	}
				3573	}
				3574
				3575	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3576
				3577	Py_DECREF(s);
				3578	Py_XDECREF(sep);
				3579	return result;
				3580	}
				3581
				3582	static char split__doc__[] =
				3583	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3584	\n\
				3585	Return a list of the words in S, using sep as the\n\
				3586	delimiter string. If maxsplit is given, at most maxsplit\n\
				3587	splits are done. If sep is not specified, any whitespace string\n\
				3588	is a separator.";
				3589
				3590	static PyObject*
				3591	unicode_split(PyUnicodeObject self, PyObject args)
				3592	{
				3593	PyObject *substring = Py_None;
				3594	int maxcount = -1;
				3595
				3596	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3597	return NULL;
				3598
				3599	if (substring == Py_None)
				3600	return split(self, NULL, maxcount);
				3601	else if (PyUnicode_Check(substring))
				3602	return split(self, (PyUnicodeObject *)substring, maxcount);
				3603	else
				3604	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3605	}
				3606
				3607	static char splitlines__doc__[] =
				3608	"S.splitlines([maxsplit]]) -> list of strings\n\
				3609	\n\
				3610	Return a list of the lines in S, breaking at line boundaries.\n\
				3611	If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
				3612	included in the resulting list.";
				3613
				3614	static PyObject*
				3615	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3616	{
				3617	int maxcount = -1;
				3618
				3619	if (!PyArg_ParseTuple(args, "\|i:splitlines", &maxcount))
				3620	return NULL;
				3621
				3622	return PyUnicode_Splitlines((PyObject *)self, maxcount);
				3623	}
				3624
				3625	static
				3626	PyObject unicode_str(PyUnicodeObject self)
				3627	{
				3628	return PyUnicode_AsUTF8String((PyObject *)self);
				3629	}
				3630
				3631	static char strip__doc__[] =
				3632	"S.strip() -> unicode\n\
				3633	\n\
				3634	Return a copy of S with leading and trailing whitespace removed.";
				3635
				3636	static PyObject *
				3637	unicode_strip(PyUnicodeObject self, PyObject args)
				3638	{
				3639	if (!PyArg_NoArgs(args))
				3640	return NULL;
				3641	return strip(self, 1, 1);
				3642	}
				3643
				3644	static char swapcase__doc__[] =
				3645	"S.swapcase() -> unicode\n\
				3646	\n\
				3647	Return a copy of S with uppercase characters converted to lowercase\n\
				3648	and vice versa.";
				3649
				3650	static PyObject*
				3651	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3652	{
				3653	if (!PyArg_NoArgs(args))
				3654	return NULL;
				3655	return fixup(self, fixswapcase);
				3656	}
				3657
				3658	static char translate__doc__[] =
				3659	"S.translate(table) -> unicode\n\
				3660	\n\
				3661	Return a copy of the string S, where all characters have been mapped\n\
				3662	through the given translation table, which must be a mapping of\n\
				3663	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3664	are left untouched. Characters mapped to None are deleted.";
				3665
				3666	static PyObject*
				3667	unicode_translate(PyUnicodeObject self, PyObject args)
				3668	{
				3669	PyObject *table;
				3670
				3671	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3672	return NULL;
				3673	return PyUnicode_TranslateCharmap(self->str,
				3674	self->length,
				3675	table,
				3676	"ignore");
				3677	}
				3678
				3679	static char upper__doc__[] =
				3680	"S.upper() -> unicode\n\
				3681	\n\
				3682	Return a copy of S converted to uppercase.";
				3683
				3684	static PyObject*
				3685	unicode_upper(PyUnicodeObject self, PyObject args)
				3686	{
				3687	if (!PyArg_NoArgs(args))
				3688	return NULL;
				3689	return fixup(self, fixupper);
				3690	}
				3691
				3692	#if 0
				3693	static char zfill__doc__[] =
				3694	"S.zfill(width) -> unicode\n\
				3695	\n\
				3696	Pad a numeric string x with zeros on the left, to fill a field\n\
				3697	of the specified width. The string x is never truncated.";
				3698
				3699	static PyObject *
				3700	unicode_zfill(PyUnicodeObject self, PyObject args)
				3701	{
				3702	int fill;
				3703	PyUnicodeObject *u;
				3704
				3705	int width;
				3706	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				3707	return NULL;
				3708
				3709	if (self->length >= width) {
				3710	Py_INCREF(self);
				3711	return (PyObject*) self;
				3712	}
				3713
				3714	fill = width - self->length;
				3715
				3716	u = pad(self, fill, 0, '0');
				3717
				3718	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				3719	/* move sign to beginning of string */
				3720	u->str[0] = u->str[fill];
				3721	u->str[fill] = '0';
				3722	}
				3723
				3724	return (PyObject*) u;
				3725	}
				3726	#endif
				3727
				3728	#if 0
				3729	static PyObject*
				3730	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				3731	{
				3732	if (!PyArg_NoArgs(args))
				3733	return NULL;
				3734	return PyInt_FromLong(unicode_freelist_size);
				3735	}
				3736	#endif
				3737
				3738	static char startswith__doc__[] =
				3739	"S.startswith(prefix[, start[, end]]) -> int\n\
				3740	\n\
				3741	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3742	optional start, test S beginning at that position. With optional end, stop\n\
				3743	comparing S at that position.";
				3744
				3745	static PyObject *
				3746	unicode_startswith(PyUnicodeObject *self,
				3747	PyObject *args)
				3748	{
				3749	PyUnicodeObject *substring;
				3750	int start = 0;
				3751	int end = INT_MAX;
				3752	PyObject *result;
				3753
				3754	if (!PyArg_ParseTuple(args, "O\|ii:startswith", &substring, &start, &end))
				3755	return NULL;
				3756	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3757	(PyObject *)substring);
				3758	if (substring == NULL)
				3759	return NULL;
				3760
				3761	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				3762
				3763	Py_DECREF(substring);
				3764	return result;
				3765	}
				3766
				3767
				3768	static char endswith__doc__[] =
				3769	"S.endswith(suffix[, start[, end]]) -> int\n\
				3770	\n\
				3771	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				3772	optional start, test S beginning at that position. With optional end, stop\n\
				3773	comparing S at that position.";
				3774
				3775	static PyObject *
				3776	unicode_endswith(PyUnicodeObject *self,
				3777	PyObject *args)
				3778	{
				3779	PyUnicodeObject *substring;
				3780	int start = 0;
				3781	int end = INT_MAX;
				3782	PyObject *result;
				3783
				3784	if (!PyArg_ParseTuple(args, "O\|ii:endswith", &substring, &start, &end))
				3785	return NULL;
				3786	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3787	(PyObject *)substring);
				3788	if (substring == NULL)
				3789	return NULL;
				3790
				3791	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				3792
				3793	Py_DECREF(substring);
				3794	return result;
				3795	}
				3796
				3797
				3798	static PyMethodDef unicode_methods[] = {
				3799
				3800	/* Order is according to common usage: often used methods should
				3801	appear first, since lookup is done sequentially. */
				3802
				3803	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				3804	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				3805	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				3806	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				3807	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				3808	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				3809	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				3810	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				3811	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				3812	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				3813	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				3814	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				3815	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				3816	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				3817	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				3818	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				3819	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				3820	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				3821	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				3822	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				3823	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				3824	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				3825	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				3826	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				3827	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				3828	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				3829	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				3830	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				3831	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				3832	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				3833	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				3834	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				3835	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				3836	#if 0
				3837	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				3838	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				3839	#endif
				3840
				3841	#if 0
				3842	/* This one is just used for debugging the implementation. */
				3843	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				3844	#endif
				3845
				3846	{NULL, NULL}
				3847	};
				3848
				3849	static PyObject *
				3850	unicode_getattr(PyUnicodeObject self, char name)
				3851	{
				3852	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				3853	}
				3854
				3855	static PySequenceMethods unicode_as_sequence = {
				3856	(inquiry) unicode_length, /* sq_length */
				3857	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				3858	(intargfunc) unicode_repeat, /* sq_repeat */
				3859	(intargfunc) unicode_getitem, /* sq_item */
				3860	(intintargfunc) unicode_slice, /* sq_slice */
				3861	0, /* sq_ass_item */
				3862	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3863	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3864	};
				3865
				3866	static int
				3867	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				3868	int index,
				3869	const void **ptr)
				3870	{
				3871	if (index != 0) {
				3872	PyErr_SetString(PyExc_SystemError,
				3873	"accessing non-existent unicode segment");
				3874	return -1;
				3875	}
				3876	ptr = (void ) self->str;
				3877	return PyUnicode_GET_DATA_SIZE(self);
				3878	}
				3879
				3880	static int
				3881	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				3882	const void **ptr)
				3883	{
				3884	PyErr_SetString(PyExc_TypeError,
				3885	"cannot use unicode as modifyable buffer");
				3886	return -1;
				3887	}
				3888
				3889	static int
				3890	unicode_buffer_getsegcount(PyUnicodeObject *self,
				3891	int *lenp)
				3892	{
				3893	if (lenp)
				3894	*lenp = PyUnicode_GET_DATA_SIZE(self);
				3895	return 1;
				3896	}
				3897
				3898	static int
				3899	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				3900	int index,
				3901	const void **ptr)
				3902	{
				3903	PyObject *str;
				3904
				3905	if (index != 0) {
				3906	PyErr_SetString(PyExc_SystemError,
				3907	"accessing non-existent unicode segment");
				3908	return -1;
				3909	}
				3910	str = utf8_string(self, NULL);
				3911	if (str == NULL)
				3912	return -1;
				3913	ptr = (void ) PyString_AS_STRING(str);
				3914	return PyString_GET_SIZE(str);
				3915	}
				3916
				3917	/* Helpers for PyUnicode_Format() */
				3918
				3919	static PyObject *
				3920	getnextarg(args, arglen, p_argidx)
				3921	PyObject *args;
				3922	int arglen;
				3923	int *p_argidx;
				3924	{
				3925	int argidx = *p_argidx;
				3926	if (argidx < arglen) {
				3927	(*p_argidx)++;
				3928	if (arglen < 0)
				3929	return args;
				3930	else
				3931	return PyTuple_GetItem(args, argidx);
				3932	}
				3933	PyErr_SetString(PyExc_TypeError,
				3934	"not enough arguments for format string");
				3935	return NULL;
				3936	}
				3937
				3938	#define F_LJUST (1<<0)
				3939	#define F_SIGN (1<<1)
				3940	#define F_BLANK (1<<2)
				3941	#define F_ALT (1<<3)
				3942	#define F_ZERO (1<<4)
				3943
				3944	static
				3945	#ifdef HAVE_STDARG_PROTOTYPES
				3946	int usprintf(register Py_UNICODE buffer, char format, ...)
				3947	#else
				3948	int usprintf(va_alist) va_dcl
				3949	#endif
				3950	{
				3951	register int i;
				3952	int len;
				3953	va_list va;
				3954	char *charbuffer;
				3955	#ifdef HAVE_STDARG_PROTOTYPES
				3956	va_start(va, format);
				3957	#else
				3958	Py_UNICODE *args;
				3959	char *format;
				3960
				3961	va_start(va);
				3962	buffer = va_arg(va, Py_UNICODE *);
				3963	format = va_arg(va, char *);
				3964	#endif
				3965
				3966	/* First, format the string as char array, then expand to Py_UNICODE
				3967	array. */
				3968	charbuffer = (char *)buffer;
				3969	len = vsprintf(charbuffer, format, va);
				3970	for (i = len - 1; i >= 0; i--)
				3971	buffer[i] = (Py_UNICODE) charbuffer[i];
				3972
				3973	va_end(va);
				3974	return len;
				3975	}
				3976
				3977	static int
				3978	formatfloat(Py_UNICODE *buf,
				3979	int flags,
				3980	int prec,
				3981	int type,
				3982	PyObject *v)
				3983	{
				3984	char fmt[20];
				3985	double x;
				3986
				3987	x = PyFloat_AsDouble(v);
				3988	if (x == -1.0 && PyErr_Occurred())
				3989	return -1;
				3990	if (prec < 0)
				3991	prec = 6;
				3992	if (prec > 50)
				3993	prec = 50; /* Arbitrary limitation */
				3994	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				3995	type = 'g';
				3996	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				3997	return usprintf(buf, fmt, x);
				3998	}
				3999
				4000	static int
				4001	formatint(Py_UNICODE *buf,
				4002	int flags,
				4003	int prec,
				4004	int type,
				4005	PyObject *v)
				4006	{
				4007	char fmt[20];
				4008	long x;
				4009
				4010	x = PyInt_AsLong(v);
				4011	if (x == -1 && PyErr_Occurred())
				4012	return -1;
				4013	if (prec < 0)
				4014	prec = 1;
				4015	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4016	return usprintf(buf, fmt, x);
				4017	}
				4018
				4019	static int
				4020	formatchar(Py_UNICODE *buf,
				4021	PyObject *v)
				4022	{
				4023	if (PyUnicode_Check(v))
				4024	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4025
				4026	else if (PyString_Check(v))
				4027	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4028
				4029	else {
				4030	/* Integer input truncated to a character */
				4031	long x;
				4032	x = PyInt_AsLong(v);
				4033	if (x == -1 && PyErr_Occurred())
				4034	return -1;
				4035	buf[0] = (char) x;
				4036	}
				4037	buf[1] = '\0';
				4038	return 1;
				4039	}
				4040
				4041	PyObject PyUnicode_Format(PyObject format,
				4042	PyObject *args)
				4043	{
				4044	Py_UNICODE fmt, res;
				4045	int fmtcnt, rescnt, reslen, arglen, argidx;
				4046	int args_owned = 0;
				4047	PyUnicodeObject *result = NULL;
				4048	PyObject *dict = NULL;
				4049	PyObject *uformat;
				4050
				4051	if (format == NULL \|\| args == NULL) {
				4052	PyErr_BadInternalCall();
				4053	return NULL;
				4054	}
				4055	uformat = PyUnicode_FromObject(format);
				4056	fmt = PyUnicode_AS_UNICODE(uformat);
				4057	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4058
				4059	reslen = rescnt = fmtcnt + 100;
				4060	result = _PyUnicode_New(reslen);
				4061	if (result == NULL)
				4062	goto onError;
				4063	res = PyUnicode_AS_UNICODE(result);
				4064
				4065	if (PyTuple_Check(args)) {
				4066	arglen = PyTuple_Size(args);
				4067	argidx = 0;
				4068	}
				4069	else {
				4070	arglen = -1;
				4071	argidx = -2;
				4072	}
				4073	if (args->ob_type->tp_as_mapping)
				4074	dict = args;
				4075
				4076	while (--fmtcnt >= 0) {
				4077	if (*fmt != '%') {
				4078	if (--rescnt < 0) {
				4079	rescnt = fmtcnt + 100;
				4080	reslen += rescnt;
				4081	if (_PyUnicode_Resize(result, reslen) < 0)
				4082	return NULL;
				4083	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4084	--rescnt;
				4085	}
				4086	res++ = fmt++;
				4087	}
				4088	else {
				4089	/* Got a format specifier */
				4090	int flags = 0;
				4091	int width = -1;
				4092	int prec = -1;
				4093	int size = 0;
				4094	Py_UNICODE c = '\0';
				4095	Py_UNICODE fill;
				4096	PyObject *v = NULL;
				4097	PyObject *temp = NULL;
				4098	Py_UNICODE *buf;
				4099	Py_UNICODE sign;
				4100	int len;
				4101	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4102
				4103	fmt++;
				4104	if (*fmt == '(') {
				4105	Py_UNICODE *keystart;
				4106	int keylen;
				4107	PyObject *key;
				4108	int pcount = 1;
				4109
				4110	if (dict == NULL) {
				4111	PyErr_SetString(PyExc_TypeError,
				4112	"format requires a mapping");
				4113	goto onError;
				4114	}
				4115	++fmt;
				4116	--fmtcnt;
				4117	keystart = fmt;
				4118	/* Skip over balanced parentheses */
				4119	while (pcount > 0 && --fmtcnt >= 0) {
				4120	if (*fmt == ')')
				4121	--pcount;
				4122	else if (*fmt == '(')
				4123	++pcount;
				4124	fmt++;
				4125	}
				4126	keylen = fmt - keystart - 1;
				4127	if (fmtcnt < 0 \|\| pcount > 0) {
				4128	PyErr_SetString(PyExc_ValueError,
				4129	"incomplete format key");
				4130	goto onError;
				4131	}
				4132	/* keys are converted to strings (using UTF-8) and
				4133	then looked up since Python uses strings to hold
				4134	variables names etc. in its namespaces and we
				4135	wouldn't want to break common idioms. The
				4136	alternative would be using Unicode objects for the
				4137	lookup but u"abc" and "abc" have different hash
				4138	values (on purpose). */
				4139	key = PyUnicode_EncodeUTF8(keystart,
				4140	keylen,
				4141	NULL);
				4142	if (key == NULL)
				4143	goto onError;
				4144	if (args_owned) {
				4145	Py_DECREF(args);
				4146	args_owned = 0;
				4147	}
				4148	args = PyObject_GetItem(dict, key);
				4149	Py_DECREF(key);
				4150	if (args == NULL) {
				4151	goto onError;
				4152	}
				4153	args_owned = 1;
				4154	arglen = -1;
				4155	argidx = -2;
				4156	}
				4157	while (--fmtcnt >= 0) {
				4158	switch (c = *fmt++) {
				4159	case '-': flags \|= F_LJUST; continue;
				4160	case '+': flags \|= F_SIGN; continue;
				4161	case ' ': flags \|= F_BLANK; continue;
				4162	case '#': flags \|= F_ALT; continue;
				4163	case '0': flags \|= F_ZERO; continue;
				4164	}
				4165	break;
				4166	}
				4167	if (c == '*') {
				4168	v = getnextarg(args, arglen, &argidx);
				4169	if (v == NULL)
				4170	goto onError;
				4171	if (!PyInt_Check(v)) {
				4172	PyErr_SetString(PyExc_TypeError,
				4173	"* wants int");
				4174	goto onError;
				4175	}
				4176	width = PyInt_AsLong(v);
				4177	if (width < 0) {
				4178	flags \|= F_LJUST;
				4179	width = -width;
				4180	}
				4181	if (--fmtcnt >= 0)
				4182	c = *fmt++;
				4183	}
				4184	else if (c >= '0' && c <= '9') {
				4185	width = c - '0';
				4186	while (--fmtcnt >= 0) {
				4187	c = *fmt++;
				4188	if (c < '0' \|\| c > '9')
				4189	break;
				4190	if ((width*10) / 10 != width) {
				4191	PyErr_SetString(PyExc_ValueError,
				4192	"width too big");
				4193	goto onError;
				4194	}
				4195	width = width*10 + (c - '0');
				4196	}
				4197	}
				4198	if (c == '.') {
				4199	prec = 0;
				4200	if (--fmtcnt >= 0)
				4201	c = *fmt++;
				4202	if (c == '*') {
				4203	v = getnextarg(args, arglen, &argidx);
				4204	if (v == NULL)
				4205	goto onError;
				4206	if (!PyInt_Check(v)) {
				4207	PyErr_SetString(PyExc_TypeError,
				4208	"* wants int");
				4209	goto onError;
				4210	}
				4211	prec = PyInt_AsLong(v);
				4212	if (prec < 0)
				4213	prec = 0;
				4214	if (--fmtcnt >= 0)
				4215	c = *fmt++;
				4216	}
				4217	else if (c >= '0' && c <= '9') {
				4218	prec = c - '0';
				4219	while (--fmtcnt >= 0) {
				4220	c = Py_CHARMASK(*fmt++);
				4221	if (c < '0' \|\| c > '9')
				4222	break;
				4223	if ((prec*10) / 10 != prec) {
				4224	PyErr_SetString(PyExc_ValueError,
				4225	"prec too big");
				4226	goto onError;
				4227	}
				4228	prec = prec*10 + (c - '0');
				4229	}
				4230	}
				4231	} /* prec */
				4232	if (fmtcnt >= 0) {
				4233	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4234	size = c;
				4235	if (--fmtcnt >= 0)
				4236	c = *fmt++;
				4237	}
				4238	}
				4239	if (fmtcnt < 0) {
				4240	PyErr_SetString(PyExc_ValueError,
				4241	"incomplete format");
				4242	goto onError;
				4243	}
				4244	if (c != '%') {
				4245	v = getnextarg(args, arglen, &argidx);
				4246	if (v == NULL)
				4247	goto onError;
				4248	}
				4249	sign = 0;
				4250	fill = ' ';
				4251	switch (c) {
				4252
				4253	case '%':
				4254	buf = tmpbuf;
				4255	buf[0] = '%';
				4256	len = 1;
				4257	break;
				4258
				4259	case 's':
				4260	case 'r':
				4261	if (PyUnicode_Check(v) && c == 's') {
				4262	temp = v;
				4263	Py_INCREF(temp);
				4264	}
				4265	else {
				4266	PyObject *unicode;
				4267	if (c == 's')
				4268	temp = PyObject_Str(v);
				4269	else
				4270	temp = PyObject_Repr(v);
				4271	if (temp == NULL)
				4272	goto onError;
				4273	if (!PyString_Check(temp)) {
				4274	/* XXX Note: this should never happen, since
				4275	PyObject_Repr() and PyObject_Str() assure
				4276	this */
				4277	Py_DECREF(temp);
				4278	PyErr_SetString(PyExc_TypeError,
				4279	"%s argument has non-string str()");
				4280	goto onError;
				4281	}
				4282	unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
				4283	PyString_GET_SIZE(temp),
				4284	"strict");
				4285	Py_DECREF(temp);
				4286	temp = unicode;
				4287	if (temp == NULL)
				4288	goto onError;
				4289	}
				4290	buf = PyUnicode_AS_UNICODE(temp);
				4291	len = PyUnicode_GET_SIZE(temp);
				4292	if (prec >= 0 && len > prec)
				4293	len = prec;
				4294	break;
				4295
				4296	case 'i':
				4297	case 'd':
				4298	case 'u':
				4299	case 'o':
				4300	case 'x':
				4301	case 'X':
				4302	if (c == 'i')
				4303	c = 'd';
				4304	buf = tmpbuf;
				4305	len = formatint(buf, flags, prec, c, v);
				4306	if (len < 0)
				4307	goto onError;
				4308	sign = (c == 'd');
				4309	if (flags & F_ZERO) {
				4310	fill = '0';
				4311	if ((flags&F_ALT) &&
				4312	(c == 'x' \|\| c == 'X') &&
				4313	buf[0] == '0' && buf[1] == c) {
				4314	res++ = buf++;
				4315	res++ = buf++;
				4316	rescnt -= 2;
				4317	len -= 2;
				4318	width -= 2;
				4319	if (width < 0)
				4320	width = 0;
				4321	}
				4322	}
				4323	break;
				4324
				4325	case 'e':
				4326	case 'E':
				4327	case 'f':
				4328	case 'g':
				4329	case 'G':
				4330	buf = tmpbuf;
				4331	len = formatfloat(buf, flags, prec, c, v);
				4332	if (len < 0)
				4333	goto onError;
				4334	sign = 1;
				4335	if (flags&F_ZERO)
				4336	fill = '0';
				4337	break;
				4338
				4339	case 'c':
				4340	buf = tmpbuf;
				4341	len = formatchar(buf, v);
				4342	if (len < 0)
				4343	goto onError;
				4344	break;
				4345
				4346	default:
				4347	PyErr_Format(PyExc_ValueError,
				4348	"unsupported format character '%c' (0x%x)",
				4349	c, c);
				4350	goto onError;
				4351	}
				4352	if (sign) {
				4353	if (buf == '-' \|\| buf == '+') {
				4354	sign = *buf++;
				4355	len--;
				4356	}
				4357	else if (flags & F_SIGN)
				4358	sign = '+';
				4359	else if (flags & F_BLANK)
				4360	sign = ' ';
				4361	else
				4362	sign = 0;
				4363	}
				4364	if (width < len)
				4365	width = len;
				4366	if (rescnt < width + (sign != 0)) {
				4367	reslen -= rescnt;
				4368	rescnt = width + fmtcnt + 100;
				4369	reslen += rescnt;
				4370	if (_PyUnicode_Resize(result, reslen) < 0)
				4371	return NULL;
				4372	res = PyUnicode_AS_UNICODE(result)
				4373	+ reslen - rescnt;
				4374	}
				4375	if (sign) {
				4376	if (fill != ' ')
				4377	*res++ = sign;
				4378	rescnt--;
				4379	if (width > len)
				4380	width--;
				4381	}
				4382	if (width > len && !(flags & F_LJUST)) {
				4383	do {
				4384	--rescnt;
				4385	*res++ = fill;
				4386	} while (--width > len);
				4387	}
				4388	if (sign && fill == ' ')
				4389	*res++ = sign;
				4390	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4391	res += len;
				4392	rescnt -= len;
				4393	while (--width >= len) {
				4394	--rescnt;
				4395	*res++ = ' ';
				4396	}
				4397	if (dict && (argidx < arglen) && c != '%') {
				4398	PyErr_SetString(PyExc_TypeError,
				4399	"not all arguments converted");
				4400	goto onError;
				4401	}
				4402	Py_XDECREF(temp);
				4403	} /* '%' */
				4404	} /* until end */
				4405	if (argidx < arglen && !dict) {
				4406	PyErr_SetString(PyExc_TypeError,
				4407	"not all arguments converted");
				4408	goto onError;
				4409	}
				4410
				4411	if (args_owned) {
				4412	Py_DECREF(args);
				4413	}
				4414	Py_DECREF(uformat);
				4415	_PyUnicode_Resize(result, reslen - rescnt);
				4416	return (PyObject *)result;
				4417
				4418	onError:
				4419	Py_XDECREF(result);
				4420	Py_DECREF(uformat);
				4421	if (args_owned) {
				4422	Py_DECREF(args);
				4423	}
				4424	return NULL;
				4425	}
				4426
				4427	static PyBufferProcs unicode_as_buffer = {
				4428	(getreadbufferproc) unicode_buffer_getreadbuf,
				4429	(getwritebufferproc) unicode_buffer_getwritebuf,
				4430	(getsegcountproc) unicode_buffer_getsegcount,
				4431	(getcharbufferproc) unicode_buffer_getcharbuf,
				4432	};
				4433
				4434	PyTypeObject PyUnicode_Type = {
				4435	PyObject_HEAD_INIT(&PyType_Type)
				4436	0, /* ob_size */
				4437	"unicode", /* tp_name */
				4438	sizeof(PyUnicodeObject), /* tp_size */
				4439	0, /* tp_itemsize */
				4440	/* Slots */
				4441	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4442	0, /* tp_print */
				4443	(getattrfunc)unicode_getattr, /* tp_getattr */
				4444	0, /* tp_setattr */
				4445	(cmpfunc) unicode_compare, /* tp_compare */
				4446	(reprfunc) unicode_repr, /* tp_repr */
				4447	0, /* tp_as_number */
				4448	&unicode_as_sequence, /* tp_as_sequence */
				4449	0, /* tp_as_mapping */
				4450	(hashfunc) unicode_hash, /* tp_hash*/
				4451	0, /* tp_call*/
				4452	(reprfunc) unicode_str, /* tp_str */
				4453	(getattrofunc) NULL, /* tp_getattro */
				4454	(setattrofunc) NULL, /* tp_setattro */
				4455	&unicode_as_buffer, /* tp_as_buffer */
				4456	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4457	};
				4458
				4459	/* Initialize the Unicode implementation */
				4460
				4461	void _PyUnicode_Init()
				4462	{
				4463	/* Doublecheck the configuration... */
				4464	if (sizeof(Py_UNICODE) != 2)
				4465	Py_FatalError("Unicode configuration error: "
				4466	"sizeof(Py_UNICODE) != 2 bytes");
				4467
				4468	unicode_empty = _PyUnicode_New(0);
				4469	}
				4470
				4471	/* Finalize the Unicode implementation */
				4472
				4473	void
				4474	_PyUnicode_Fini()
				4475	{
				4476	PyUnicodeObject *u = unicode_freelist;
				4477
				4478	while (u != NULL) {
				4479	PyUnicodeObject *v = u;
				4480	u = (PyUnicodeObject *)u;
				4481	free(v);
				4482	}
				4483	Py_XDECREF(unicode_empty);
				4484	}