Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: d63165ea05b64c1e483dfb6c6934ef9c330f3a97 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
				4	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
				76	/* Limit for the Unicode object free list */
				77
				78	#define MAX_UNICODE_FREELIST_SIZE 1024
				79
				80	/* Limit for the Unicode object free list stay alive optimization.
				81
				82	The implementation will keep allocated Unicode memory intact for
				83	all objects on the free list having a size less than this
				84	limit. This reduces malloc() overhead for small Unicode objects.
				85
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	86	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	87	(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
				88	malloc()-overhead) bytes of unused garbage.
				89
				90	Setting the limit to 0 effectively turns the feature off.
				91
				92	XXX The feature is currently turned off because there are
				93	apparently some lingering bugs in its implementation which I
				94	haven't yet been able to sort out.
				95
				96	*/
				97
				98	#define STAYALIVE_SIZE_LIMIT 0
				99
				100	/* Endianness switches; defaults to little endian */
				101
				102	#ifdef WORDS_BIGENDIAN
				103	# define BYTEORDER_IS_BIG_ENDIAN
				104	#else
				105	# define BYTEORDER_IS_LITTLE_ENDIAN
				106	#endif
				107
				108	/* --- Globals ------------------------------------------------------------ */
				109
				110	/* The empty Unicode object */
				111	static PyUnicodeObject *unicode_empty = NULL;
				112
				113	/* Free list for Unicode objects */
				114	static PyUnicodeObject *unicode_freelist = NULL;
				115	static int unicode_freelist_size = 0;
				116
				117	/* --- Unicode Object ----------------------------------------------------- */
				118
				119	static
				120	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				121	int length)
				122	{
				123	void *oldstr;
				124
				125	/* Shortcut if there's nothing to do. */
				126	if (unicode->length == length)
				127	return 0;
				128
				129	/* Resizing unicode_empty is not allowed. */
				130	if (unicode == unicode_empty) {
				131	PyErr_SetString(PyExc_SystemError,
				132	"can't resize empty unicode object");
				133	return -1;
				134	}
				135
				136	/* We allocate one more byte to make sure the string is
				137	Ux0000 terminated -- XXX is this needed ? */
				138	oldstr = unicode->str;
				139	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				140	if (!unicode->str) {
				141	unicode->str = oldstr;
				142	PyErr_NoMemory();
				143	return -1;
				144	}
				145	unicode->str[length] = 0;
				146	unicode->length = length;
				147
				148	/* Reset the object caches */
				149	if (unicode->utf8str) {
				150	Py_DECREF(unicode->utf8str);
				151	unicode->utf8str = NULL;
				152	}
				153	unicode->hash = -1;
				154
				155	return 0;
				156	}
				157
				158	/* We allocate one more byte to make sure the string is
				159	Ux0000 terminated -- XXX is this needed ?
				160
				161	XXX This allocator could further be enhanced by assuring that the
				162	free list never reduces its size below 1.
				163
				164	*/
				165
				166	static
				167	PyUnicodeObject *_PyUnicode_New(int length)
				168	{
				169	register PyUnicodeObject *unicode;
				170
				171	/* Optimization for empty strings */
				172	if (length == 0 && unicode_empty != NULL) {
				173	Py_INCREF(unicode_empty);
				174	return unicode_empty;
				175	}
				176
				177	/* Unicode freelist & memory allocation */
				178	if (unicode_freelist) {
				179	unicode = unicode_freelist;
				180	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				181	unicode_freelist_size--;
				182	unicode->ob_type = &PyUnicode_Type;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	183	_Py_NewReference((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	184	if (unicode->str) {
				185	if (unicode->length < length &&
				186	_PyUnicode_Resize(unicode, length)) {
				187	free(unicode->str);
				188	PyMem_DEL(unicode);
				189	return NULL;
				190	}
				191	}
				192	else
				193	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				194	}
				195	else {
				196	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				197	if (unicode == NULL)
				198	return NULL;
				199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				200	}
				201
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	202	if (!unicode->str)
				203	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	204	unicode->str[length] = 0;
				205	unicode->length = length;
				206	unicode->hash = -1;
				207	unicode->utf8str = NULL;
				208	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	209
				210	onError:
				211	_Py_ForgetReference((PyObject *)unicode);
				212	PyMem_DEL(unicode);
				213	PyErr_NoMemory();
				214	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	215	}
				216
				217	static
				218	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				219	{
				220	Py_XDECREF(unicode->utf8str);
				221	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
				222	if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
				223	free(unicode->str);
				224	unicode->str = NULL;
				225	unicode->length = 0;
				226	}
				227	(PyUnicodeObject *)unicode = unicode_freelist;
				228	unicode_freelist = unicode;
				229	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	230	}
				231	else {
				232	free(unicode->str);
				233	PyMem_DEL(unicode);
				234	}
				235	}
				236
				237	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				238	int size)
				239	{
				240	PyUnicodeObject *unicode;
				241
				242	unicode = _PyUnicode_New(size);
				243	if (!unicode)
				244	return NULL;
				245
				246	/* Copy the Unicode data into the new object */
				247	if (u != NULL)
				248	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				249
				250	return (PyObject *)unicode;
				251	}
				252
				253	#ifdef HAVE_WCHAR_H
				254
				255	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				256	int size)
				257	{
				258	PyUnicodeObject *unicode;
				259
				260	if (w == NULL) {
				261	PyErr_BadInternalCall();
				262	return NULL;
				263	}
				264
				265	unicode = _PyUnicode_New(size);
				266	if (!unicode)
				267	return NULL;
				268
				269	/* Copy the wchar_t data into the new object */
				270	#ifdef HAVE_USABLE_WCHAR_T
				271	memcpy(unicode->str, w, size * sizeof(wchar_t));
				272	#else
				273	{
				274	register Py_UNICODE *u;
				275	register int i;
				276	u = PyUnicode_AS_UNICODE(unicode);
				277	for (i = size; i >= 0; i--)
				278	u++ = w++;
				279	}
				280	#endif
				281
				282	return (PyObject *)unicode;
				283	}
				284
				285	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				286	register wchar_t *w,
				287	int size)
				288	{
				289	if (unicode == NULL) {
				290	PyErr_BadInternalCall();
				291	return -1;
				292	}
				293	if (size > PyUnicode_GET_SIZE(unicode))
				294	size = PyUnicode_GET_SIZE(unicode);
				295	#ifdef HAVE_USABLE_WCHAR_T
				296	memcpy(w, unicode->str, size * sizeof(wchar_t));
				297	#else
				298	{
				299	register Py_UNICODE *u;
				300	register int i;
				301	u = PyUnicode_AS_UNICODE(unicode);
				302	for (i = size; i >= 0; i--)
				303	w++ = u++;
				304	}
				305	#endif
				306
				307	return size;
				308	}
				309
				310	#endif
				311
				312	PyObject PyUnicode_FromObject(register PyObject obj)
				313	{
				314	const char *s;
				315	int len;
				316
				317	if (obj == NULL) {
				318	PyErr_BadInternalCall();
				319	return NULL;
				320	}
				321	else if (PyUnicode_Check(obj)) {
				322	Py_INCREF(obj);
				323	return obj;
				324	}
				325	else if (PyString_Check(obj)) {
				326	s = PyString_AS_STRING(obj);
				327	len = PyString_GET_SIZE(obj);
				328	}
				329	else if (PyObject_AsCharBuffer(obj, &s, &len))
				330	return NULL;
				331	if (len == 0) {
				332	Py_INCREF(unicode_empty);
				333	return (PyObject *)unicode_empty;
				334	}
				335	return PyUnicode_DecodeUTF8(s, len, "strict");
				336	}
				337
				338	PyObject PyUnicode_Decode(const char s,
				339	int size,
				340	const char *encoding,
				341	const char *errors)
				342	{
				343	PyObject buffer = NULL, unicode;
				344
				345	/* Shortcut for the default encoding UTF-8 */
				346	if (encoding == NULL \|\|
				347	(strcmp(encoding, "utf-8") == 0))
				348	return PyUnicode_DecodeUTF8(s, size, errors);
				349
				350	/* Decode via the codec registry */
				351	buffer = PyBuffer_FromMemory((void *)s, size);
				352	if (buffer == NULL)
				353	goto onError;
				354	unicode = PyCodec_Decode(buffer, encoding, errors);
				355	if (unicode == NULL)
				356	goto onError;
				357	if (!PyUnicode_Check(unicode)) {
				358	PyErr_Format(PyExc_TypeError,
				359	"decoder did not return an unicode object (type=%s)",
				360	unicode->ob_type->tp_name);
				361	Py_DECREF(unicode);
				362	goto onError;
				363	}
				364	Py_DECREF(buffer);
				365	return unicode;
				366
				367	onError:
				368	Py_XDECREF(buffer);
				369	return NULL;
				370	}
				371
				372	PyObject PyUnicode_Encode(const Py_UNICODE s,
				373	int size,
				374	const char *encoding,
				375	const char *errors)
				376	{
				377	PyObject v, unicode;
				378
				379	unicode = PyUnicode_FromUnicode(s, size);
				380	if (unicode == NULL)
				381	return NULL;
				382	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				383	Py_DECREF(unicode);
				384	return v;
				385	}
				386
				387	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				388	const char *encoding,
				389	const char *errors)
				390	{
				391	PyObject *v;
				392
				393	if (!PyUnicode_Check(unicode)) {
				394	PyErr_BadArgument();
				395	goto onError;
				396	}
				397	/* Shortcut for the default encoding UTF-8 */
				398	if ((encoding == NULL \|\|
				399	(strcmp(encoding, "utf-8") == 0)) &&
				400	errors == NULL)
				401	return PyUnicode_AsUTF8String(unicode);
				402
				403	/* Encode via the codec registry */
				404	v = PyCodec_Encode(unicode, encoding, errors);
				405	if (v == NULL)
				406	goto onError;
				407	/* XXX Should we really enforce this ? */
				408	if (!PyString_Check(v)) {
				409	PyErr_Format(PyExc_TypeError,
				410	"encoder did not return a string object (type=%s)",
				411	v->ob_type->tp_name);
				412	Py_DECREF(v);
				413	goto onError;
				414	}
				415	return v;
				416
				417	onError:
				418	return NULL;
				419	}
				420
				421	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				422	{
				423	if (!PyUnicode_Check(unicode)) {
				424	PyErr_BadArgument();
				425	goto onError;
				426	}
				427	return PyUnicode_AS_UNICODE(unicode);
				428
				429	onError:
				430	return NULL;
				431	}
				432
				433	int PyUnicode_GetSize(PyObject *unicode)
				434	{
				435	if (!PyUnicode_Check(unicode)) {
				436	PyErr_BadArgument();
				437	goto onError;
				438	}
				439	return PyUnicode_GET_SIZE(unicode);
				440
				441	onError:
				442	return -1;
				443	}
				444
				445	/* --- UTF-8 Codec -------------------------------------------------------- */
				446
				447	static
				448	char utf8_code_length[256] = {
				449	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				450	illegal prefix. see RFC 2279 for details */
				451	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				452	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				453	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				454	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				455	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				456	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				457	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				458	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				459	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				460	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				461	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				462	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				463	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				464	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				465	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				466	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				467	};
				468
				469	static
				470	int utf8_decoding_error(const char **source,
				471	Py_UNICODE **dest,
				472	const char *errors,
				473	const char *details)
				474	{
				475	if ((errors == NULL) \|\|
				476	(strcmp(errors,"strict") == 0)) {
				477	PyErr_Format(PyExc_UnicodeError,
				478	"UTF-8 decoding error: %s",
				479	details);
				480	return -1;
				481	}
				482	else if (strcmp(errors,"ignore") == 0) {
				483	(*source)++;
				484	return 0;
				485	}
				486	else if (strcmp(errors,"replace") == 0) {
				487	(*source)++;
				488	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				489	(*dest)++;
				490	return 0;
				491	}
				492	else {
				493	PyErr_Format(PyExc_ValueError,
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	494	"UTF-8 decoding error; unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	495	errors);
				496	return -1;
				497	}
				498	}
				499
				500	#define UTF8_ERROR(details) do { \
				501	if (utf8_decoding_error(&s, &p, errors, details)) \
				502	goto onError; \
				503	continue; \
				504	} while (0)
				505
				506	PyObject PyUnicode_DecodeUTF8(const char s,
				507	int size,
				508	const char *errors)
				509	{
				510	int n;
				511	const char *e;
				512	PyUnicodeObject *unicode;
				513	Py_UNICODE *p;
				514
				515	/* Note: size will always be longer than the resulting Unicode
				516	character count */
				517	unicode = _PyUnicode_New(size);
				518	if (!unicode)
				519	return NULL;
				520	if (size == 0)
				521	return (PyObject *)unicode;
				522
				523	/* Unpack UTF-8 encoded data */
				524	p = unicode->str;
				525	e = s + size;
				526
				527	while (s < e) {
				528	register Py_UNICODE ch = (unsigned char)*s;
				529
				530	if (ch < 0x80) {
				531	*p++ = ch;
				532	s++;
				533	continue;
				534	}
				535
				536	n = utf8_code_length[ch];
				537
				538	if (s + n > e)
				539	UTF8_ERROR("unexpected end of data");
				540
				541	switch (n) {
				542
				543	case 0:
				544	UTF8_ERROR("unexpected code byte");
				545	break;
				546
				547	case 1:
				548	UTF8_ERROR("internal error");
				549	break;
				550
				551	case 2:
				552	if ((s[1] & 0xc0) != 0x80)
				553	UTF8_ERROR("invalid data");
				554	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				555	if (ch < 0x80)
				556	UTF8_ERROR("illegal encoding");
				557	else
				558	*p++ = ch;
				559	break;
				560
				561	case 3:
				562	if ((s[1] & 0xc0) != 0x80 \|\|
				563	(s[2] & 0xc0) != 0x80)
				564	UTF8_ERROR("invalid data");
				565	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				566	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				567	UTF8_ERROR("illegal encoding");
				568	else
				569	*p++ = ch;
				570	break;
				571
				572	default:
				573	/* Other sizes are only needed for UCS-4 */
				574	UTF8_ERROR("unsupported Unicode code range");
				575	}
				576	s += n;
				577	}
				578
				579	/* Adjust length */
				580	if (_PyUnicode_Resize(unicode, p - unicode->str))
				581	goto onError;
				582
				583	return (PyObject *)unicode;
				584
				585	onError:
				586	Py_DECREF(unicode);
				587	return NULL;
				588	}
				589
				590	#undef UTF8_ERROR
				591
				592	static
				593	int utf8_encoding_error(const Py_UNICODE **source,
				594	char **dest,
				595	const char *errors,
				596	const char *details)
				597	{
				598	if ((errors == NULL) \|\|
				599	(strcmp(errors,"strict") == 0)) {
				600	PyErr_Format(PyExc_UnicodeError,
				601	"UTF-8 encoding error: %s",
				602	details);
				603	return -1;
				604	}
				605	else if (strcmp(errors,"ignore") == 0) {
				606	return 0;
				607	}
				608	else if (strcmp(errors,"replace") == 0) {
				609	**dest = '?';
				610	(*dest)++;
				611	return 0;
				612	}
				613	else {
				614	PyErr_Format(PyExc_ValueError,
				615	"UTF-8 encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	616	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	617	errors);
				618	return -1;
				619	}
				620	}
				621
				622	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				623	int size,
				624	const char *errors)
				625	{
				626	PyObject *v;
				627	char *p;
				628	char *q;
				629
				630	v = PyString_FromStringAndSize(NULL, 3 * size);
				631	if (v == NULL)
				632	return NULL;
				633	if (size == 0)
				634	goto done;
				635
				636	p = q = PyString_AS_STRING(v);
				637	while (size-- > 0) {
				638	Py_UNICODE ch = *s++;
				639	if (ch < 0x80)
				640	*p++ = (char) ch;
				641	else if (ch < 0x0800) {
				642	*p++ = 0xc0 \| (ch >> 6);
				643	*p++ = 0x80 \| (ch & 0x3f);
				644	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				645	/* These byte ranges are reserved for UTF-16 surrogate
				646	bytes which the Python implementation currently does
				647	not support. */
				648	printf("code range problem: U+%04x\n", ch);
				649	if (utf8_encoding_error(&s, &p, errors,
				650	"unsupported code range"))
				651	goto onError;
				652	} else {
				653	*p++ = 0xe0 \| (ch >> 12);
				654	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				655	*p++ = 0x80 \| (ch & 0x3f);
				656	}
				657	}
				658	*p = '\0';
				659	_PyString_Resize(&v, p - q);
				660
				661	done:
				662	return v;
				663
				664	onError:
				665	Py_DECREF(v);
				666	return NULL;
				667	}
				668
				669	/* Return a Python string holding the UTF-8 encoded value of the
				670	Unicode object.
				671
				672	The resulting string is cached in the Unicode object for subsequent
				673	usage by this function. The cached version is needed to implement
				674	the character buffer interface.
				675
				676	The refcount of the string is not incremented.
				677
				678	*/
				679
				680	static
				681	PyObject utf8_string(PyUnicodeObject self,
				682	const char *errors)
				683	{
				684	PyObject *v = self->utf8str;
				685
				686	if (v)
				687	return v;
				688	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
				689	PyUnicode_GET_SIZE(self),
				690	errors);
				691	if (v && errors == NULL)
				692	self->utf8str = v;
				693	return v;
				694	}
				695
				696	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				697	{
				698	PyObject *str;
				699
				700	if (!PyUnicode_Check(unicode)) {
				701	PyErr_BadArgument();
				702	return NULL;
				703	}
				704	str = utf8_string((PyUnicodeObject *)unicode, NULL);
				705	if (str == NULL)
				706	return NULL;
				707	Py_INCREF(str);
				708	return str;
				709	}
				710
				711	/* --- UTF-16 Codec ------------------------------------------------------- */
				712
				713	static
				714	int utf16_decoding_error(const Py_UNICODE **source,
				715	Py_UNICODE **dest,
				716	const char *errors,
				717	const char *details)
				718	{
				719	if ((errors == NULL) \|\|
				720	(strcmp(errors,"strict") == 0)) {
				721	PyErr_Format(PyExc_UnicodeError,
				722	"UTF-16 decoding error: %s",
				723	details);
				724	return -1;
				725	}
				726	else if (strcmp(errors,"ignore") == 0) {
				727	return 0;
				728	}
				729	else if (strcmp(errors,"replace") == 0) {
				730	if (dest) {
				731	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				732	(*dest)++;
				733	}
				734	return 0;
				735	}
				736	else {
				737	PyErr_Format(PyExc_ValueError,
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	738	"UTF-16 decoding error; unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	739	errors);
				740	return -1;
				741	}
				742	}
				743
				744	#define UTF16_ERROR(details) do { \
				745	if (utf16_decoding_error(&q, &p, errors, details)) \
				746	goto onError; \
				747	continue; \
				748	} while(0)
				749
				750	PyObject PyUnicode_DecodeUTF16(const char s,
				751	int size,
				752	const char *errors,
				753	int *byteorder)
				754	{
				755	PyUnicodeObject *unicode;
				756	Py_UNICODE *p;
				757	const Py_UNICODE q, e;
				758	int bo = 0;
				759
				760	/* size should be an even number */
				761	if (size % sizeof(Py_UNICODE) != 0) {
				762	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				763	return NULL;
				764	/* The remaining input chars are ignored if we fall through
				765	here... */
				766	}
				767
				768	/* Note: size will always be longer than the resulting Unicode
				769	character count */
				770	unicode = _PyUnicode_New(size);
				771	if (!unicode)
				772	return NULL;
				773	if (size == 0)
				774	return (PyObject *)unicode;
				775
				776	/* Unpack UTF-16 encoded data */
				777	p = unicode->str;
				778	q = (Py_UNICODE *)s;
				779	e = q + (size / sizeof(Py_UNICODE));
				780
				781	if (byteorder)
				782	bo = *byteorder;
				783
				784	while (q < e) {
				785	register Py_UNICODE ch = *q++;
				786
				787	/* Check for BOM marks (U+FEFF) in the input and adjust
				788	current byte order setting accordingly. Swap input
				789	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				790	!) */
				791	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				792	if (ch == 0xFEFF) {
				793	bo = -1;
				794	continue;
				795	} else if (ch == 0xFFFE) {
				796	bo = 1;
				797	continue;
				798	}
				799	if (bo == 1)
				800	ch = (ch >> 8) \| (ch << 8);
				801	#else
				802	if (ch == 0xFEFF) {
				803	bo = 1;
				804	continue;
				805	} else if (ch == 0xFFFE) {
				806	bo = -1;
				807	continue;
				808	}
				809	if (bo == -1)
				810	ch = (ch >> 8) \| (ch << 8);
				811	#endif
				812	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				813	*p++ = ch;
				814	continue;
				815	}
				816
				817	/* UTF-16 code pair: */
				818	if (q >= e)
				819	UTF16_ERROR("unexpected end of data");
				820	if (0xDC00 <= q && q <= 0xDFFF) {
				821	q++;
				822	if (0xD800 <= q && q <= 0xDBFF)
				823	/* This is valid data (a UTF-16 surrogate pair), but
				824	we are not able to store this information since our
				825	Py_UNICODE type only has 16 bits... this might
				826	change someday, even though it's unlikely. */
				827	UTF16_ERROR("code pairs are not supported");
				828	else
				829	continue;
				830	}
				831	UTF16_ERROR("illegal encoding");
				832	}
				833
				834	if (byteorder)
				835	*byteorder = bo;
				836
				837	/* Adjust length */
				838	if (_PyUnicode_Resize(unicode, p - unicode->str))
				839	goto onError;
				840
				841	return (PyObject *)unicode;
				842
				843	onError:
				844	Py_DECREF(unicode);
				845	return NULL;
				846	}
				847
				848	#undef UTF16_ERROR
				849
				850	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				851	int size,
				852	const char *errors,
				853	int byteorder)
				854	{
				855	PyObject *v;
				856	Py_UNICODE *p;
				857	char *q;
				858
				859	/* We don't create UTF-16 pairs... */
				860	v = PyString_FromStringAndSize(NULL,
				861	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				862	if (v == NULL)
				863	return NULL;
				864	if (size == 0)
				865	goto done;
				866
				867	q = PyString_AS_STRING(v);
				868	p = (Py_UNICODE *)q;
				869
				870	if (byteorder == 0)
				871	*p++ = 0xFEFF;
				872	if (byteorder == 0 \|\|
				873	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				874	byteorder == -1
				875	#else
				876	byteorder == 1
				877	#endif
				878	)
				879	memcpy(p, s, size * sizeof(Py_UNICODE));
				880	else
				881	while (size-- > 0) {
				882	Py_UNICODE ch = *s++;
				883	*p++ = (ch >> 8) \| (ch << 8);
				884	}
				885	done:
				886	return v;
				887	}
				888
				889	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				890	{
				891	if (!PyUnicode_Check(unicode)) {
				892	PyErr_BadArgument();
				893	return NULL;
				894	}
				895	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				896	PyUnicode_GET_SIZE(unicode),
				897	NULL,
				898	0);
				899	}
				900
				901	/* --- Unicode Escape Codec ----------------------------------------------- */
				902
				903	static
				904	int unicodeescape_decoding_error(const char **source,
				905	unsigned int *x,
				906	const char *errors,
				907	const char *details)
				908	{
				909	if ((errors == NULL) \|\|
				910	(strcmp(errors,"strict") == 0)) {
				911	PyErr_Format(PyExc_UnicodeError,
				912	"Unicode-Escape decoding error: %s",
				913	details);
				914	return -1;
				915	}
				916	else if (strcmp(errors,"ignore") == 0) {
				917	return 0;
				918	}
				919	else if (strcmp(errors,"replace") == 0) {
				920	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				921	return 0;
				922	}
				923	else {
				924	PyErr_Format(PyExc_ValueError,
				925	"Unicode-Escape decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	926	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	927	errors);
				928	return -1;
				929	}
				930	}
				931
				932	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				933	int size,
				934	const char *errors)
				935	{
				936	PyUnicodeObject *v;
				937	Py_UNICODE p = NULL, buf = NULL;
				938	const char *end;
				939
				940	/* Escaped strings will always be longer than the resulting
				941	Unicode string, so we start with size here and then reduce the
				942	length after conversion to the true value. */
				943	v = _PyUnicode_New(size);
				944	if (v == NULL)
				945	goto onError;
				946	if (size == 0)
				947	return (PyObject *)v;
				948	p = buf = PyUnicode_AS_UNICODE(v);
				949	end = s + size;
				950	while (s < end) {
				951	unsigned char c;
				952	unsigned int x;
				953	int i;
				954
				955	/* Non-escape characters are interpreted as Unicode ordinals */
				956	if (*s != '\\') {
				957	p++ = (unsigned char)s++;
				958	continue;
				959	}
				960
				961	/* \ - Escapes */
				962	s++;
				963	switch (*s++) {
				964
				965	/* \x escapes */
				966	case '\n': break;
				967	case '\\': *p++ = '\\'; break;
				968	case '\'': *p++ = '\''; break;
				969	case '\"': *p++ = '\"'; break;
				970	case 'b': *p++ = '\b'; break;
				971	case 'f': p++ = '\014'; break; / FF */
				972	case 't': *p++ = '\t'; break;
				973	case 'n': *p++ = '\n'; break;
				974	case 'r': *p++ = '\r'; break;
				975	case 'v': p++ = '\013'; break; / VT */
				976	case 'a': p++ = '\007'; break; / BEL, not classic C */
				977
				978	/* \OOO (octal) escapes */
				979	case '0': case '1': case '2': case '3':
				980	case '4': case '5': case '6': case '7':
				981	c = s[-1] - '0';
				982	if ('0' <= s && s <= '7') {
				983	c = (c<<3) + *s++ - '0';
				984	if ('0' <= s && s <= '7')
				985	c = (c<<3) + *s++ - '0';
				986	}
				987	*p++ = c;
				988	break;
				989
				990	/* \xXXXX escape with 0-4 hex digits */
				991	case 'x':
				992	x = 0;
				993	c = (unsigned char)*s;
				994	if (isxdigit(c)) {
				995	do {
				996	x = (x<<4) & ~0xF;
				997	if ('0' <= c && c <= '9')
				998	x += c - '0';
				999	else if ('a' <= c && c <= 'f')
				1000	x += 10 + c - 'a';
				1001	else
				1002	x += 10 + c - 'A';
				1003	c = (unsigned char)*++s;
				1004	} while (isxdigit(c));
				1005	*p++ = x;
				1006	} else {
				1007	*p++ = '\\';
				1008	*p++ = (unsigned char)s[-1];
				1009	}
				1010	break;
				1011
				1012	/* \uXXXX with 4 hex digits */
				1013	case 'u':
				1014	for (x = 0, i = 0; i < 4; i++) {
				1015	c = (unsigned char)s[i];
				1016	if (!isxdigit(c)) {
				1017	if (unicodeescape_decoding_error(&s, &x, errors,
				1018	"truncated \\uXXXX"))
				1019	goto onError;
				1020	i++;
				1021	break;
				1022	}
				1023	x = (x<<4) & ~0xF;
				1024	if (c >= '0' && c <= '9')
				1025	x += c - '0';
				1026	else if (c >= 'a' && c <= 'f')
				1027	x += 10 + c - 'a';
				1028	else
				1029	x += 10 + c - 'A';
				1030	}
				1031	s += i;
				1032	*p++ = x;
				1033	break;
				1034
				1035	default:
				1036	*p++ = '\\';
				1037	*p++ = (unsigned char)s[-1];
				1038	break;
				1039	}
				1040	}
				1041	_PyUnicode_Resize(v, (int)(p - buf));
				1042	return (PyObject *)v;
				1043
				1044	onError:
				1045	Py_XDECREF(v);
				1046	return NULL;
				1047	}
				1048
				1049	/* Return a Unicode-Escape string version of the Unicode object.
				1050
				1051	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1052	appropriate.
				1053
				1054	*/
				1055
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1056	static const Py_UNICODE findchar(const Py_UNICODE s,
				1057	int size,
				1058	Py_UNICODE ch);
				1059
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	static
				1061	PyObject unicodeescape_string(const Py_UNICODE s,
				1062	int size,
				1063	int quotes)
				1064	{
				1065	PyObject *repr;
				1066	char *p;
				1067	char *q;
				1068
				1069	static const char *hexdigit = "0123456789ABCDEF";
				1070
				1071	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1072	if (repr == NULL)
				1073	return NULL;
				1074
				1075	p = q = PyString_AS_STRING(repr);
				1076
				1077	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1078	*p++ = 'u';
				1079	*p++ = (findchar(s, size, '\'') &&
				1080	!findchar(s, size, '"')) ? '"' : '\'';
				1081	}
				1082	while (size-- > 0) {
				1083	Py_UNICODE ch = *s++;
				1084	/* Escape quotes */
				1085	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1086	*p++ = '\\';
				1087	*p++ = (char) ch;
				1088	}
				1089	/* Map 16-bit characters to '\uxxxx' */
				1090	else if (ch >= 256) {
				1091	*p++ = '\\';
				1092	*p++ = 'u';
				1093	*p++ = hexdigit[(ch >> 12) & 0xf];
				1094	*p++ = hexdigit[(ch >> 8) & 0xf];
				1095	*p++ = hexdigit[(ch >> 4) & 0xf];
				1096	*p++ = hexdigit[ch & 15];
				1097	}
				1098	/* Map non-printable US ASCII to '\ooo' */
				1099	else if (ch < ' ' \|\| ch >= 128) {
				1100	*p++ = '\\';
				1101	*p++ = hexdigit[(ch >> 6) & 7];
				1102	*p++ = hexdigit[(ch >> 3) & 7];
				1103	*p++ = hexdigit[ch & 7];
				1104	}
				1105	/* Copy everything else as-is */
				1106	else
				1107	*p++ = (char) ch;
				1108	}
				1109	if (quotes)
				1110	*p++ = q[1];
				1111
				1112	*p = '\0';
				1113	_PyString_Resize(&repr, p - q);
				1114
				1115	return repr;
				1116	}
				1117
				1118	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1119	int size)
				1120	{
				1121	return unicodeescape_string(s, size, 0);
				1122	}
				1123
				1124	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1125	{
				1126	if (!PyUnicode_Check(unicode)) {
				1127	PyErr_BadArgument();
				1128	return NULL;
				1129	}
				1130	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1131	PyUnicode_GET_SIZE(unicode));
				1132	}
				1133
				1134	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1135
				1136	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1137	int size,
				1138	const char *errors)
				1139	{
				1140	PyUnicodeObject *v;
				1141	Py_UNICODE p, buf;
				1142	const char *end;
				1143	const char *bs;
				1144
				1145	/* Escaped strings will always be longer than the resulting
				1146	Unicode string, so we start with size here and then reduce the
				1147	length after conversion to the true value. */
				1148	v = _PyUnicode_New(size);
				1149	if (v == NULL)
				1150	goto onError;
				1151	if (size == 0)
				1152	return (PyObject *)v;
				1153	p = buf = PyUnicode_AS_UNICODE(v);
				1154	end = s + size;
				1155	while (s < end) {
				1156	unsigned char c;
				1157	unsigned int x;
				1158	int i;
				1159
				1160	/* Non-escape characters are interpreted as Unicode ordinals */
				1161	if (*s != '\\') {
				1162	p++ = (unsigned char)s++;
				1163	continue;
				1164	}
				1165
				1166	/* \u-escapes are only interpreted iff the number of leading
				1167	backslashes if odd */
				1168	bs = s;
				1169	for (;s < end;) {
				1170	if (*s != '\\')
				1171	break;
				1172	p++ = (unsigned char)s++;
				1173	}
				1174	if (((s - bs) & 1) == 0 \|\|
				1175	s >= end \|\|
				1176	*s != 'u') {
				1177	continue;
				1178	}
				1179	p--;
				1180	s++;
				1181
				1182	/* \uXXXX with 4 hex digits */
				1183	for (x = 0, i = 0; i < 4; i++) {
				1184	c = (unsigned char)s[i];
				1185	if (!isxdigit(c)) {
				1186	if (unicodeescape_decoding_error(&s, &x, errors,
				1187	"truncated \\uXXXX"))
				1188	goto onError;
				1189	i++;
				1190	break;
				1191	}
				1192	x = (x<<4) & ~0xF;
				1193	if (c >= '0' && c <= '9')
				1194	x += c - '0';
				1195	else if (c >= 'a' && c <= 'f')
				1196	x += 10 + c - 'a';
				1197	else
				1198	x += 10 + c - 'A';
				1199	}
				1200	s += i;
				1201	*p++ = x;
				1202	}
				1203	_PyUnicode_Resize(v, (int)(p - buf));
				1204	return (PyObject *)v;
				1205
				1206	onError:
				1207	Py_XDECREF(v);
				1208	return NULL;
				1209	}
				1210
				1211	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1212	int size)
				1213	{
				1214	PyObject *repr;
				1215	char *p;
				1216	char *q;
				1217
				1218	static const char *hexdigit = "0123456789ABCDEF";
				1219
				1220	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1221	if (repr == NULL)
				1222	return NULL;
				1223
				1224	p = q = PyString_AS_STRING(repr);
				1225	while (size-- > 0) {
				1226	Py_UNICODE ch = *s++;
				1227	/* Map 16-bit characters to '\uxxxx' */
				1228	if (ch >= 256) {
				1229	*p++ = '\\';
				1230	*p++ = 'u';
				1231	*p++ = hexdigit[(ch >> 12) & 0xf];
				1232	*p++ = hexdigit[(ch >> 8) & 0xf];
				1233	*p++ = hexdigit[(ch >> 4) & 0xf];
				1234	*p++ = hexdigit[ch & 15];
				1235	}
				1236	/* Copy everything else as-is */
				1237	else
				1238	*p++ = (char) ch;
				1239	}
				1240	*p = '\0';
				1241	_PyString_Resize(&repr, p - q);
				1242
				1243	return repr;
				1244	}
				1245
				1246	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1247	{
				1248	if (!PyUnicode_Check(unicode)) {
				1249	PyErr_BadArgument();
				1250	return NULL;
				1251	}
				1252	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1253	PyUnicode_GET_SIZE(unicode));
				1254	}
				1255
				1256	/* --- Latin-1 Codec ------------------------------------------------------ */
				1257
				1258	PyObject PyUnicode_DecodeLatin1(const char s,
				1259	int size,
				1260	const char *errors)
				1261	{
				1262	PyUnicodeObject *v;
				1263	Py_UNICODE *p;
				1264
				1265	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1266	v = _PyUnicode_New(size);
				1267	if (v == NULL)
				1268	goto onError;
				1269	if (size == 0)
				1270	return (PyObject *)v;
				1271	p = PyUnicode_AS_UNICODE(v);
				1272	while (size-- > 0)
				1273	p++ = (unsigned char)s++;
				1274	return (PyObject *)v;
				1275
				1276	onError:
				1277	Py_XDECREF(v);
				1278	return NULL;
				1279	}
				1280
				1281	static
				1282	int latin1_encoding_error(const Py_UNICODE **source,
				1283	char **dest,
				1284	const char *errors,
				1285	const char *details)
				1286	{
				1287	if ((errors == NULL) \|\|
				1288	(strcmp(errors,"strict") == 0)) {
				1289	PyErr_Format(PyExc_UnicodeError,
				1290	"Latin-1 encoding error: %s",
				1291	details);
				1292	return -1;
				1293	}
				1294	else if (strcmp(errors,"ignore") == 0) {
				1295	return 0;
				1296	}
				1297	else if (strcmp(errors,"replace") == 0) {
				1298	**dest = '?';
				1299	return 0;
				1300	}
				1301	else {
				1302	PyErr_Format(PyExc_ValueError,
				1303	"Latin-1 encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1304	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1305	errors);
				1306	return -1;
				1307	}
				1308	}
				1309
				1310	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1311	int size,
				1312	const char *errors)
				1313	{
				1314	PyObject *repr;
				1315	char *s;
				1316	repr = PyString_FromStringAndSize(NULL, size);
				1317	if (repr == NULL)
				1318	return NULL;
				1319
				1320	s = PyString_AS_STRING(repr);
				1321	while (size-- > 0) {
				1322	Py_UNICODE ch = *p++;
				1323	if (ch >= 256) {
				1324	if (latin1_encoding_error(&p, &s, errors,
				1325	"ordinal not in range(256)"))
				1326	goto onError;
				1327	}
				1328	else
				1329	*s++ = (char)ch;
				1330	}
				1331	return repr;
				1332
				1333	onError:
				1334	Py_DECREF(repr);
				1335	return NULL;
				1336	}
				1337
				1338	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1339	{
				1340	if (!PyUnicode_Check(unicode)) {
				1341	PyErr_BadArgument();
				1342	return NULL;
				1343	}
				1344	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1345	PyUnicode_GET_SIZE(unicode),
				1346	NULL);
				1347	}
				1348
				1349	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1350
				1351	static
				1352	int ascii_decoding_error(const char **source,
				1353	Py_UNICODE **dest,
				1354	const char *errors,
				1355	const char *details)
				1356	{
				1357	if ((errors == NULL) \|\|
				1358	(strcmp(errors,"strict") == 0)) {
				1359	PyErr_Format(PyExc_UnicodeError,
				1360	"ASCII decoding error: %s",
				1361	details);
				1362	return -1;
				1363	}
				1364	else if (strcmp(errors,"ignore") == 0) {
				1365	return 0;
				1366	}
				1367	else if (strcmp(errors,"replace") == 0) {
				1368	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1369	(*dest)++;
				1370	return 0;
				1371	}
				1372	else {
				1373	PyErr_Format(PyExc_ValueError,
				1374	"ASCII decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1375	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1376	errors);
				1377	return -1;
				1378	}
				1379	}
				1380
				1381	PyObject PyUnicode_DecodeASCII(const char s,
				1382	int size,
				1383	const char *errors)
				1384	{
				1385	PyUnicodeObject *v;
				1386	Py_UNICODE *p;
				1387
				1388	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1389	v = _PyUnicode_New(size);
				1390	if (v == NULL)
				1391	goto onError;
				1392	if (size == 0)
				1393	return (PyObject *)v;
				1394	p = PyUnicode_AS_UNICODE(v);
				1395	while (size-- > 0) {
				1396	register unsigned char c;
				1397
				1398	c = (unsigned char)*s++;
				1399	if (c < 128)
				1400	*p++ = c;
				1401	else if (ascii_decoding_error(&s, &p, errors,
				1402	"ordinal not in range(128)"))
				1403	goto onError;
				1404	}
				1405	if (p - PyUnicode_AS_UNICODE(v) < size)
				1406	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1407	return (PyObject *)v;
				1408
				1409	onError:
				1410	Py_XDECREF(v);
				1411	return NULL;
				1412	}
				1413
				1414	static
				1415	int ascii_encoding_error(const Py_UNICODE **source,
				1416	char **dest,
				1417	const char *errors,
				1418	const char *details)
				1419	{
				1420	if ((errors == NULL) \|\|
				1421	(strcmp(errors,"strict") == 0)) {
				1422	PyErr_Format(PyExc_UnicodeError,
				1423	"ASCII encoding error: %s",
				1424	details);
				1425	return -1;
				1426	}
				1427	else if (strcmp(errors,"ignore") == 0) {
				1428	return 0;
				1429	}
				1430	else if (strcmp(errors,"replace") == 0) {
				1431	**dest = '?';
				1432	return 0;
				1433	}
				1434	else {
				1435	PyErr_Format(PyExc_ValueError,
				1436	"ASCII encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1437	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1438	errors);
				1439	return -1;
				1440	}
				1441	}
				1442
				1443	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1444	int size,
				1445	const char *errors)
				1446	{
				1447	PyObject *repr;
				1448	char *s;
				1449	repr = PyString_FromStringAndSize(NULL, size);
				1450	if (repr == NULL)
				1451	return NULL;
				1452
				1453	s = PyString_AS_STRING(repr);
				1454	while (size-- > 0) {
				1455	Py_UNICODE ch = *p++;
				1456	if (ch >= 128) {
				1457	if (ascii_encoding_error(&p, &s, errors,
				1458	"ordinal not in range(128)"))
				1459	goto onError;
				1460	}
				1461	else
				1462	*s++ = (char)ch;
				1463	}
				1464	return repr;
				1465
				1466	onError:
				1467	Py_DECREF(repr);
				1468	return NULL;
				1469	}
				1470
				1471	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1472	{
				1473	if (!PyUnicode_Check(unicode)) {
				1474	PyErr_BadArgument();
				1475	return NULL;
				1476	}
				1477	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1478	PyUnicode_GET_SIZE(unicode),
				1479	NULL);
				1480	}
				1481
				1482	/* --- Character Mapping Codec -------------------------------------------- */
				1483
				1484	static
				1485	int charmap_decoding_error(const char **source,
				1486	Py_UNICODE **dest,
				1487	const char *errors,
				1488	const char *details)
				1489	{
				1490	if ((errors == NULL) \|\|
				1491	(strcmp(errors,"strict") == 0)) {
				1492	PyErr_Format(PyExc_UnicodeError,
				1493	"charmap decoding error: %s",
				1494	details);
				1495	return -1;
				1496	}
				1497	else if (strcmp(errors,"ignore") == 0) {
				1498	return 0;
				1499	}
				1500	else if (strcmp(errors,"replace") == 0) {
				1501	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1502	(*dest)++;
				1503	return 0;
				1504	}
				1505	else {
				1506	PyErr_Format(PyExc_ValueError,
				1507	"charmap decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1508	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1509	errors);
				1510	return -1;
				1511	}
				1512	}
				1513
				1514	PyObject PyUnicode_DecodeCharmap(const char s,
				1515	int size,
				1516	PyObject *mapping,
				1517	const char *errors)
				1518	{
				1519	PyUnicodeObject *v;
				1520	Py_UNICODE *p;
				1521
				1522	/* Default to Latin-1 */
				1523	if (mapping == NULL)
				1524	return PyUnicode_DecodeLatin1(s, size, errors);
				1525
				1526	v = _PyUnicode_New(size);
				1527	if (v == NULL)
				1528	goto onError;
				1529	if (size == 0)
				1530	return (PyObject *)v;
				1531	p = PyUnicode_AS_UNICODE(v);
				1532	while (size-- > 0) {
				1533	unsigned char ch = *s++;
				1534	PyObject w, x;
				1535
				1536	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1537	w = PyInt_FromLong((long)ch);
				1538	if (w == NULL)
				1539	goto onError;
				1540	x = PyObject_GetItem(mapping, w);
				1541	Py_DECREF(w);
				1542	if (x == NULL) {
				1543	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1544	/* No mapping found: default to Latin-1 mapping */
				1545	PyErr_Clear();
				1546	*p++ = (Py_UNICODE)ch;
				1547	continue;
				1548	}
				1549	goto onError;
				1550	}
				1551
				1552	/* Apply mapping */
				1553	if (PyInt_Check(x)) {
				1554	int value = PyInt_AS_LONG(x);
				1555	if (value < 0 \|\| value > 65535) {
				1556	PyErr_SetString(PyExc_TypeError,
				1557	"character mapping must be in range(65336)");
				1558	Py_DECREF(x);
				1559	goto onError;
				1560	}
				1561	*p++ = (Py_UNICODE)value;
				1562	}
				1563	else if (x == Py_None) {
				1564	/* undefined mapping */
				1565	if (charmap_decoding_error(&s, &p, errors,
				1566	"character maps to <undefined>")) {
				1567	Py_DECREF(x);
				1568	goto onError;
				1569	}
				1570	}
				1571	else if (PyUnicode_Check(x)) {
				1572	if (PyUnicode_GET_SIZE(x) != 1) {
				1573	/* 1-n mapping */
				1574	PyErr_SetString(PyExc_NotImplementedError,
				1575	"1-n mappings are currently not implemented");
				1576	Py_DECREF(x);
				1577	goto onError;
				1578	}
				1579	p++ = PyUnicode_AS_UNICODE(x);
				1580	}
				1581	else {
				1582	/* wrong return value */
				1583	PyErr_SetString(PyExc_TypeError,
				1584	"character mapping must return integer, None or unicode");
				1585	Py_DECREF(x);
				1586	goto onError;
				1587	}
				1588	Py_DECREF(x);
				1589	}
				1590	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1591	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1592	goto onError;
				1593	return (PyObject *)v;
				1594
				1595	onError:
				1596	Py_XDECREF(v);
				1597	return NULL;
				1598	}
				1599
				1600	static
				1601	int charmap_encoding_error(const Py_UNICODE **source,
				1602	char **dest,
				1603	const char *errors,
				1604	const char *details)
				1605	{
				1606	if ((errors == NULL) \|\|
				1607	(strcmp(errors,"strict") == 0)) {
				1608	PyErr_Format(PyExc_UnicodeError,
				1609	"charmap encoding error: %s",
				1610	details);
				1611	return -1;
				1612	}
				1613	else if (strcmp(errors,"ignore") == 0) {
				1614	return 0;
				1615	}
				1616	else if (strcmp(errors,"replace") == 0) {
				1617	**dest = '?';
				1618	(*dest)++;
				1619	return 0;
				1620	}
				1621	else {
				1622	PyErr_Format(PyExc_ValueError,
				1623	"charmap encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1624	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1625	errors);
				1626	return -1;
				1627	}
				1628	}
				1629
				1630	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1631	int size,
				1632	PyObject *mapping,
				1633	const char *errors)
				1634	{
				1635	PyObject *v;
				1636	char *s;
				1637
				1638	/* Default to Latin-1 */
				1639	if (mapping == NULL)
				1640	return PyUnicode_EncodeLatin1(p, size, errors);
				1641
				1642	v = PyString_FromStringAndSize(NULL, size);
				1643	if (v == NULL)
				1644	return NULL;
				1645	s = PyString_AS_STRING(v);
				1646	while (size-- > 0) {
				1647	Py_UNICODE ch = *p++;
				1648	PyObject w, x;
				1649
				1650	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1651	w = PyInt_FromLong((long)ch);
				1652	if (w == NULL)
				1653	goto onError;
				1654	x = PyObject_GetItem(mapping, w);
				1655	Py_DECREF(w);
				1656	if (x == NULL) {
				1657	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1658	/* No mapping found: default to Latin-1 mapping if possible */
				1659	PyErr_Clear();
				1660	if (ch < 256) {
				1661	*s++ = (char)ch;
				1662	continue;
				1663	}
				1664	else if (!charmap_encoding_error(&p, &s, errors,
				1665	"missing character mapping"))
				1666	continue;
				1667	}
				1668	goto onError;
				1669	}
				1670
				1671	/* Apply mapping */
				1672	if (PyInt_Check(x)) {
				1673	int value = PyInt_AS_LONG(x);
				1674	if (value < 0 \|\| value > 255) {
				1675	PyErr_SetString(PyExc_TypeError,
				1676	"character mapping must be in range(256)");
				1677	Py_DECREF(x);
				1678	goto onError;
				1679	}
				1680	*s++ = (char)value;
				1681	}
				1682	else if (x == Py_None) {
				1683	/* undefined mapping */
				1684	if (charmap_encoding_error(&p, &s, errors,
				1685	"character maps to <undefined>")) {
				1686	Py_DECREF(x);
				1687	goto onError;
				1688	}
				1689	}
				1690	else if (PyString_Check(x)) {
				1691	if (PyString_GET_SIZE(x) != 1) {
				1692	/* 1-n mapping */
				1693	PyErr_SetString(PyExc_NotImplementedError,
				1694	"1-n mappings are currently not implemented");
				1695	Py_DECREF(x);
				1696	goto onError;
				1697	}
				1698	s++ = PyString_AS_STRING(x);
				1699	}
				1700	else {
				1701	/* wrong return value */
				1702	PyErr_SetString(PyExc_TypeError,
				1703	"character mapping must return integer, None or unicode");
				1704	Py_DECREF(x);
				1705	goto onError;
				1706	}
				1707	Py_DECREF(x);
				1708	}
				1709	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1710	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1711	goto onError;
				1712	return v;
				1713
				1714	onError:
				1715	Py_DECREF(v);
				1716	return NULL;
				1717	}
				1718
				1719	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1720	PyObject *mapping)
				1721	{
				1722	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1723	PyErr_BadArgument();
				1724	return NULL;
				1725	}
				1726	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1727	PyUnicode_GET_SIZE(unicode),
				1728	mapping,
				1729	NULL);
				1730	}
				1731
				1732	static
				1733	int translate_error(const Py_UNICODE **source,
				1734	Py_UNICODE **dest,
				1735	const char *errors,
				1736	const char *details)
				1737	{
				1738	if ((errors == NULL) \|\|
				1739	(strcmp(errors,"strict") == 0)) {
				1740	PyErr_Format(PyExc_UnicodeError,
				1741	"translate error: %s",
				1742	details);
				1743	return -1;
				1744	}
				1745	else if (strcmp(errors,"ignore") == 0) {
				1746	return 0;
				1747	}
				1748	else if (strcmp(errors,"replace") == 0) {
				1749	**dest = '?';
				1750	(*dest)++;
				1751	return 0;
				1752	}
				1753	else {
				1754	PyErr_Format(PyExc_ValueError,
				1755	"translate error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1756	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1757	errors);
				1758	return -1;
				1759	}
				1760	}
				1761
				1762	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1763	int size,
				1764	PyObject *mapping,
				1765	const char *errors)
				1766	{
				1767	PyUnicodeObject *v;
				1768	Py_UNICODE *p;
				1769
				1770	if (mapping == NULL) {
				1771	PyErr_BadArgument();
				1772	return NULL;
				1773	}
				1774
				1775	/* Output will never be longer than input */
				1776	v = _PyUnicode_New(size);
				1777	if (v == NULL)
				1778	goto onError;
				1779	if (size == 0)
				1780	goto done;
				1781	p = PyUnicode_AS_UNICODE(v);
				1782	while (size-- > 0) {
				1783	Py_UNICODE ch = *s++;
				1784	PyObject w, x;
				1785
				1786	/* Get mapping */
				1787	w = PyInt_FromLong(ch);
				1788	if (w == NULL)
				1789	goto onError;
				1790	x = PyObject_GetItem(mapping, w);
				1791	Py_DECREF(w);
				1792	if (x == NULL) {
				1793	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1794	/* No mapping found: default to 1-1 mapping */
				1795	PyErr_Clear();
				1796	*p++ = ch;
				1797	continue;
				1798	}
				1799	goto onError;
				1800	}
				1801
				1802	/* Apply mapping */
				1803	if (PyInt_Check(x))
				1804	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1805	else if (x == Py_None) {
				1806	/* undefined mapping */
				1807	if (translate_error(&s, &p, errors,
				1808	"character maps to <undefined>")) {
				1809	Py_DECREF(x);
				1810	goto onError;
				1811	}
				1812	}
				1813	else if (PyUnicode_Check(x)) {
				1814	if (PyUnicode_GET_SIZE(x) != 1) {
				1815	/* 1-n mapping */
				1816	PyErr_SetString(PyExc_NotImplementedError,
				1817	"1-n mappings are currently not implemented");
				1818	Py_DECREF(x);
				1819	goto onError;
				1820	}
				1821	p++ = PyUnicode_AS_UNICODE(x);
				1822	}
				1823	else {
				1824	/* wrong return value */
				1825	PyErr_SetString(PyExc_TypeError,
				1826	"translate mapping must return integer, None or unicode");
				1827	Py_DECREF(x);
				1828	goto onError;
				1829	}
				1830	Py_DECREF(x);
				1831	}
				1832	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1833	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1834
				1835	done:
				1836	return (PyObject *)v;
				1837
				1838	onError:
				1839	Py_XDECREF(v);
				1840	return NULL;
				1841	}
				1842
				1843	PyObject PyUnicode_Translate(PyObject str,
				1844	PyObject *mapping,
				1845	const char *errors)
				1846	{
				1847	PyObject *result;
				1848
				1849	str = PyUnicode_FromObject(str);
				1850	if (str == NULL)
				1851	goto onError;
				1852	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				1853	PyUnicode_GET_SIZE(str),
				1854	mapping,
				1855	errors);
				1856	Py_DECREF(str);
				1857	return result;
				1858
				1859	onError:
				1860	Py_XDECREF(str);
				1861	return NULL;
				1862	}
				1863
				1864	/* --- Helpers ------------------------------------------------------------ */
				1865
				1866	static
				1867	int count(PyUnicodeObject *self,
				1868	int start,
				1869	int end,
				1870	PyUnicodeObject *substring)
				1871	{
				1872	int count = 0;
				1873
				1874	end -= substring->length;
				1875
				1876	while (start <= end)
				1877	if (Py_UNICODE_MATCH(self, start, substring)) {
				1878	count++;
				1879	start += substring->length;
				1880	} else
				1881	start++;
				1882
				1883	return count;
				1884	}
				1885
				1886	int PyUnicode_Count(PyObject *str,
				1887	PyObject *substr,
				1888	int start,
				1889	int end)
				1890	{
				1891	int result;
				1892
				1893	str = PyUnicode_FromObject(str);
				1894	if (str == NULL)
				1895	return -1;
				1896	substr = PyUnicode_FromObject(substr);
				1897	if (substr == NULL) {
				1898	Py_DECREF(substr);
				1899	return -1;
				1900	}
				1901
				1902	result = count((PyUnicodeObject *)str,
				1903	start, end,
				1904	(PyUnicodeObject *)substr);
				1905
				1906	Py_DECREF(str);
				1907	Py_DECREF(substr);
				1908	return result;
				1909	}
				1910
				1911	static
				1912	int findstring(PyUnicodeObject *self,
				1913	PyUnicodeObject *substring,
				1914	int start,
				1915	int end,
				1916	int direction)
				1917	{
				1918	if (start < 0)
				1919	start += self->length;
				1920	if (start < 0)
				1921	start = 0;
				1922
				1923	if (substring->length == 0)
				1924	return start;
				1925
				1926	if (end > self->length)
				1927	end = self->length;
				1928	if (end < 0)
				1929	end += self->length;
				1930	if (end < 0)
				1931	end = 0;
				1932
				1933	end -= substring->length;
				1934
				1935	if (direction < 0) {
				1936	for (; end >= start; end--)
				1937	if (Py_UNICODE_MATCH(self, end, substring))
				1938	return end;
				1939	} else {
				1940	for (; start <= end; start++)
				1941	if (Py_UNICODE_MATCH(self, start, substring))
				1942	return start;
				1943	}
				1944
				1945	return -1;
				1946	}
				1947
				1948	int PyUnicode_Find(PyObject *str,
				1949	PyObject *substr,
				1950	int start,
				1951	int end,
				1952	int direction)
				1953	{
				1954	int result;
				1955
				1956	str = PyUnicode_FromObject(str);
				1957	if (str == NULL)
				1958	return -1;
				1959	substr = PyUnicode_FromObject(substr);
				1960	if (substr == NULL) {
				1961	Py_DECREF(substr);
				1962	return -1;
				1963	}
				1964
				1965	result = findstring((PyUnicodeObject *)str,
				1966	(PyUnicodeObject *)substr,
				1967	start, end, direction);
				1968	Py_DECREF(str);
				1969	Py_DECREF(substr);
				1970	return result;
				1971	}
				1972
				1973	static
				1974	int tailmatch(PyUnicodeObject *self,
				1975	PyUnicodeObject *substring,
				1976	int start,
				1977	int end,
				1978	int direction)
				1979	{
				1980	if (start < 0)
				1981	start += self->length;
				1982	if (start < 0)
				1983	start = 0;
				1984
				1985	if (substring->length == 0)
				1986	return 1;
				1987
				1988	if (end > self->length)
				1989	end = self->length;
				1990	if (end < 0)
				1991	end += self->length;
				1992	if (end < 0)
				1993	end = 0;
				1994
				1995	end -= substring->length;
				1996	if (end < start)
				1997	return 0;
				1998
				1999	if (direction > 0) {
				2000	if (Py_UNICODE_MATCH(self, end, substring))
				2001	return 1;
				2002	} else {
				2003	if (Py_UNICODE_MATCH(self, start, substring))
				2004	return 1;
				2005	}
				2006
				2007	return 0;
				2008	}
				2009
				2010	int PyUnicode_Tailmatch(PyObject *str,
				2011	PyObject *substr,
				2012	int start,
				2013	int end,
				2014	int direction)
				2015	{
				2016	int result;
				2017
				2018	str = PyUnicode_FromObject(str);
				2019	if (str == NULL)
				2020	return -1;
				2021	substr = PyUnicode_FromObject(substr);
				2022	if (substr == NULL) {
				2023	Py_DECREF(substr);
				2024	return -1;
				2025	}
				2026
				2027	result = tailmatch((PyUnicodeObject *)str,
				2028	(PyUnicodeObject *)substr,
				2029	start, end, direction);
				2030	Py_DECREF(str);
				2031	Py_DECREF(substr);
				2032	return result;
				2033	}
				2034
				2035	static
				2036	const Py_UNICODE findchar(const Py_UNICODE s,
				2037	int size,
				2038	Py_UNICODE ch)
				2039	{
				2040	/* like wcschr, but doesn't stop at NULL characters */
				2041
				2042	while (size-- > 0) {
				2043	if (*s == ch)
				2044	return s;
				2045	s++;
				2046	}
				2047
				2048	return NULL;
				2049	}
				2050
				2051	/* Apply fixfct filter to the Unicode object self and return a
				2052	reference to the modified object */
				2053
				2054	static
				2055	PyObject fixup(PyUnicodeObject self,
				2056	int (fixfct)(PyUnicodeObject s))
				2057	{
				2058
				2059	PyUnicodeObject *u;
				2060
				2061	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2062	self->length);
				2063	if (u == NULL)
				2064	return NULL;
				2065	if (!fixfct(u)) {
				2066	/* fixfct should return TRUE if it modified the buffer. If
				2067	FALSE, return a reference to the original buffer instead
				2068	(to save space, not time) */
				2069	Py_INCREF(self);
				2070	Py_DECREF(u);
				2071	return (PyObject*) self;
				2072	}
				2073	return (PyObject*) u;
				2074	}
				2075
				2076	static
				2077	int fixupper(PyUnicodeObject *self)
				2078	{
				2079	int len = self->length;
				2080	Py_UNICODE *s = self->str;
				2081	int status = 0;
				2082
				2083	while (len-- > 0) {
				2084	register Py_UNICODE ch;
				2085
				2086	ch = Py_UNICODE_TOUPPER(*s);
				2087	if (ch != *s) {
				2088	status = 1;
				2089	*s = ch;
				2090	}
				2091	s++;
				2092	}
				2093
				2094	return status;
				2095	}
				2096
				2097	static
				2098	int fixlower(PyUnicodeObject *self)
				2099	{
				2100	int len = self->length;
				2101	Py_UNICODE *s = self->str;
				2102	int status = 0;
				2103
				2104	while (len-- > 0) {
				2105	register Py_UNICODE ch;
				2106
				2107	ch = Py_UNICODE_TOLOWER(*s);
				2108	if (ch != *s) {
				2109	status = 1;
				2110	*s = ch;
				2111	}
				2112	s++;
				2113	}
				2114
				2115	return status;
				2116	}
				2117
				2118	static
				2119	int fixswapcase(PyUnicodeObject *self)
				2120	{
				2121	int len = self->length;
				2122	Py_UNICODE *s = self->str;
				2123	int status = 0;
				2124
				2125	while (len-- > 0) {
				2126	if (Py_UNICODE_ISUPPER(*s)) {
				2127	s = Py_UNICODE_TOLOWER(s);
				2128	status = 1;
				2129	} else if (Py_UNICODE_ISLOWER(*s)) {
				2130	s = Py_UNICODE_TOUPPER(s);
				2131	status = 1;
				2132	}
				2133	s++;
				2134	}
				2135
				2136	return status;
				2137	}
				2138
				2139	static
				2140	int fixcapitalize(PyUnicodeObject *self)
				2141	{
				2142	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2143	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2144	return 1;
				2145	}
				2146	return 0;
				2147	}
				2148
				2149	static
				2150	int fixtitle(PyUnicodeObject *self)
				2151	{
				2152	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2153	register Py_UNICODE *e;
				2154	int previous_is_cased;
				2155
				2156	/* Shortcut for single character strings */
				2157	if (PyUnicode_GET_SIZE(self) == 1) {
				2158	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2159	if (*p != ch) {
				2160	*p = ch;
				2161	return 1;
				2162	}
				2163	else
				2164	return 0;
				2165	}
				2166
				2167	e = p + PyUnicode_GET_SIZE(self);
				2168	previous_is_cased = 0;
				2169	for (; p < e; p++) {
				2170	register const Py_UNICODE ch = *p;
				2171
				2172	if (previous_is_cased)
				2173	*p = Py_UNICODE_TOLOWER(ch);
				2174	else
				2175	*p = Py_UNICODE_TOTITLE(ch);
				2176
				2177	if (Py_UNICODE_ISLOWER(ch) \|\|
				2178	Py_UNICODE_ISUPPER(ch) \|\|
				2179	Py_UNICODE_ISTITLE(ch))
				2180	previous_is_cased = 1;
				2181	else
				2182	previous_is_cased = 0;
				2183	}
				2184	return 1;
				2185	}
				2186
				2187	PyObject PyUnicode_Join(PyObject separator,
				2188	PyObject *seq)
				2189	{
				2190	Py_UNICODE *sep;
				2191	int seplen;
				2192	PyUnicodeObject *res = NULL;
				2193	int reslen = 0;
				2194	Py_UNICODE *p;
				2195	int seqlen = 0;
				2196	int sz = 100;
				2197	int i;
				2198
				2199	seqlen = PySequence_Length(seq);
				2200	if (seqlen < 0 && PyErr_Occurred())
				2201	return NULL;
				2202
				2203	if (separator == NULL) {
				2204	Py_UNICODE blank = ' ';
				2205	sep = &blank;
				2206	seplen = 1;
				2207	}
				2208	else {
				2209	separator = PyUnicode_FromObject(separator);
				2210	if (separator == NULL)
				2211	return NULL;
				2212	sep = PyUnicode_AS_UNICODE(separator);
				2213	seplen = PyUnicode_GET_SIZE(separator);
				2214	}
				2215
				2216	res = _PyUnicode_New(sz);
				2217	if (res == NULL)
				2218	goto onError;
				2219	p = PyUnicode_AS_UNICODE(res);
				2220	reslen = 0;
				2221
				2222	for (i = 0; i < seqlen; i++) {
				2223	int itemlen;
				2224	PyObject *item;
				2225
				2226	item = PySequence_GetItem(seq, i);
				2227	if (item == NULL)
				2228	goto onError;
				2229	if (!PyUnicode_Check(item)) {
				2230	PyObject *v;
				2231	v = PyUnicode_FromObject(item);
				2232	Py_DECREF(item);
				2233	item = v;
				2234	if (item == NULL)
				2235	goto onError;
				2236	}
				2237	itemlen = PyUnicode_GET_SIZE(item);
				2238	while (reslen + itemlen + seplen >= sz) {
				2239	if (_PyUnicode_Resize(res, sz*2))
				2240	goto onError;
				2241	sz *= 2;
				2242	p = PyUnicode_AS_UNICODE(res) + reslen;
				2243	}
				2244	if (i > 0) {
				2245	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2246	p += seplen;
				2247	reslen += seplen;
				2248	}
				2249	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2250	p += itemlen;
				2251	reslen += itemlen;
				2252	Py_DECREF(item);
				2253	}
				2254	if (_PyUnicode_Resize(res, reslen))
				2255	goto onError;
				2256
				2257	Py_XDECREF(separator);
				2258	return (PyObject *)res;
				2259
				2260	onError:
				2261	Py_XDECREF(separator);
				2262	Py_DECREF(res);
				2263	return NULL;
				2264	}
				2265
				2266	static
				2267	PyUnicodeObject pad(PyUnicodeObject self,
				2268	int left,
				2269	int right,
				2270	Py_UNICODE fill)
				2271	{
				2272	PyUnicodeObject *u;
				2273
				2274	if (left < 0)
				2275	left = 0;
				2276	if (right < 0)
				2277	right = 0;
				2278
				2279	if (left == 0 && right == 0) {
				2280	Py_INCREF(self);
				2281	return self;
				2282	}
				2283
				2284	u = _PyUnicode_New(left + self->length + right);
				2285	if (u) {
				2286	if (left)
				2287	Py_UNICODE_FILL(u->str, fill, left);
				2288	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2289	if (right)
				2290	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2291	}
				2292
				2293	return u;
				2294	}
				2295
				/* Append the slice data[left:right] as a new Unicode object to `list`.

				   Relies on the expanding function providing a `PyObject *str` scratch
				   variable, a `list` variable and an `onError` label, which is jumped to
				   when the slice cannot be created or appended.

				   Wrapped in do { } while (0) so the macro behaves as one statement,
				   and arguments are parenthesized so expressions can be passed. */
				#define SPLIT_APPEND(data, left, right) \
				    do { \
				        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
				        if (!str) \
				            goto onError; \
				        if (PyList_Append(list, str)) { \
				            Py_DECREF(str); \
				            goto onError; \
				        } \
				        Py_DECREF(str); \
				    } while (0)
				2306
				2307	static
				2308	PyObject split_whitespace(PyUnicodeObject self,
				2309	PyObject *list,
				2310	int maxcount)
				2311	{
				2312	register int i;
				2313	register int j;
				2314	int len = self->length;
				2315	PyObject *str;
				2316
				2317	for (i = j = 0; i < len; ) {
				2318	/* find a token */
				2319	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2320	i++;
				2321	j = i;
				2322	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2323	i++;
				2324	if (j < i) {
				2325	if (maxcount-- <= 0)
				2326	break;
				2327	SPLIT_APPEND(self->str, j, i);
				2328	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2329	i++;
				2330	j = i;
				2331	}
				2332	}
				2333	if (j < len) {
				2334	SPLIT_APPEND(self->str, j, len);
				2335	}
				2336	return list;
				2337
				2338	onError:
				2339	Py_DECREF(list);
				2340	return NULL;
				2341	}
				2342
				2343	PyObject PyUnicode_Splitlines(PyObject string,
				2344	int maxcount)
				2345	{
				2346	register int i;
				2347	register int j;
				2348	int len;
				2349	PyObject *list;
				2350	PyObject *str;
				2351	Py_UNICODE *data;
				2352
				2353	string = PyUnicode_FromObject(string);
				2354	if (string == NULL)
				2355	return NULL;
				2356	data = PyUnicode_AS_UNICODE(string);
				2357	len = PyUnicode_GET_SIZE(string);
				2358
				2359	if (maxcount < 0)
				2360	maxcount = INT_MAX;
				2361
				2362	list = PyList_New(0);
				2363	if (!list)
				2364	goto onError;
				2365
				2366	for (i = j = 0; i < len; ) {
				2367	/* Find a line and append it */
				2368	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2369	i++;
				2370	if (maxcount-- <= 0)
				2371	break;
				2372	SPLIT_APPEND(data, j, i);
				2373
				2374	/* Skip the line break reading CRLF as one line break */
				2375	if (i < len) {
				2376	if (data[i] == '\r' && i + 1 < len &&
				2377	data[i+1] == '\n')
				2378	i += 2;
				2379	else
				2380	i++;
				2381	}
				2382	j = i;
				2383	}
				2384	if (j < len) {
				2385	SPLIT_APPEND(data, j, len);
				2386	}
				2387
				2388	Py_DECREF(string);
				2389	return list;
				2390
				2391	onError:
				2392	Py_DECREF(list);
				2393	Py_DECREF(string);
				2394	return NULL;
				2395	}
				2396
				2397	static
				2398	PyObject split_char(PyUnicodeObject self,
				2399	PyObject *list,
				2400	Py_UNICODE ch,
				2401	int maxcount)
				2402	{
				2403	register int i;
				2404	register int j;
				2405	int len = self->length;
				2406	PyObject *str;
				2407
				2408	for (i = j = 0; i < len; ) {
				2409	if (self->str[i] == ch) {
				2410	if (maxcount-- <= 0)
				2411	break;
				2412	SPLIT_APPEND(self->str, j, i);
				2413	i = j = i + 1;
				2414	} else
				2415	i++;
				2416	}
				2417	if (j <= len) {
				2418	SPLIT_APPEND(self->str, j, len);
				2419	}
				2420	return list;
				2421
				2422	onError:
				2423	Py_DECREF(list);
				2424	return NULL;
				2425	}
				2426
				2427	static
				2428	PyObject split_substring(PyUnicodeObject self,
				2429	PyObject *list,
				2430	PyUnicodeObject *substring,
				2431	int maxcount)
				2432	{
				2433	register int i;
				2434	register int j;
				2435	int len = self->length;
				2436	int sublen = substring->length;
				2437	PyObject *str;
				2438
				2439	for (i = j = 0; i < len - sublen; ) {
				2440	if (Py_UNICODE_MATCH(self, i, substring)) {
				2441	if (maxcount-- <= 0)
				2442	break;
				2443	SPLIT_APPEND(self->str, j, i);
				2444	i = j = i + sublen;
				2445	} else
				2446	i++;
				2447	}
				2448	if (j <= len) {
				2449	SPLIT_APPEND(self->str, j, len);
				2450	}
				2451	return list;
				2452
				2453	onError:
				2454	Py_DECREF(list);
				2455	return NULL;
				2456	}
				2457
				2458	#undef SPLIT_APPEND
				2459
				2460	static
				2461	PyObject split(PyUnicodeObject self,
				2462	PyUnicodeObject *substring,
				2463	int maxcount)
				2464	{
				2465	PyObject *list;
				2466
				2467	if (maxcount < 0)
				2468	maxcount = INT_MAX;
				2469
				2470	list = PyList_New(0);
				2471	if (!list)
				2472	return NULL;
				2473
				2474	if (substring == NULL)
				2475	return split_whitespace(self,list,maxcount);
				2476
				2477	else if (substring->length == 1)
				2478	return split_char(self,list,substring->str[0],maxcount);
				2479
				2480	else if (substring->length == 0) {
				2481	Py_DECREF(list);
				2482	PyErr_SetString(PyExc_ValueError, "empty separator");
				2483	return NULL;
				2484	}
				2485	else
				2486	return split_substring(self,list,substring,maxcount);
				2487	}
				2488
				2489	static
				2490	PyObject strip(PyUnicodeObject self,
				2491	int left,
				2492	int right)
				2493	{
				2494	Py_UNICODE *p = self->str;
				2495	int start = 0;
				2496	int end = self->length;
				2497
				2498	if (left)
				2499	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2500	start++;
				2501
				2502	if (right)
				2503	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2504	end--;
				2505
				2506	if (start == 0 && end == self->length) {
				2507	/* couldn't strip anything off, return original string */
				2508	Py_INCREF(self);
				2509	return (PyObject*) self;
				2510	}
				2511
				2512	return (PyObject*) PyUnicode_FromUnicode(
				2513	self->str + start,
				2514	end - start
				2515	);
				2516	}
				2517
				2518	static
				2519	PyObject replace(PyUnicodeObject self,
				2520	PyUnicodeObject *str1,
				2521	PyUnicodeObject *str2,
				2522	int maxcount)
				2523	{
				2524	PyUnicodeObject *u;
				2525
				2526	if (maxcount < 0)
				2527	maxcount = INT_MAX;
				2528
				2529	if (str1->length == 1 && str2->length == 1) {
				2530	int i;
				2531
				2532	/* replace characters */
				2533	if (!findchar(self->str, self->length, str1->str[0])) {
				2534	/* nothing to replace, return original string */
				2535	Py_INCREF(self);
				2536	u = self;
				2537	} else {
				2538	Py_UNICODE u1 = str1->str[0];
				2539	Py_UNICODE u2 = str2->str[0];
				2540
				2541	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2542	self->str,
				2543	self->length
				2544	);
				2545	if (u)
				2546	for (i = 0; i < u->length; i++)
				2547	if (u->str[i] == u1) {
				2548	if (--maxcount < 0)
				2549	break;
				2550	u->str[i] = u2;
				2551	}
				2552	}
				2553
				2554	} else {
				2555	int n, i;
				2556	Py_UNICODE *p;
				2557
				2558	/* replace strings */
				2559	n = count(self, 0, self->length, str1);
				2560	if (n > maxcount)
				2561	n = maxcount;
				2562	if (n == 0) {
				2563	/* nothing to replace, return original string */
				2564	Py_INCREF(self);
				2565	u = self;
				2566	} else {
				2567	u = _PyUnicode_New(
				2568	self->length + n * (str2->length - str1->length));
				2569	if (u) {
				2570	i = 0;
				2571	p = u->str;
				2572	while (i <= self->length - str1->length)
				2573	if (Py_UNICODE_MATCH(self, i, str1)) {
				2574	/* replace string segment */
				2575	Py_UNICODE_COPY(p, str2->str, str2->length);
				2576	p += str2->length;
				2577	i += str1->length;
				2578	if (--n <= 0) {
				2579	/* copy remaining part */
				2580	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2581	break;
				2582	}
				2583	} else
				2584	*p++ = self->str[i++];
				2585	}
				2586	}
				2587	}
				2588
				2589	return (PyObject *) u;
				2590	}
				2591
				2592	/* --- Unicode Object Methods --------------------------------------------- */
				2593
				2594	static char title__doc__[] =
				2595	"S.title() -> unicode\n\
				2596	\n\
				2597	Return a titlecased version of S, i.e. words start with title case\n\
				2598	characters, all remaining cased characters have lower case.";
				2599
				2600	static PyObject*
				2601	unicode_title(PyUnicodeObject self, PyObject args)
				2602	{
				2603	if (!PyArg_NoArgs(args))
				2604	return NULL;
				2605	return fixup(self, fixtitle);
				2606	}
				2607
				2608	static char capitalize__doc__[] =
				2609	"S.capitalize() -> unicode\n\
				2610	\n\
				2611	Return a capitalized version of S, i.e. make the first character\n\
				2612	have upper case.";
				2613
				2614	static PyObject*
				2615	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2616	{
				2617	if (!PyArg_NoArgs(args))
				2618	return NULL;
				2619	return fixup(self, fixcapitalize);
				2620	}
				2621
				#if 0
				static char capwords__doc__[] =
				    "S.capwords() -> unicode\n"
				    "\n"
				    "Apply .capitalize() to all words in S and return the result with\n"
				    "normalized whitespace (all whitespace strings are replaced by ' ').";

				/* S.capwords() (currently compiled out): split S on whitespace,
				   capitalize every word and join the words with single blanks. */
				static PyObject*
				unicode_capwords(PyUnicodeObject *self, PyObject *args)
				{
				    PyObject *words;
				    PyObject *item;
				    int i;

				    if (!PyArg_NoArgs(args))
				        return NULL;

				    /* Split into words */
				    words = split(self, NULL, -1);
				    if (words == NULL)
				        return NULL;

				    /* Capitalize each word, replacing the list entry in place */
				    for (i = 0; i < PyList_GET_SIZE(words); i++) {
				        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
				                     fixcapitalize);
				        if (item == NULL)
				            goto done;
				        Py_DECREF(PyList_GET_ITEM(words, i));
				        PyList_SET_ITEM(words, i, item);
				    }

				    /* Join the words to form a new string */
				    item = PyUnicode_Join(NULL, words);

				 done:
				    /* item is NULL here when capitalizing or joining failed */
				    Py_DECREF(words);
				    return (PyObject *)item;
				}
				#endif
				2662
				2663	static char center__doc__[] =
				2664	"S.center(width) -> unicode\n\
				2665	\n\
				2666	Return S centered in a Unicode string of length width. Padding is done\n\
				2667	using spaces.";
				2668
				2669	static PyObject *
				2670	unicode_center(PyUnicodeObject self, PyObject args)
				2671	{
				2672	int marg, left;
				2673	int width;
				2674
				2675	if (!PyArg_ParseTuple(args, "i:center", &width))
				2676	return NULL;
				2677
				2678	if (self->length >= width) {
				2679	Py_INCREF(self);
				2680	return (PyObject*) self;
				2681	}
				2682
				2683	marg = width - self->length;
				2684	left = marg / 2 + (marg & width & 1);
				2685
				2686	return (PyObject*) pad(self, left, marg - left, ' ');
				2687	}
				2688
				2689	static int
				2690	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2691	{
				2692	int len1, len2;
				2693	Py_UNICODE *s1 = str1->str;
				2694	Py_UNICODE *s2 = str2->str;
				2695
				2696	len1 = str1->length;
				2697	len2 = str2->length;
				2698
				2699	while (len1 > 0 && len2 > 0) {
				2700	int cmp = (s1++) - (s2++);
				2701	if (cmp)
				2702	/* This should make Christian happy! */
				2703	return (cmp < 0) ? -1 : (cmp != 0);
				2704	len1--, len2--;
				2705	}
				2706
				2707	return (len1 < len2) ? -1 : (len1 != len2);
				2708	}
				2709
				2710	int PyUnicode_Compare(PyObject *left,
				2711	PyObject *right)
				2712	{
				2713	PyUnicodeObject u = NULL, v = NULL;
				2714	int result;
				2715
				2716	/* Coerce the two arguments */
				2717	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2718	if (u == NULL)
				2719	goto onError;
				2720	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2721	if (v == NULL)
				2722	goto onError;
				2723
				2724	/* Shortcut for emtpy or interned objects */
				2725	if (v == u) {
				2726	Py_DECREF(u);
				2727	Py_DECREF(v);
				2728	return 0;
				2729	}
				2730
				2731	result = unicode_compare(u, v);
				2732
				2733	Py_DECREF(u);
				2734	Py_DECREF(v);
				2735	return result;
				2736
				2737	onError:
				2738	Py_XDECREF(u);
				2739	Py_XDECREF(v);
				2740	return -1;
				2741	}
				2742
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2743	int PyUnicode_Contains(PyObject *container,
				2744	PyObject *element)
				2745	{
				2746	PyUnicodeObject u = NULL, v = NULL;
				2747	int result;
				2748	register const Py_UNICODE p, e;
				2749	register Py_UNICODE ch;
				2750
				2751	/* Coerce the two arguments */
				2752	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2753	if (u == NULL)
				2754	goto onError;
				2755	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2756	if (v == NULL)
				2757	goto onError;
				2758
				2759	/* Check v in u */
				2760	if (PyUnicode_GET_SIZE(v) != 1) {
				2761	PyErr_SetString(PyExc_TypeError,
				2762	"string member test needs char left operand");
				2763	goto onError;
				2764	}
				2765	ch = *PyUnicode_AS_UNICODE(v);
				2766	p = PyUnicode_AS_UNICODE(u);
				2767	e = p + PyUnicode_GET_SIZE(u);
				2768	result = 0;
				2769	while (p < e) {
				2770	if (*p++ == ch) {
				2771	result = 1;
				2772	break;
				2773	}
				2774	}
				2775
				2776	Py_DECREF(u);
				2777	Py_DECREF(v);
				2778	return result;
				2779
				2780	onError:
				2781	Py_XDECREF(u);
				2782	Py_XDECREF(v);
				2783	return -1;
				2784	}
				2785
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2786	/* Concat to string or Unicode object giving a new Unicode object. */
				2787
				2788	PyObject PyUnicode_Concat(PyObject left,
				2789	PyObject *right)
				2790	{
				2791	PyUnicodeObject u = NULL, v = NULL, *w;
				2792
				2793	/* Coerce the two arguments */
				2794	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2795	if (u == NULL)
				2796	goto onError;
				2797	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2798	if (v == NULL)
				2799	goto onError;
				2800
				2801	/* Shortcuts */
				2802	if (v == unicode_empty) {
				2803	Py_DECREF(v);
				2804	return (PyObject *)u;
				2805	}
				2806	if (u == unicode_empty) {
				2807	Py_DECREF(u);
				2808	return (PyObject *)v;
				2809	}
				2810
				2811	/* Concat the two Unicode strings */
				2812	w = _PyUnicode_New(u->length + v->length);
				2813	if (w == NULL)
				2814	goto onError;
				2815	Py_UNICODE_COPY(w->str, u->str, u->length);
				2816	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				2817
				2818	Py_DECREF(u);
				2819	Py_DECREF(v);
				2820	return (PyObject *)w;
				2821
				2822	onError:
				2823	Py_XDECREF(u);
				2824	Py_XDECREF(v);
				2825	return NULL;
				2826	}
				2827
				2828	static char count__doc__[] =
				2829	"S.count(sub[, start[, end]]) -> int\n\
				2830	\n\
				2831	Return the number of occurrences of substring sub in Unicode string\n\
				2832	S[start:end]. Optional arguments start and end are\n\
				2833	interpreted as in slice notation.";
				2834
				2835	static PyObject *
				2836	unicode_count(PyUnicodeObject self, PyObject args)
				2837	{
				2838	PyUnicodeObject *substring;
				2839	int start = 0;
				2840	int end = INT_MAX;
				2841	PyObject *result;
				2842
				2843	if (!PyArg_ParseTuple(args, "O\|ii:count", &substring, &start, &end))
				2844	return NULL;
				2845
				2846	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				2847	(PyObject *)substring);
				2848	if (substring == NULL)
				2849	return NULL;
				2850
				2851	if (substring->length == 0) {
				2852	Py_DECREF(substring);
				2853	return PyInt_FromLong((long) 0);
				2854	}
				2855
				2856	if (start < 0)
				2857	start += self->length;
				2858	if (start < 0)
				2859	start = 0;
				2860	if (end > self->length)
				2861	end = self->length;
				2862	if (end < 0)
				2863	end += self->length;
				2864	if (end < 0)
				2865	end = 0;
				2866
				2867	result = PyInt_FromLong((long) count(self, start, end, substring));
				2868
				2869	Py_DECREF(substring);
				2870	return result;
				2871	}
				2872
				2873	static char encode__doc__[] =
				2874	"S.encode([encoding[,errors]]) -> string\n\
				2875	\n\
				2876	Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
				2877	errors may be given to set a different error handling scheme. Default\n\
				2878	is 'strict' meaning that encoding errors raise a ValueError. Other\n\
				2879	possible values are 'ignore' and 'replace'.";
				2880
				2881	static PyObject *
				2882	unicode_encode(PyUnicodeObject self, PyObject args)
				2883	{
				2884	char *encoding = NULL;
				2885	char *errors = NULL;
				2886	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				2887	return NULL;
				2888	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				2889	}
				2890
				2891	static char expandtabs__doc__[] =
				2892	"S.expandtabs([tabsize]) -> unicode\n\
				2893	\n\
				2894	Return a copy of S where all tab characters are expanded using spaces.\n\
				2895	If tabsize is not given, a tab size of 8 characters is assumed.";
				2896
				2897	static PyObject*
				2898	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				2899	{
				2900	Py_UNICODE *e;
				2901	Py_UNICODE *p;
				2902	Py_UNICODE *q;
				2903	int i, j;
				2904	PyUnicodeObject *u;
				2905	int tabsize = 8;
				2906
				2907	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				2908	return NULL;
				2909
				2910	/* First pass: determine size of ouput string */
				2911	i = j = 0;
				2912	e = self->str + self->length;
				2913	for (p = self->str; p < e; p++)
				2914	if (*p == '\t') {
				2915	if (tabsize > 0)
				2916	j += tabsize - (j % tabsize);
				2917	}
				2918	else {
				2919	j++;
				2920	if (p == '\n' \|\| p == '\r') {
				2921	i += j;
				2922	j = 0;
				2923	}
				2924	}
				2925
				2926	/* Second pass: create output string and fill it */
				2927	u = _PyUnicode_New(i + j);
				2928	if (!u)
				2929	return NULL;
				2930
				2931	j = 0;
				2932	q = u->str;
				2933
				2934	for (p = self->str; p < e; p++)
				2935	if (*p == '\t') {
				2936	if (tabsize > 0) {
				2937	i = tabsize - (j % tabsize);
				2938	j += i;
				2939	while (i--)
				2940	*q++ = ' ';
				2941	}
				2942	}
				2943	else {
				2944	j++;
				2945	q++ = p;
				2946	if (p == '\n' \|\| p == '\r')
				2947	j = 0;
				2948	}
				2949
				2950	return (PyObject*) u;
				2951	}
				2952
				2953	static char find__doc__[] =
				2954	"S.find(sub [,start [,end]]) -> int\n\
				2955	\n\
				2956	Return the lowest index in S where substring sub is found,\n\
				2957	such that sub is contained within s[start,end]. Optional\n\
				2958	arguments start and end are interpreted as in slice notation.\n\
				2959	\n\
				2960	Return -1 on failure.";
				2961
				2962	static PyObject *
				2963	unicode_find(PyUnicodeObject self, PyObject args)
				2964	{
				2965	PyUnicodeObject *substring;
				2966	int start = 0;
				2967	int end = INT_MAX;
				2968	PyObject *result;
				2969
				2970	if (!PyArg_ParseTuple(args, "O\|ii:find", &substring, &start, &end))
				2971	return NULL;
				2972	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				2973	(PyObject *)substring);
				2974	if (substring == NULL)
				2975	return NULL;
				2976
				2977	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				2978
				2979	Py_DECREF(substring);
				2980	return result;
				2981	}
				2982
				2983	static PyObject *
				2984	unicode_getitem(PyUnicodeObject *self, int index)
				2985	{
				2986	if (index < 0 \|\| index >= self->length) {
				2987	PyErr_SetString(PyExc_IndexError, "string index out of range");
				2988	return NULL;
				2989	}
				2990
				2991	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				2992	}
				2993
				2994	static long
				2995	unicode_hash(PyUnicodeObject *self)
				2996	{
				2997	long hash;
				2998	PyObject *utf8;
				2999
				3000	/* Since Unicode objects compare equal to their UTF-8 string
				3001	counterparts, they should also use the UTF-8 strings as basis
				3002	for their hash value. This is needed to assure that strings and
				3003	Unicode objects behave in the same way as dictionary
				3004	keys. Unfortunately, this costs some performance and also some
				3005	memory if the cached UTF-8 representation is not used later
				3006	on. */
				3007	if (self->hash != -1)
				3008	return self->hash;
				3009	utf8 = utf8_string(self, NULL);
				3010	if (utf8 == NULL)
				3011	return -1;
				3012	hash = PyObject_Hash(utf8);
				3013	if (hash == -1)
				3014	return -1;
				3015	self->hash = hash;
				3016	return hash;
				3017	}
				3018
				3019	static char index__doc__[] =
				3020	"S.index(sub [,start [,end]]) -> int\n\
				3021	\n\
				3022	Like S.find() but raise ValueError when the substring is not found.";
				3023
				3024	static PyObject *
				3025	unicode_index(PyUnicodeObject self, PyObject args)
				3026	{
				3027	int result;
				3028	PyUnicodeObject *substring;
				3029	int start = 0;
				3030	int end = INT_MAX;
				3031
				3032	if (!PyArg_ParseTuple(args, "O\|ii:index", &substring, &start, &end))
				3033	return NULL;
				3034
				3035	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3036	(PyObject *)substring);
				3037	if (substring == NULL)
				3038	return NULL;
				3039
				3040	result = findstring(self, substring, start, end, 1);
				3041
				3042	Py_DECREF(substring);
				3043	if (result < 0) {
				3044	PyErr_SetString(PyExc_ValueError, "substring not found");
				3045	return NULL;
				3046	}
				3047	return PyInt_FromLong(result);
				3048	}
				3049
				3050	static char islower__doc__[] =
				3051	"S.islower() -> int\n\
				3052	\n\
				3053	Return 1 if all cased characters in S are lowercase and there is\n\
				3054	at least one cased character in S, 0 otherwise.";
				3055
				3056	static PyObject*
				3057	unicode_islower(PyUnicodeObject self, PyObject args)
				3058	{
				3059	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3060	register const Py_UNICODE *e;
				3061	int cased;
				3062
				3063	if (!PyArg_NoArgs(args))
				3064	return NULL;
				3065
				3066	/* Shortcut for single character strings */
				3067	if (PyUnicode_GET_SIZE(self) == 1)
				3068	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3069
				3070	e = p + PyUnicode_GET_SIZE(self);
				3071	cased = 0;
				3072	for (; p < e; p++) {
				3073	register const Py_UNICODE ch = *p;
				3074
				3075	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3076	return PyInt_FromLong(0);
				3077	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3078	cased = 1;
				3079	}
				3080	return PyInt_FromLong(cased);
				3081	}
				3082
				3083	static char isupper__doc__[] =
				3084	"S.isupper() -> int\n\
				3085	\n\
				3086	Return 1 if all cased characters in S are uppercase and there is\n\
				3087	at least one cased character in S, 0 otherwise.";
				3088
				3089	static PyObject*
				3090	unicode_isupper(PyUnicodeObject self, PyObject args)
				3091	{
				3092	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3093	register const Py_UNICODE *e;
				3094	int cased;
				3095
				3096	if (!PyArg_NoArgs(args))
				3097	return NULL;
				3098
				3099	/* Shortcut for single character strings */
				3100	if (PyUnicode_GET_SIZE(self) == 1)
				3101	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3102
				3103	e = p + PyUnicode_GET_SIZE(self);
				3104	cased = 0;
				3105	for (; p < e; p++) {
				3106	register const Py_UNICODE ch = *p;
				3107
				3108	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3109	return PyInt_FromLong(0);
				3110	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3111	cased = 1;
				3112	}
				3113	return PyInt_FromLong(cased);
				3114	}
				3115
				3116	static char istitle__doc__[] =
				3117	"S.istitle() -> int\n\
				3118	\n\
				3119	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3120	may only follow uncased characters and lowercase characters only cased\n\
				3121	ones. Return 0 otherwise.";
				3122
				3123	static PyObject*
				3124	unicode_istitle(PyUnicodeObject self, PyObject args)
				3125	{
				3126	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3127	register const Py_UNICODE *e;
				3128	int cased, previous_is_cased;
				3129
				3130	if (!PyArg_NoArgs(args))
				3131	return NULL;
				3132
				3133	/* Shortcut for single character strings */
				3134	if (PyUnicode_GET_SIZE(self) == 1)
				3135	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3136	(Py_UNICODE_ISUPPER(*p) != 0));
				3137
				3138	e = p + PyUnicode_GET_SIZE(self);
				3139	cased = 0;
				3140	previous_is_cased = 0;
				3141	for (; p < e; p++) {
				3142	register const Py_UNICODE ch = *p;
				3143
				3144	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3145	if (previous_is_cased)
				3146	return PyInt_FromLong(0);
				3147	previous_is_cased = 1;
				3148	cased = 1;
				3149	}
				3150	else if (Py_UNICODE_ISLOWER(ch)) {
				3151	if (!previous_is_cased)
				3152	return PyInt_FromLong(0);
				3153	previous_is_cased = 1;
				3154	cased = 1;
				3155	}
				3156	else
				3157	previous_is_cased = 0;
				3158	}
				3159	return PyInt_FromLong(cased);
				3160	}
				3161
				3162	static char isspace__doc__[] =
				3163	"S.isspace() -> int\n\
				3164	\n\
				3165	Return 1 if there are only whitespace characters in S,\n\
				3166	0 otherwise.";
				3167
				3168	static PyObject*
				3169	unicode_isspace(PyUnicodeObject self, PyObject args)
				3170	{
				3171	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3172	register const Py_UNICODE *e;
				3173
				3174	if (!PyArg_NoArgs(args))
				3175	return NULL;
				3176
				3177	/* Shortcut for single character strings */
				3178	if (PyUnicode_GET_SIZE(self) == 1 &&
				3179	Py_UNICODE_ISSPACE(*p))
				3180	return PyInt_FromLong(1);
				3181
				3182	e = p + PyUnicode_GET_SIZE(self);
				3183	for (; p < e; p++) {
				3184	if (!Py_UNICODE_ISSPACE(*p))
				3185	return PyInt_FromLong(0);
				3186	}
				3187	return PyInt_FromLong(1);
				3188	}
				3189
				3190	static char isdecimal__doc__[] =
				3191	"S.isdecimal() -> int\n\
				3192	\n\
				3193	Return 1 if there are only decimal characters in S,\n\
				3194	0 otherwise.";
				3195
				3196	static PyObject*
				3197	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3198	{
				3199	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3200	register const Py_UNICODE *e;
				3201
				3202	if (!PyArg_NoArgs(args))
				3203	return NULL;
				3204
				3205	/* Shortcut for single character strings */
				3206	if (PyUnicode_GET_SIZE(self) == 1 &&
				3207	Py_UNICODE_ISDECIMAL(*p))
				3208	return PyInt_FromLong(1);
				3209
				3210	e = p + PyUnicode_GET_SIZE(self);
				3211	for (; p < e; p++) {
				3212	if (!Py_UNICODE_ISDECIMAL(*p))
				3213	return PyInt_FromLong(0);
				3214	}
				3215	return PyInt_FromLong(1);
				3216	}
				3217
				3218	static char isdigit__doc__[] =
				3219	"S.isdigit() -> int\n\
				3220	\n\
				3221	Return 1 if there are only digit characters in S,\n\
				3222	0 otherwise.";
				3223
				3224	static PyObject*
				3225	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3226	{
				3227	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3228	register const Py_UNICODE *e;
				3229
				3230	if (!PyArg_NoArgs(args))
				3231	return NULL;
				3232
				3233	/* Shortcut for single character strings */
				3234	if (PyUnicode_GET_SIZE(self) == 1 &&
				3235	Py_UNICODE_ISDIGIT(*p))
				3236	return PyInt_FromLong(1);
				3237
				3238	e = p + PyUnicode_GET_SIZE(self);
				3239	for (; p < e; p++) {
				3240	if (!Py_UNICODE_ISDIGIT(*p))
				3241	return PyInt_FromLong(0);
				3242	}
				3243	return PyInt_FromLong(1);
				3244	}
				3245
				3246	static char isnumeric__doc__[] =
				3247	"S.isnumeric() -> int\n\
				3248	\n\
				3249	Return 1 if there are only numeric characters in S,\n\
				3250	0 otherwise.";
				3251
				3252	static PyObject*
				3253	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3254	{
				3255	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3256	register const Py_UNICODE *e;
				3257
				3258	if (!PyArg_NoArgs(args))
				3259	return NULL;
				3260
				3261	/* Shortcut for single character strings */
				3262	if (PyUnicode_GET_SIZE(self) == 1 &&
				3263	Py_UNICODE_ISNUMERIC(*p))
				3264	return PyInt_FromLong(1);
				3265
				3266	e = p + PyUnicode_GET_SIZE(self);
				3267	for (; p < e; p++) {
				3268	if (!Py_UNICODE_ISNUMERIC(*p))
				3269	return PyInt_FromLong(0);
				3270	}
				3271	return PyInt_FromLong(1);
				3272	}
				3273
				3274	static char join__doc__[] =
				3275	"S.join(sequence) -> unicode\n\
				3276	\n\
				3277	Return a string which is the concatenation of the strings in the\n\
				3278	sequence. The separator between elements is S.";
				3279
				3280	static PyObject*
				3281	unicode_join(PyUnicodeObject self, PyObject args)
				3282	{
				3283	PyObject *data;
				3284	if (!PyArg_ParseTuple(args, "O:join", &data))
				3285	return NULL;
				3286
				3287	return PyUnicode_Join((PyObject *)self, data);
				3288	}
				3289
				3290	static int
				3291	unicode_length(PyUnicodeObject *self)
				3292	{
				3293	return self->length;
				3294	}
				3295
				3296	static char ljust__doc__[] =
				3297	"S.ljust(width) -> unicode\n\
				3298	\n\
				3299	Return S left justified in a Unicode string of length width. Padding is\n\
				3300	done using spaces.";
				3301
				3302	static PyObject *
				3303	unicode_ljust(PyUnicodeObject self, PyObject args)
				3304	{
				3305	int width;
				3306	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3307	return NULL;
				3308
				3309	if (self->length >= width) {
				3310	Py_INCREF(self);
				3311	return (PyObject*) self;
				3312	}
				3313
				3314	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3315	}
				3316
				3317	static char lower__doc__[] =
				3318	"S.lower() -> unicode\n\
				3319	\n\
				3320	Return a copy of the string S converted to lowercase.";
				3321
				3322	static PyObject*
				3323	unicode_lower(PyUnicodeObject self, PyObject args)
				3324	{
				3325	if (!PyArg_NoArgs(args))
				3326	return NULL;
				3327	return fixup(self, fixlower);
				3328	}
				3329
				3330	static char lstrip__doc__[] =
				3331	"S.lstrip() -> unicode\n\
				3332	\n\
				3333	Return a copy of the string S with leading whitespace removed.";
				3334
				3335	static PyObject *
				3336	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3337	{
				3338	if (!PyArg_NoArgs(args))
				3339	return NULL;
				3340	return strip(self, 1, 0);
				3341	}
				3342
				3343	static PyObject*
				3344	unicode_repeat(PyUnicodeObject *str, int len)
				3345	{
				3346	PyUnicodeObject *u;
				3347	Py_UNICODE *p;
				3348
				3349	if (len < 0)
				3350	len = 0;
				3351
				3352	if (len == 1) {
				3353	/* no repeat, return original string */
				3354	Py_INCREF(str);
				3355	return (PyObject*) str;
				3356	}
				3357
				3358	u = _PyUnicode_New(len * str->length);
				3359	if (!u)
				3360	return NULL;
				3361
				3362	p = u->str;
				3363
				3364	while (len-- > 0) {
				3365	Py_UNICODE_COPY(p, str->str, str->length);
				3366	p += str->length;
				3367	}
				3368
				3369	return (PyObject*) u;
				3370	}
				3371
				3372	PyObject PyUnicode_Replace(PyObject obj,
				3373	PyObject *subobj,
				3374	PyObject *replobj,
				3375	int maxcount)
				3376	{
				3377	PyObject *self;
				3378	PyObject *str1;
				3379	PyObject *str2;
				3380	PyObject *result;
				3381
				3382	self = PyUnicode_FromObject(obj);
				3383	if (self == NULL)
				3384	return NULL;
				3385	str1 = PyUnicode_FromObject(subobj);
				3386	if (str1 == NULL) {
				3387	Py_DECREF(self);
				3388	return NULL;
				3389	}
				3390	str2 = PyUnicode_FromObject(replobj);
				3391	if (str2 == NULL) {
				3392	Py_DECREF(self);
				3393	Py_DECREF(str1);
				3394	return NULL;
				3395	}
				3396	result = replace((PyUnicodeObject *)self,
				3397	(PyUnicodeObject *)str1,
				3398	(PyUnicodeObject *)str2,
				3399	maxcount);
				3400	Py_DECREF(self);
				3401	Py_DECREF(str1);
				3402	Py_DECREF(str2);
				3403	return result;
				3404	}
				3405
				3406	static char replace__doc__[] =
				3407	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3408	\n\
				3409	Return a copy of S with all occurrences of substring\n\
				3410	old replaced by new. If the optional argument maxsplit is\n\
				3411	given, only the first maxsplit occurrences are replaced.";
				3412
				3413	static PyObject*
				3414	unicode_replace(PyUnicodeObject self, PyObject args)
				3415	{
				3416	PyUnicodeObject *str1;
				3417	PyUnicodeObject *str2;
				3418	int maxcount = -1;
				3419	PyObject *result;
				3420
				3421	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3422	return NULL;
				3423	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3424	if (str1 == NULL)
				3425	return NULL;
				3426	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3427	if (str2 == NULL)
				3428	return NULL;
				3429
				3430	result = replace(self, str1, str2, maxcount);
				3431
				3432	Py_DECREF(str1);
				3433	Py_DECREF(str2);
				3434	return result;
				3435	}
				3436
				3437	static
				3438	PyObject unicode_repr(PyObject unicode)
				3439	{
				3440	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3441	PyUnicode_GET_SIZE(unicode),
				3442	1);
				3443	}
				3444
				3445	static char rfind__doc__[] =
				3446	"S.rfind(sub [,start [,end]]) -> int\n\
				3447	\n\
				3448	Return the highest index in S where substring sub is found,\n\
				3449	such that sub is contained within s[start,end]. Optional\n\
				3450	arguments start and end are interpreted as in slice notation.\n\
				3451	\n\
				3452	Return -1 on failure.";
				3453
				3454	static PyObject *
				3455	unicode_rfind(PyUnicodeObject self, PyObject args)
				3456	{
				3457	PyUnicodeObject *substring;
				3458	int start = 0;
				3459	int end = INT_MAX;
				3460	PyObject *result;
				3461
				3462	if (!PyArg_ParseTuple(args, "O\|ii:rfind", &substring, &start, &end))
				3463	return NULL;
				3464	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3465	(PyObject *)substring);
				3466	if (substring == NULL)
				3467	return NULL;
				3468
				3469	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3470
				3471	Py_DECREF(substring);
				3472	return result;
				3473	}
				3474
				3475	static char rindex__doc__[] =
				3476	"S.rindex(sub [,start [,end]]) -> int\n\
				3477	\n\
				3478	Like S.rfind() but raise ValueError when the substring is not found.";
				3479
				3480	static PyObject *
				3481	unicode_rindex(PyUnicodeObject self, PyObject args)
				3482	{
				3483	int result;
				3484	PyUnicodeObject *substring;
				3485	int start = 0;
				3486	int end = INT_MAX;
				3487
				3488	if (!PyArg_ParseTuple(args, "O\|ii:rindex", &substring, &start, &end))
				3489	return NULL;
				3490	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3491	(PyObject *)substring);
				3492	if (substring == NULL)
				3493	return NULL;
				3494
				3495	result = findstring(self, substring, start, end, -1);
				3496
				3497	Py_DECREF(substring);
				3498	if (result < 0) {
				3499	PyErr_SetString(PyExc_ValueError, "substring not found");
				3500	return NULL;
				3501	}
				3502	return PyInt_FromLong(result);
				3503	}
				3504
				3505	static char rjust__doc__[] =
				3506	"S.rjust(width) -> unicode\n\
				3507	\n\
				3508	Return S right justified in a Unicode string of length width. Padding is\n\
				3509	done using spaces.";
				3510
				3511	static PyObject *
				3512	unicode_rjust(PyUnicodeObject self, PyObject args)
				3513	{
				3514	int width;
				3515	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3516	return NULL;
				3517
				3518	if (self->length >= width) {
				3519	Py_INCREF(self);
				3520	return (PyObject*) self;
				3521	}
				3522
				3523	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3524	}
				3525
				3526	static char rstrip__doc__[] =
				3527	"S.rstrip() -> unicode\n\
				3528	\n\
				3529	Return a copy of the string S with trailing whitespace removed.";
				3530
				3531	static PyObject *
				3532	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3533	{
				3534	if (!PyArg_NoArgs(args))
				3535	return NULL;
				3536	return strip(self, 0, 1);
				3537	}
				3538
				3539	static PyObject*
				3540	unicode_slice(PyUnicodeObject *self, int start, int end)
				3541	{
				3542	/* standard clamping */
				3543	if (start < 0)
				3544	start = 0;
				3545	if (end < 0)
				3546	end = 0;
				3547	if (end > self->length)
				3548	end = self->length;
				3549	if (start == 0 && end == self->length) {
				3550	/* full slice, return original string */
				3551	Py_INCREF(self);
				3552	return (PyObject*) self;
				3553	}
				3554	if (start > end)
				3555	start = end;
				3556	/* copy slice */
				3557	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3558	end - start);
				3559	}
				3560
				3561	PyObject PyUnicode_Split(PyObject s,
				3562	PyObject *sep,
				3563	int maxsplit)
				3564	{
				3565	PyObject *result;
				3566
				3567	s = PyUnicode_FromObject(s);
				3568	if (s == NULL)
				3569	return NULL;
				3570	if (sep != NULL) {
				3571	sep = PyUnicode_FromObject(sep);
				3572	if (sep == NULL) {
				3573	Py_DECREF(s);
				3574	return NULL;
				3575	}
				3576	}
				3577
				3578	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3579
				3580	Py_DECREF(s);
				3581	Py_XDECREF(sep);
				3582	return result;
				3583	}
				3584
				3585	static char split__doc__[] =
				3586	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3587	\n\
				3588	Return a list of the words in S, using sep as the\n\
				3589	delimiter string. If maxsplit is given, at most maxsplit\n\
				3590	splits are done. If sep is not specified, any whitespace string\n\
				3591	is a separator.";
				3592
				3593	static PyObject*
				3594	unicode_split(PyUnicodeObject self, PyObject args)
				3595	{
				3596	PyObject *substring = Py_None;
				3597	int maxcount = -1;
				3598
				3599	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3600	return NULL;
				3601
				3602	if (substring == Py_None)
				3603	return split(self, NULL, maxcount);
				3604	else if (PyUnicode_Check(substring))
				3605	return split(self, (PyUnicodeObject *)substring, maxcount);
				3606	else
				3607	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3608	}
				3609
				3610	static char splitlines__doc__[] =
				3611	"S.splitlines([maxsplit]]) -> list of strings\n\
				3612	\n\
				3613	Return a list of the lines in S, breaking at line boundaries.\n\
				3614	If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
				3615	included in the resulting list.";
				3616
				3617	static PyObject*
				3618	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3619	{
				3620	int maxcount = -1;
				3621
				3622	if (!PyArg_ParseTuple(args, "\|i:splitlines", &maxcount))
				3623	return NULL;
				3624
				3625	return PyUnicode_Splitlines((PyObject *)self, maxcount);
				3626	}
				3627
				3628	static
				3629	PyObject unicode_str(PyUnicodeObject self)
				3630	{
				3631	return PyUnicode_AsUTF8String((PyObject *)self);
				3632	}
				3633
				3634	static char strip__doc__[] =
				3635	"S.strip() -> unicode\n\
				3636	\n\
				3637	Return a copy of S with leading and trailing whitespace removed.";
				3638
				3639	static PyObject *
				3640	unicode_strip(PyUnicodeObject self, PyObject args)
				3641	{
				3642	if (!PyArg_NoArgs(args))
				3643	return NULL;
				3644	return strip(self, 1, 1);
				3645	}
				3646
				3647	static char swapcase__doc__[] =
				3648	"S.swapcase() -> unicode\n\
				3649	\n\
				3650	Return a copy of S with uppercase characters converted to lowercase\n\
				3651	and vice versa.";
				3652
				3653	static PyObject*
				3654	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3655	{
				3656	if (!PyArg_NoArgs(args))
				3657	return NULL;
				3658	return fixup(self, fixswapcase);
				3659	}
				3660
				3661	static char translate__doc__[] =
				3662	"S.translate(table) -> unicode\n\
				3663	\n\
				3664	Return a copy of the string S, where all characters have been mapped\n\
				3665	through the given translation table, which must be a mapping of\n\
				3666	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3667	are left untouched. Characters mapped to None are deleted.";
				3668
				3669	static PyObject*
				3670	unicode_translate(PyUnicodeObject self, PyObject args)
				3671	{
				3672	PyObject *table;
				3673
				3674	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3675	return NULL;
				3676	return PyUnicode_TranslateCharmap(self->str,
				3677	self->length,
				3678	table,
				3679	"ignore");
				3680	}
				3681
				3682	static char upper__doc__[] =
				3683	"S.upper() -> unicode\n\
				3684	\n\
				3685	Return a copy of S converted to uppercase.";
				3686
				3687	static PyObject*
				3688	unicode_upper(PyUnicodeObject self, PyObject args)
				3689	{
				3690	if (!PyArg_NoArgs(args))
				3691	return NULL;
				3692	return fixup(self, fixupper);
				3693	}
				3694
				#if 0
				/* Docstring for the (currently disabled) unicode "zfill" method. */
				static char zfill__doc__[] =
				    "S.zfill(width) -> unicode\n"
				    "\n"
				    "Pad a numeric string x with zeros on the left, to fill a field\n"
				    "of the specified width. The string x is never truncated.";

				/* Method wrapper for S.zfill(width); compiled out by the #if 0.

				   Returns a new reference to self when it is already at least `width`
				   long.  Otherwise left-pads with '0' via pad() and, if the original
				   string began with '+' or '-', moves that sign in front of the
				   zeros.  Returns NULL if parsing fails or pad() returns NULL. */
				static PyObject *
				unicode_zfill(PyUnicodeObject *self, PyObject *args)
				{
				    int fill;
				    PyUnicodeObject *u;

				    int width;
				    if (!PyArg_ParseTuple(args, "i:zfill", &width))
				        return NULL;

				    if (self->length >= width) {
				        Py_INCREF(self);
				        return (PyObject *) self;
				    }

				    fill = width - self->length;

				    u = pad(self, fill, 0, '0');
				    /* pad() can fail; do not dereference a NULL result below. */
				    if (u == NULL)
				        return NULL;

				    /* u->str[fill] is the first character of the original string. */
				    if (u->str[fill] == '+' || u->str[fill] == '-') {
				        /* move sign to beginning of string */
				        u->str[0] = u->str[fill];
				        u->str[fill] = '0';
				    }

				    return (PyObject *) u;
				}
				#endif
				3730
				#if 0
				/* Debugging helper (compiled out): returns the current value of
				   unicode_freelist_size as a Python int.  Takes no arguments. */
				static PyObject *
				unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
				{
				    if (!PyArg_NoArgs(args))
				        return NULL;
				    return PyInt_FromLong(unicode_freelist_size);
				}
				#endif
				3740
				3741	static char startswith__doc__[] =
				3742	"S.startswith(prefix[, start[, end]]) -> int\n\
				3743	\n\
				3744	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3745	optional start, test S beginning at that position. With optional end, stop\n\
				3746	comparing S at that position.";
				3747
				3748	static PyObject *
				3749	unicode_startswith(PyUnicodeObject *self,
				3750	PyObject *args)
				3751	{
				3752	PyUnicodeObject *substring;
				3753	int start = 0;
				3754	int end = INT_MAX;
				3755	PyObject *result;
				3756
				3757	if (!PyArg_ParseTuple(args, "O\|ii:startswith", &substring, &start, &end))
				3758	return NULL;
				3759	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3760	(PyObject *)substring);
				3761	if (substring == NULL)
				3762	return NULL;
				3763
				3764	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				3765
				3766	Py_DECREF(substring);
				3767	return result;
				3768	}
				3769
				3770
				3771	static char endswith__doc__[] =
				3772	"S.endswith(suffix[, start[, end]]) -> int\n\
				3773	\n\
				3774	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				3775	optional start, test S beginning at that position. With optional end, stop\n\
				3776	comparing S at that position.";
				3777
				3778	static PyObject *
				3779	unicode_endswith(PyUnicodeObject *self,
				3780	PyObject *args)
				3781	{
				3782	PyUnicodeObject *substring;
				3783	int start = 0;
				3784	int end = INT_MAX;
				3785	PyObject *result;
				3786
				3787	if (!PyArg_ParseTuple(args, "O\|ii:endswith", &substring, &start, &end))
				3788	return NULL;
				3789	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3790	(PyObject *)substring);
				3791	if (substring == NULL)
				3792	return NULL;
				3793
				3794	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				3795
				3796	Py_DECREF(substring);
				3797	return result;
				3798	}
				3799
				3800
				3801	static PyMethodDef unicode_methods[] = {
				3802
				3803	/* Order is according to common usage: often used methods should
				3804	appear first, since lookup is done sequentially. */
				3805
				3806	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				3807	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				3808	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				3809	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				3810	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				3811	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				3812	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				3813	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				3814	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				3815	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				3816	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				3817	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				3818	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				3819	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				3820	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				3821	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				3822	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				3823	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				3824	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				3825	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				3826	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				3827	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				3828	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				3829	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				3830	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				3831	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				3832	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				3833	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				3834	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				3835	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				3836	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				3837	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				3838	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				3839	#if 0
				3840	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				3841	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				3842	#endif
				3843
				3844	#if 0
				3845	/* This one is just used for debugging the implementation. */
				3846	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				3847	#endif
				3848
				3849	{NULL, NULL}
				3850	};
				3851
				3852	static PyObject *
				3853	unicode_getattr(PyUnicodeObject self, char name)
				3854	{
				3855	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				3856	}
				3857
				3858	static PySequenceMethods unicode_as_sequence = {
				3859	(inquiry) unicode_length, /* sq_length */
				3860	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				3861	(intargfunc) unicode_repeat, /* sq_repeat */
				3862	(intargfunc) unicode_getitem, /* sq_item */
				3863	(intintargfunc) unicode_slice, /* sq_slice */
				3864	0, /* sq_ass_item */
				3865	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3866	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3867	};
				3868
				3869	static int
				3870	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				3871	int index,
				3872	const void **ptr)
				3873	{
				3874	if (index != 0) {
				3875	PyErr_SetString(PyExc_SystemError,
				3876	"accessing non-existent unicode segment");
				3877	return -1;
				3878	}
				3879	ptr = (void ) self->str;
				3880	return PyUnicode_GET_DATA_SIZE(self);
				3881	}
				3882
				3883	static int
				3884	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				3885	const void **ptr)
				3886	{
				3887	PyErr_SetString(PyExc_TypeError,
				3888	"cannot use unicode as modifyable buffer");
				3889	return -1;
				3890	}
				3891
				3892	static int
				3893	unicode_buffer_getsegcount(PyUnicodeObject *self,
				3894	int *lenp)
				3895	{
				3896	if (lenp)
				3897	*lenp = PyUnicode_GET_DATA_SIZE(self);
				3898	return 1;
				3899	}
				3900
				3901	static int
				3902	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				3903	int index,
				3904	const void **ptr)
				3905	{
				3906	PyObject *str;
				3907
				3908	if (index != 0) {
				3909	PyErr_SetString(PyExc_SystemError,
				3910	"accessing non-existent unicode segment");
				3911	return -1;
				3912	}
				3913	str = utf8_string(self, NULL);
				3914	if (str == NULL)
				3915	return -1;
				3916	ptr = (void ) PyString_AS_STRING(str);
				3917	return PyString_GET_SIZE(str);
				3918	}
				3919
				3920	/* Helpers for PyUnicode_Format() */
				3921
				3922	static PyObject *
				3923	getnextarg(args, arglen, p_argidx)
				3924	PyObject *args;
				3925	int arglen;
				3926	int *p_argidx;
				3927	{
				3928	int argidx = *p_argidx;
				3929	if (argidx < arglen) {
				3930	(*p_argidx)++;
				3931	if (arglen < 0)
				3932	return args;
				3933	else
				3934	return PyTuple_GetItem(args, argidx);
				3935	}
				3936	PyErr_SetString(PyExc_TypeError,
				3937	"not enough arguments for format string");
				3938	return NULL;
				3939	}
				3940
				/* Bit flags collected while parsing a %-format specifier; one bit
				   per flag character so they can be OR-ed together. */
				#define F_LJUST 0x01    /* '-' */
				#define F_SIGN  0x02    /* '+' */
				#define F_BLANK 0x04    /* ' ' */
				#define F_ALT   0x08    /* '#' */
				#define F_ZERO  0x10    /* '0' */
				3946
				3947	static
				3948	#ifdef HAVE_STDARG_PROTOTYPES
				3949	int usprintf(register Py_UNICODE buffer, char format, ...)
				3950	#else
				3951	int usprintf(va_alist) va_dcl
				3952	#endif
				3953	{
				3954	register int i;
				3955	int len;
				3956	va_list va;
				3957	char *charbuffer;
				3958	#ifdef HAVE_STDARG_PROTOTYPES
				3959	va_start(va, format);
				3960	#else
				3961	Py_UNICODE *args;
				3962	char *format;
				3963
				3964	va_start(va);
				3965	buffer = va_arg(va, Py_UNICODE *);
				3966	format = va_arg(va, char *);
				3967	#endif
				3968
				3969	/* First, format the string as char array, then expand to Py_UNICODE
				3970	array. */
				3971	charbuffer = (char *)buffer;
				3972	len = vsprintf(charbuffer, format, va);
				3973	for (i = len - 1; i >= 0; i--)
				3974	buffer[i] = (Py_UNICODE) charbuffer[i];
				3975
				3976	va_end(va);
				3977	return len;
				3978	}
				3979
				3980	static int
				3981	formatfloat(Py_UNICODE *buf,
				3982	int flags,
				3983	int prec,
				3984	int type,
				3985	PyObject *v)
				3986	{
				3987	char fmt[20];
				3988	double x;
				3989
				3990	x = PyFloat_AsDouble(v);
				3991	if (x == -1.0 && PyErr_Occurred())
				3992	return -1;
				3993	if (prec < 0)
				3994	prec = 6;
				3995	if (prec > 50)
				3996	prec = 50; /* Arbitrary limitation */
				3997	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				3998	type = 'g';
				3999	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4000	return usprintf(buf, fmt, x);
				4001	}
				4002
				4003	static int
				4004	formatint(Py_UNICODE *buf,
				4005	int flags,
				4006	int prec,
				4007	int type,
				4008	PyObject *v)
				4009	{
				4010	char fmt[20];
				4011	long x;
				4012
				4013	x = PyInt_AsLong(v);
				4014	if (x == -1 && PyErr_Occurred())
				4015	return -1;
				4016	if (prec < 0)
				4017	prec = 1;
				4018	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4019	return usprintf(buf, fmt, x);
				4020	}
				4021
				4022	static int
				4023	formatchar(Py_UNICODE *buf,
				4024	PyObject *v)
				4025	{
				4026	if (PyUnicode_Check(v))
				4027	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4028
				4029	else if (PyString_Check(v))
				4030	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4031
				4032	else {
				4033	/* Integer input truncated to a character */
				4034	long x;
				4035	x = PyInt_AsLong(v);
				4036	if (x == -1 && PyErr_Occurred())
				4037	return -1;
				4038	buf[0] = (char) x;
				4039	}
				4040	buf[1] = '\0';
				4041	return 1;
				4042	}
				4043
				4044	PyObject PyUnicode_Format(PyObject format,
				4045	PyObject *args)
				4046	{
				4047	Py_UNICODE fmt, res;
				4048	int fmtcnt, rescnt, reslen, arglen, argidx;
				4049	int args_owned = 0;
				4050	PyUnicodeObject *result = NULL;
				4051	PyObject *dict = NULL;
				4052	PyObject *uformat;
				4053
				4054	if (format == NULL \|\| args == NULL) {
				4055	PyErr_BadInternalCall();
				4056	return NULL;
				4057	}
				4058	uformat = PyUnicode_FromObject(format);
				4059	fmt = PyUnicode_AS_UNICODE(uformat);
				4060	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4061
				4062	reslen = rescnt = fmtcnt + 100;
				4063	result = _PyUnicode_New(reslen);
				4064	if (result == NULL)
				4065	goto onError;
				4066	res = PyUnicode_AS_UNICODE(result);
				4067
				4068	if (PyTuple_Check(args)) {
				4069	arglen = PyTuple_Size(args);
				4070	argidx = 0;
				4071	}
				4072	else {
				4073	arglen = -1;
				4074	argidx = -2;
				4075	}
				4076	if (args->ob_type->tp_as_mapping)
				4077	dict = args;
				4078
				4079	while (--fmtcnt >= 0) {
				4080	if (*fmt != '%') {
				4081	if (--rescnt < 0) {
				4082	rescnt = fmtcnt + 100;
				4083	reslen += rescnt;
				4084	if (_PyUnicode_Resize(result, reslen) < 0)
				4085	return NULL;
				4086	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4087	--rescnt;
				4088	}
				4089	res++ = fmt++;
				4090	}
				4091	else {
				4092	/* Got a format specifier */
				4093	int flags = 0;
				4094	int width = -1;
				4095	int prec = -1;
				4096	int size = 0;
				4097	Py_UNICODE c = '\0';
				4098	Py_UNICODE fill;
				4099	PyObject *v = NULL;
				4100	PyObject *temp = NULL;
				4101	Py_UNICODE *buf;
				4102	Py_UNICODE sign;
				4103	int len;
				4104	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4105
				4106	fmt++;
				4107	if (*fmt == '(') {
				4108	Py_UNICODE *keystart;
				4109	int keylen;
				4110	PyObject *key;
				4111	int pcount = 1;
				4112
				4113	if (dict == NULL) {
				4114	PyErr_SetString(PyExc_TypeError,
				4115	"format requires a mapping");
				4116	goto onError;
				4117	}
				4118	++fmt;
				4119	--fmtcnt;
				4120	keystart = fmt;
				4121	/* Skip over balanced parentheses */
				4122	while (pcount > 0 && --fmtcnt >= 0) {
				4123	if (*fmt == ')')
				4124	--pcount;
				4125	else if (*fmt == '(')
				4126	++pcount;
				4127	fmt++;
				4128	}
				4129	keylen = fmt - keystart - 1;
				4130	if (fmtcnt < 0 \|\| pcount > 0) {
				4131	PyErr_SetString(PyExc_ValueError,
				4132	"incomplete format key");
				4133	goto onError;
				4134	}
				4135	/* keys are converted to strings (using UTF-8) and
				4136	then looked up since Python uses strings to hold
				4137	variables names etc. in its namespaces and we
				4138	wouldn't want to break common idioms. The
				4139	alternative would be using Unicode objects for the
				4140	lookup but u"abc" and "abc" have different hash
				4141	values (on purpose). */
				4142	key = PyUnicode_EncodeUTF8(keystart,
				4143	keylen,
				4144	NULL);
				4145	if (key == NULL)
				4146	goto onError;
				4147	if (args_owned) {
				4148	Py_DECREF(args);
				4149	args_owned = 0;
				4150	}
				4151	args = PyObject_GetItem(dict, key);
				4152	Py_DECREF(key);
				4153	if (args == NULL) {
				4154	goto onError;
				4155	}
				4156	args_owned = 1;
				4157	arglen = -1;
				4158	argidx = -2;
				4159	}
				4160	while (--fmtcnt >= 0) {
				4161	switch (c = *fmt++) {
				4162	case '-': flags \|= F_LJUST; continue;
				4163	case '+': flags \|= F_SIGN; continue;
				4164	case ' ': flags \|= F_BLANK; continue;
				4165	case '#': flags \|= F_ALT; continue;
				4166	case '0': flags \|= F_ZERO; continue;
				4167	}
				4168	break;
				4169	}
				4170	if (c == '*') {
				4171	v = getnextarg(args, arglen, &argidx);
				4172	if (v == NULL)
				4173	goto onError;
				4174	if (!PyInt_Check(v)) {
				4175	PyErr_SetString(PyExc_TypeError,
				4176	"* wants int");
				4177	goto onError;
				4178	}
				4179	width = PyInt_AsLong(v);
				4180	if (width < 0) {
				4181	flags \|= F_LJUST;
				4182	width = -width;
				4183	}
				4184	if (--fmtcnt >= 0)
				4185	c = *fmt++;
				4186	}
				4187	else if (c >= '0' && c <= '9') {
				4188	width = c - '0';
				4189	while (--fmtcnt >= 0) {
				4190	c = *fmt++;
				4191	if (c < '0' \|\| c > '9')
				4192	break;
				4193	if ((width*10) / 10 != width) {
				4194	PyErr_SetString(PyExc_ValueError,
				4195	"width too big");
				4196	goto onError;
				4197	}
				4198	width = width*10 + (c - '0');
				4199	}
				4200	}
				4201	if (c == '.') {
				4202	prec = 0;
				4203	if (--fmtcnt >= 0)
				4204	c = *fmt++;
				4205	if (c == '*') {
				4206	v = getnextarg(args, arglen, &argidx);
				4207	if (v == NULL)
				4208	goto onError;
				4209	if (!PyInt_Check(v)) {
				4210	PyErr_SetString(PyExc_TypeError,
				4211	"* wants int");
				4212	goto onError;
				4213	}
				4214	prec = PyInt_AsLong(v);
				4215	if (prec < 0)
				4216	prec = 0;
				4217	if (--fmtcnt >= 0)
				4218	c = *fmt++;
				4219	}
				4220	else if (c >= '0' && c <= '9') {
				4221	prec = c - '0';
				4222	while (--fmtcnt >= 0) {
				4223	c = Py_CHARMASK(*fmt++);
				4224	if (c < '0' \|\| c > '9')
				4225	break;
				4226	if ((prec*10) / 10 != prec) {
				4227	PyErr_SetString(PyExc_ValueError,
				4228	"prec too big");
				4229	goto onError;
				4230	}
				4231	prec = prec*10 + (c - '0');
				4232	}
				4233	}
				4234	} /* prec */
				4235	if (fmtcnt >= 0) {
				4236	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4237	size = c;
				4238	if (--fmtcnt >= 0)
				4239	c = *fmt++;
				4240	}
				4241	}
				4242	if (fmtcnt < 0) {
				4243	PyErr_SetString(PyExc_ValueError,
				4244	"incomplete format");
				4245	goto onError;
				4246	}
				4247	if (c != '%') {
				4248	v = getnextarg(args, arglen, &argidx);
				4249	if (v == NULL)
				4250	goto onError;
				4251	}
				4252	sign = 0;
				4253	fill = ' ';
				4254	switch (c) {
				4255
				4256	case '%':
				4257	buf = tmpbuf;
				4258	buf[0] = '%';
				4259	len = 1;
				4260	break;
				4261
				4262	case 's':
				4263	case 'r':
				4264	if (PyUnicode_Check(v) && c == 's') {
				4265	temp = v;
				4266	Py_INCREF(temp);
				4267	}
				4268	else {
				4269	PyObject *unicode;
				4270	if (c == 's')
				4271	temp = PyObject_Str(v);
				4272	else
				4273	temp = PyObject_Repr(v);
				4274	if (temp == NULL)
				4275	goto onError;
				4276	if (!PyString_Check(temp)) {
				4277	/* XXX Note: this should never happen, since
				4278	PyObject_Repr() and PyObject_Str() assure
				4279	this */
				4280	Py_DECREF(temp);
				4281	PyErr_SetString(PyExc_TypeError,
				4282	"%s argument has non-string str()");
				4283	goto onError;
				4284	}
				4285	unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
				4286	PyString_GET_SIZE(temp),
				4287	"strict");
				4288	Py_DECREF(temp);
				4289	temp = unicode;
				4290	if (temp == NULL)
				4291	goto onError;
				4292	}
				4293	buf = PyUnicode_AS_UNICODE(temp);
				4294	len = PyUnicode_GET_SIZE(temp);
				4295	if (prec >= 0 && len > prec)
				4296	len = prec;
				4297	break;
				4298
				4299	case 'i':
				4300	case 'd':
				4301	case 'u':
				4302	case 'o':
				4303	case 'x':
				4304	case 'X':
				4305	if (c == 'i')
				4306	c = 'd';
				4307	buf = tmpbuf;
				4308	len = formatint(buf, flags, prec, c, v);
				4309	if (len < 0)
				4310	goto onError;
				4311	sign = (c == 'd');
				4312	if (flags & F_ZERO) {
				4313	fill = '0';
				4314	if ((flags&F_ALT) &&
				4315	(c == 'x' \|\| c == 'X') &&
				4316	buf[0] == '0' && buf[1] == c) {
				4317	res++ = buf++;
				4318	res++ = buf++;
				4319	rescnt -= 2;
				4320	len -= 2;
				4321	width -= 2;
				4322	if (width < 0)
				4323	width = 0;
				4324	}
				4325	}
				4326	break;
				4327
				4328	case 'e':
				4329	case 'E':
				4330	case 'f':
				4331	case 'g':
				4332	case 'G':
				4333	buf = tmpbuf;
				4334	len = formatfloat(buf, flags, prec, c, v);
				4335	if (len < 0)
				4336	goto onError;
				4337	sign = 1;
				4338	if (flags&F_ZERO)
				4339	fill = '0';
				4340	break;
				4341
				4342	case 'c':
				4343	buf = tmpbuf;
				4344	len = formatchar(buf, v);
				4345	if (len < 0)
				4346	goto onError;
				4347	break;
				4348
				4349	default:
				4350	PyErr_Format(PyExc_ValueError,
				4351	"unsupported format character '%c' (0x%x)",
				4352	c, c);
				4353	goto onError;
				4354	}
				4355	if (sign) {
				4356	if (buf == '-' \|\| buf == '+') {
				4357	sign = *buf++;
				4358	len--;
				4359	}
				4360	else if (flags & F_SIGN)
				4361	sign = '+';
				4362	else if (flags & F_BLANK)
				4363	sign = ' ';
				4364	else
				4365	sign = 0;
				4366	}
				4367	if (width < len)
				4368	width = len;
				4369	if (rescnt < width + (sign != 0)) {
				4370	reslen -= rescnt;
				4371	rescnt = width + fmtcnt + 100;
				4372	reslen += rescnt;
				4373	if (_PyUnicode_Resize(result, reslen) < 0)
				4374	return NULL;
				4375	res = PyUnicode_AS_UNICODE(result)
				4376	+ reslen - rescnt;
				4377	}
				4378	if (sign) {
				4379	if (fill != ' ')
				4380	*res++ = sign;
				4381	rescnt--;
				4382	if (width > len)
				4383	width--;
				4384	}
				4385	if (width > len && !(flags & F_LJUST)) {
				4386	do {
				4387	--rescnt;
				4388	*res++ = fill;
				4389	} while (--width > len);
				4390	}
				4391	if (sign && fill == ' ')
				4392	*res++ = sign;
				4393	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4394	res += len;
				4395	rescnt -= len;
				4396	while (--width >= len) {
				4397	--rescnt;
				4398	*res++ = ' ';
				4399	}
				4400	if (dict && (argidx < arglen) && c != '%') {
				4401	PyErr_SetString(PyExc_TypeError,
				4402	"not all arguments converted");
				4403	goto onError;
				4404	}
				4405	Py_XDECREF(temp);
				4406	} /* '%' */
				4407	} /* until end */
				4408	if (argidx < arglen && !dict) {
				4409	PyErr_SetString(PyExc_TypeError,
				4410	"not all arguments converted");
				4411	goto onError;
				4412	}
				4413
				4414	if (args_owned) {
				4415	Py_DECREF(args);
				4416	}
				4417	Py_DECREF(uformat);
				4418	_PyUnicode_Resize(result, reslen - rescnt);
				4419	return (PyObject *)result;
				4420
				4421	onError:
				4422	Py_XDECREF(result);
				4423	Py_DECREF(uformat);
				4424	if (args_owned) {
				4425	Py_DECREF(args);
				4426	}
				4427	return NULL;
				4428	}
				4429
				4430	static PyBufferProcs unicode_as_buffer = {
				4431	(getreadbufferproc) unicode_buffer_getreadbuf,
				4432	(getwritebufferproc) unicode_buffer_getwritebuf,
				4433	(getsegcountproc) unicode_buffer_getsegcount,
				4434	(getcharbufferproc) unicode_buffer_getcharbuf,
				4435	};
				4436
				4437	PyTypeObject PyUnicode_Type = {
				4438	PyObject_HEAD_INIT(&PyType_Type)
				4439	0, /* ob_size */
				4440	"unicode", /* tp_name */
				4441	sizeof(PyUnicodeObject), /* tp_size */
				4442	0, /* tp_itemsize */
				4443	/* Slots */
				4444	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4445	0, /* tp_print */
				4446	(getattrfunc)unicode_getattr, /* tp_getattr */
				4447	0, /* tp_setattr */
				4448	(cmpfunc) unicode_compare, /* tp_compare */
				4449	(reprfunc) unicode_repr, /* tp_repr */
				4450	0, /* tp_as_number */
				4451	&unicode_as_sequence, /* tp_as_sequence */
				4452	0, /* tp_as_mapping */
				4453	(hashfunc) unicode_hash, /* tp_hash*/
				4454	0, /* tp_call*/
				4455	(reprfunc) unicode_str, /* tp_str */
				4456	(getattrofunc) NULL, /* tp_getattro */
				4457	(setattrofunc) NULL, /* tp_setattro */
				4458	&unicode_as_buffer, /* tp_as_buffer */
				4459	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4460	};
				4461
				4462	/* Initialize the Unicode implementation */
				4463
				4464	void _PyUnicode_Init()
				4465	{
				4466	/* Doublecheck the configuration... */
				4467	if (sizeof(Py_UNICODE) != 2)
				4468	Py_FatalError("Unicode configuration error: "
				4469	"sizeof(Py_UNICODE) != 2 bytes");
				4470
				4471	unicode_empty = _PyUnicode_New(0);
				4472	}
				4473
				4474	/* Finalize the Unicode implementation */
				4475
				4476	void
				4477	_PyUnicode_Fini()
				4478	{
				4479	PyUnicodeObject *u = unicode_freelist;
				4480
				4481	while (u != NULL) {
				4482	PyUnicodeObject *v = u;
				4483	u = (PyUnicodeObject *)u;
				4484	free(v);
				4485	}
				4486	Py_XDECREF(unicode_empty);
				4487	}