Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 9c35e2d54457943334d967614dff7b35f52f828d [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
				4	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	79	/* Maximum number of deallocated Unicode objects kept on the free list */
				80
				81	#define MAX_UNICODE_FREELIST_SIZE 1024
				82
				83	/* Limit for the Unicode object free list stay alive optimization.
				84
				85	The implementation will keep allocated Unicode memory intact for
				86	all objects on the free list having a size less than this
				87	limit. This reduces malloc() overhead for small Unicode objects.
				88
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	89	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	90	(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
				91	malloc()-overhead) bytes of unused garbage.
				92
				93	Setting the limit to 0 effectively turns the feature off.
				94
				95	XXX The feature is currently turned off because there are
				96	apparently some lingering bugs in its implementation which I
				97	haven't yet been able to sort out.
				98
				99	*/
				100
				101	#define STAYALIVE_SIZE_LIMIT 0
				102
				103	/* Endianness switches: big endian when WORDS_BIGENDIAN is defined, little endian otherwise */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
				111	/* --- Globals ------------------------------------------------------------ */
				112
				113	/* The empty Unicode object; once set, it is handed out for zero-length requests */
				114	static PyUnicodeObject *unicode_empty = NULL;
				115
				116	/* Free list for Unicode objects, and the number of objects currently on it */
				117	static PyUnicodeObject *unicode_freelist = NULL;
				118	static int unicode_freelist_size = 0;
				119
				120	/* --- Unicode Object ----------------------------------------------------- */
				121
				122	static
				123	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				124	int length)
				125	{
				126	void *oldstr;
				127
				128	/* Shortcut if there's nothing to do. */
				129	if (unicode->length == length)
				130	return 0;
				131
				132	/* Resizing unicode_empty is not allowed. */
				133	if (unicode == unicode_empty) {
				134	PyErr_SetString(PyExc_SystemError,
				135	"can't resize empty unicode object");
				136	return -1;
				137	}
				138
				139	/* We allocate one more byte to make sure the string is
				140	Ux0000 terminated -- XXX is this needed ? */
				141	oldstr = unicode->str;
				142	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				143	if (!unicode->str) {
				144	unicode->str = oldstr;
				145	PyErr_NoMemory();
				146	return -1;
				147	}
				148	unicode->str[length] = 0;
				149	unicode->length = length;
				150
				151	/* Reset the object caches */
				152	if (unicode->utf8str) {
				153	Py_DECREF(unicode->utf8str);
				154	unicode->utf8str = NULL;
				155	}
				156	unicode->hash = -1;
				157
				158	return 0;
				159	}
				160
				161	/* We allocate one more byte to make sure the string is
				162	Ux0000 terminated -- XXX is this needed ?
				163
				164	XXX This allocator could further be enhanced by assuring that the
				165	free list never reduces its size below 1.
				166
				167	*/
				168
				169	static
				170	PyUnicodeObject *_PyUnicode_New(int length)
				171	{
				172	register PyUnicodeObject *unicode;
				173
				174	/* Optimization for empty strings */
				175	if (length == 0 && unicode_empty != NULL) {
				176	Py_INCREF(unicode_empty);
				177	return unicode_empty;
				178	}
				179
				180	/* Unicode freelist & memory allocation */
				181	if (unicode_freelist) {
				182	unicode = unicode_freelist;
				183	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				184	unicode_freelist_size--;
				185	unicode->ob_type = &PyUnicode_Type;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	186	_Py_NewReference((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	187	if (unicode->str) {
				188	if (unicode->length < length &&
				189	_PyUnicode_Resize(unicode, length)) {
				190	free(unicode->str);
				191	PyMem_DEL(unicode);
				192	return NULL;
				193	}
				194	}
				195	else
				196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				197	}
				198	else {
				199	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				200	if (unicode == NULL)
				201	return NULL;
				202	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				203	}
				204
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	205	if (!unicode->str)
				206	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	207	unicode->str[length] = 0;
				208	unicode->length = length;
				209	unicode->hash = -1;
				210	unicode->utf8str = NULL;
				211	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212
				213	onError:
				214	_Py_ForgetReference((PyObject *)unicode);
				215	PyMem_DEL(unicode);
				216	PyErr_NoMemory();
				217	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	}
				219
				220	static
				221	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				222	{
				223	Py_XDECREF(unicode->utf8str);
				224	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
				225	if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
				226	free(unicode->str);
				227	unicode->str = NULL;
				228	unicode->length = 0;
				229	}
				230	(PyUnicodeObject *)unicode = unicode_freelist;
				231	unicode_freelist = unicode;
				232	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	}
				234	else {
				235	free(unicode->str);
				236	PyMem_DEL(unicode);
				237	}
				238	}
				239
				240	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				241	int size)
				242	{
				243	PyUnicodeObject *unicode;
				244
				245	unicode = _PyUnicode_New(size);
				246	if (!unicode)
				247	return NULL;
				248
				249	/* Copy the Unicode data into the new object */
				250	if (u != NULL)
				251	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				252
				253	return (PyObject *)unicode;
				254	}
				255
				256	#ifdef HAVE_WCHAR_H
				257
				258	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				259	int size)
				260	{
				261	PyUnicodeObject *unicode;
				262
				263	if (w == NULL) {
				264	PyErr_BadInternalCall();
				265	return NULL;
				266	}
				267
				268	unicode = _PyUnicode_New(size);
				269	if (!unicode)
				270	return NULL;
				271
				272	/* Copy the wchar_t data into the new object */
				273	#ifdef HAVE_USABLE_WCHAR_T
				274	memcpy(unicode->str, w, size * sizeof(wchar_t));
				275	#else
				276	{
				277	register Py_UNICODE *u;
				278	register int i;
				279	u = PyUnicode_AS_UNICODE(unicode);
				280	for (i = size; i >= 0; i--)
				281	u++ = w++;
				282	}
				283	#endif
				284
				285	return (PyObject *)unicode;
				286	}
				287
				288	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				289	register wchar_t *w,
				290	int size)
				291	{
				292	if (unicode == NULL) {
				293	PyErr_BadInternalCall();
				294	return -1;
				295	}
				296	if (size > PyUnicode_GET_SIZE(unicode))
				297	size = PyUnicode_GET_SIZE(unicode);
				298	#ifdef HAVE_USABLE_WCHAR_T
				299	memcpy(w, unicode->str, size * sizeof(wchar_t));
				300	#else
				301	{
				302	register Py_UNICODE *u;
				303	register int i;
				304	u = PyUnicode_AS_UNICODE(unicode);
				305	for (i = size; i >= 0; i--)
				306	w++ = u++;
				307	}
				308	#endif
				309
				310	return size;
				311	}
				312
				313	#endif
				314
				315	PyObject PyUnicode_FromObject(register PyObject obj)
				316	{
				317	const char *s;
				318	int len;
				319
				320	if (obj == NULL) {
				321	PyErr_BadInternalCall();
				322	return NULL;
				323	}
				324	else if (PyUnicode_Check(obj)) {
				325	Py_INCREF(obj);
				326	return obj;
				327	}
				328	else if (PyString_Check(obj)) {
				329	s = PyString_AS_STRING(obj);
				330	len = PyString_GET_SIZE(obj);
				331	}
				332	else if (PyObject_AsCharBuffer(obj, &s, &len))
				333	return NULL;
				334	if (len == 0) {
				335	Py_INCREF(unicode_empty);
				336	return (PyObject *)unicode_empty;
				337	}
				338	return PyUnicode_DecodeUTF8(s, len, "strict");
				339	}
				340
				341	PyObject PyUnicode_Decode(const char s,
				342	int size,
				343	const char *encoding,
				344	const char *errors)
				345	{
				346	PyObject buffer = NULL, unicode;
				347
				348	/* Shortcut for the default encoding UTF-8 */
				349	if (encoding == NULL \|\|
				350	(strcmp(encoding, "utf-8") == 0))
				351	return PyUnicode_DecodeUTF8(s, size, errors);
				352
				353	/* Decode via the codec registry */
				354	buffer = PyBuffer_FromMemory((void *)s, size);
				355	if (buffer == NULL)
				356	goto onError;
				357	unicode = PyCodec_Decode(buffer, encoding, errors);
				358	if (unicode == NULL)
				359	goto onError;
				360	if (!PyUnicode_Check(unicode)) {
				361	PyErr_Format(PyExc_TypeError,
				362	"decoder did not return an unicode object (type=%s)",
				363	unicode->ob_type->tp_name);
				364	Py_DECREF(unicode);
				365	goto onError;
				366	}
				367	Py_DECREF(buffer);
				368	return unicode;
				369
				370	onError:
				371	Py_XDECREF(buffer);
				372	return NULL;
				373	}
				374
				375	PyObject PyUnicode_Encode(const Py_UNICODE s,
				376	int size,
				377	const char *encoding,
				378	const char *errors)
				379	{
				380	PyObject v, unicode;
				381
				382	unicode = PyUnicode_FromUnicode(s, size);
				383	if (unicode == NULL)
				384	return NULL;
				385	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				386	Py_DECREF(unicode);
				387	return v;
				388	}
				389
				390	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				391	const char *encoding,
				392	const char *errors)
				393	{
				394	PyObject *v;
				395
				396	if (!PyUnicode_Check(unicode)) {
				397	PyErr_BadArgument();
				398	goto onError;
				399	}
				400	/* Shortcut for the default encoding UTF-8 */
				401	if ((encoding == NULL \|\|
				402	(strcmp(encoding, "utf-8") == 0)) &&
				403	errors == NULL)
				404	return PyUnicode_AsUTF8String(unicode);
				405
				406	/* Encode via the codec registry */
				407	v = PyCodec_Encode(unicode, encoding, errors);
				408	if (v == NULL)
				409	goto onError;
				410	/* XXX Should we really enforce this ? */
				411	if (!PyString_Check(v)) {
				412	PyErr_Format(PyExc_TypeError,
				413	"encoder did not return a string object (type=%s)",
				414	v->ob_type->tp_name);
				415	Py_DECREF(v);
				416	goto onError;
				417	}
				418	return v;
				419
				420	onError:
				421	return NULL;
				422	}
				423
				424	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				425	{
				426	if (!PyUnicode_Check(unicode)) {
				427	PyErr_BadArgument();
				428	goto onError;
				429	}
				430	return PyUnicode_AS_UNICODE(unicode);
				431
				432	onError:
				433	return NULL;
				434	}
				435
				436	int PyUnicode_GetSize(PyObject *unicode)
				437	{
				438	if (!PyUnicode_Check(unicode)) {
				439	PyErr_BadArgument();
				440	goto onError;
				441	}
				442	return PyUnicode_GET_SIZE(unicode);
				443
				444	onError:
				445	return -1;
				446	}
				447
				448	/* --- UTF-8 Codec -------------------------------------------------------- */
				449
				450	static
				451	char utf8_code_length[256] = {
				452	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				453	illegal prefix. see RFC 2279 for details */
				454	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				455	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				456	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				457	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				458	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				459	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				460	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				461	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				462	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				463	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				464	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				465	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				466	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				467	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				468	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				469	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				470	};
				471
				472	static
				473	int utf8_decoding_error(const char **source,
				474	Py_UNICODE **dest,
				475	const char *errors,
				476	const char *details)
				477	{
				478	if ((errors == NULL) \|\|
				479	(strcmp(errors,"strict") == 0)) {
				480	PyErr_Format(PyExc_UnicodeError,
				481	"UTF-8 decoding error: %s",
				482	details);
				483	return -1;
				484	}
				485	else if (strcmp(errors,"ignore") == 0) {
				486	(*source)++;
				487	return 0;
				488	}
				489	else if (strcmp(errors,"replace") == 0) {
				490	(*source)++;
				491	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				492	(*dest)++;
				493	return 0;
				494	}
				495	else {
				496	PyErr_Format(PyExc_ValueError,
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	497	"UTF-8 decoding error; unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	498	errors);
				499	return -1;
				500	}
				501	}
				502
				503	#define UTF8_ERROR(details) do { \
				504	if (utf8_decoding_error(&s, &p, errors, details)) \
				505	goto onError; \
				506	continue; \
				507	} while (0)
				508
				509	PyObject PyUnicode_DecodeUTF8(const char s,
				510	int size,
				511	const char *errors)
				512	{
				513	int n;
				514	const char *e;
				515	PyUnicodeObject *unicode;
				516	Py_UNICODE *p;
				517
				518	/* Note: size will always be longer than the resulting Unicode
				519	character count */
				520	unicode = _PyUnicode_New(size);
				521	if (!unicode)
				522	return NULL;
				523	if (size == 0)
				524	return (PyObject *)unicode;
				525
				526	/* Unpack UTF-8 encoded data */
				527	p = unicode->str;
				528	e = s + size;
				529
				530	while (s < e) {
				531	register Py_UNICODE ch = (unsigned char)*s;
				532
				533	if (ch < 0x80) {
				534	*p++ = ch;
				535	s++;
				536	continue;
				537	}
				538
				539	n = utf8_code_length[ch];
				540
				541	if (s + n > e)
				542	UTF8_ERROR("unexpected end of data");
				543
				544	switch (n) {
				545
				546	case 0:
				547	UTF8_ERROR("unexpected code byte");
				548	break;
				549
				550	case 1:
				551	UTF8_ERROR("internal error");
				552	break;
				553
				554	case 2:
				555	if ((s[1] & 0xc0) != 0x80)
				556	UTF8_ERROR("invalid data");
				557	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				558	if (ch < 0x80)
				559	UTF8_ERROR("illegal encoding");
				560	else
				561	*p++ = ch;
				562	break;
				563
				564	case 3:
				565	if ((s[1] & 0xc0) != 0x80 \|\|
				566	(s[2] & 0xc0) != 0x80)
				567	UTF8_ERROR("invalid data");
				568	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				569	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				570	UTF8_ERROR("illegal encoding");
				571	else
				572	*p++ = ch;
				573	break;
				574
				575	default:
				576	/* Other sizes are only needed for UCS-4 */
				577	UTF8_ERROR("unsupported Unicode code range");
				578	}
				579	s += n;
				580	}
				581
				582	/* Adjust length */
				583	if (_PyUnicode_Resize(unicode, p - unicode->str))
				584	goto onError;
				585
				586	return (PyObject *)unicode;
				587
				588	onError:
				589	Py_DECREF(unicode);
				590	return NULL;
				591	}
				592
				593	#undef UTF8_ERROR
				594
				595	static
				596	int utf8_encoding_error(const Py_UNICODE **source,
				597	char **dest,
				598	const char *errors,
				599	const char *details)
				600	{
				601	if ((errors == NULL) \|\|
				602	(strcmp(errors,"strict") == 0)) {
				603	PyErr_Format(PyExc_UnicodeError,
				604	"UTF-8 encoding error: %s",
				605	details);
				606	return -1;
				607	}
				608	else if (strcmp(errors,"ignore") == 0) {
				609	return 0;
				610	}
				611	else if (strcmp(errors,"replace") == 0) {
				612	**dest = '?';
				613	(*dest)++;
				614	return 0;
				615	}
				616	else {
				617	PyErr_Format(PyExc_ValueError,
				618	"UTF-8 encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	619	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	620	errors);
				621	return -1;
				622	}
				623	}
				624
				625	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				626	int size,
				627	const char *errors)
				628	{
				629	PyObject *v;
				630	char *p;
				631	char *q;
				632
				633	v = PyString_FromStringAndSize(NULL, 3 * size);
				634	if (v == NULL)
				635	return NULL;
				636	if (size == 0)
				637	goto done;
				638
				639	p = q = PyString_AS_STRING(v);
				640	while (size-- > 0) {
				641	Py_UNICODE ch = *s++;
				642	if (ch < 0x80)
				643	*p++ = (char) ch;
				644	else if (ch < 0x0800) {
				645	*p++ = 0xc0 \| (ch >> 6);
				646	*p++ = 0x80 \| (ch & 0x3f);
				647	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				648	/* These byte ranges are reserved for UTF-16 surrogate
				649	bytes which the Python implementation currently does
				650	not support. */
				651	printf("code range problem: U+%04x\n", ch);
				652	if (utf8_encoding_error(&s, &p, errors,
				653	"unsupported code range"))
				654	goto onError;
				655	} else {
				656	*p++ = 0xe0 \| (ch >> 12);
				657	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				658	*p++ = 0x80 \| (ch & 0x3f);
				659	}
				660	}
				661	*p = '\0';
				662	_PyString_Resize(&v, p - q);
				663
				664	done:
				665	return v;
				666
				667	onError:
				668	Py_DECREF(v);
				669	return NULL;
				670	}
				671
				672	/* Return a Python string holding the UTF-8 encoded value of the
				673	Unicode object.
				674
				675	The resulting string is cached in the Unicode object for subsequent
				676	usage by this function. The cached version is needed to implement
				677	the character buffer interface.
				678
				679	The refcount of the string is not incremented.
				680
				681	*/
				682
				683	static
				684	PyObject utf8_string(PyUnicodeObject self,
				685	const char *errors)
				686	{
				687	PyObject *v = self->utf8str;
				688
				689	if (v)
				690	return v;
				691	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
				692	PyUnicode_GET_SIZE(self),
				693	errors);
				694	if (v && errors == NULL)
				695	self->utf8str = v;
				696	return v;
				697	}
				698
				699	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				700	{
				701	PyObject *str;
				702
				703	if (!PyUnicode_Check(unicode)) {
				704	PyErr_BadArgument();
				705	return NULL;
				706	}
				707	str = utf8_string((PyUnicodeObject *)unicode, NULL);
				708	if (str == NULL)
				709	return NULL;
				710	Py_INCREF(str);
				711	return str;
				712	}
				713
				714	/* --- UTF-16 Codec ------------------------------------------------------- */
				715
				716	static
				717	int utf16_decoding_error(const Py_UNICODE **source,
				718	Py_UNICODE **dest,
				719	const char *errors,
				720	const char *details)
				721	{
				722	if ((errors == NULL) \|\|
				723	(strcmp(errors,"strict") == 0)) {
				724	PyErr_Format(PyExc_UnicodeError,
				725	"UTF-16 decoding error: %s",
				726	details);
				727	return -1;
				728	}
				729	else if (strcmp(errors,"ignore") == 0) {
				730	return 0;
				731	}
				732	else if (strcmp(errors,"replace") == 0) {
				733	if (dest) {
				734	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				735	(*dest)++;
				736	}
				737	return 0;
				738	}
				739	else {
				740	PyErr_Format(PyExc_ValueError,
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	741	"UTF-16 decoding error; unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	742	errors);
				743	return -1;
				744	}
				745	}
				746
				747	#define UTF16_ERROR(details) do { \
				748	if (utf16_decoding_error(&q, &p, errors, details)) \
				749	goto onError; \
				750	continue; \
				751	} while(0)
				752
				753	PyObject PyUnicode_DecodeUTF16(const char s,
				754	int size,
				755	const char *errors,
				756	int *byteorder)
				757	{
				758	PyUnicodeObject *unicode;
				759	Py_UNICODE *p;
				760	const Py_UNICODE q, e;
				761	int bo = 0;
				762
				763	/* size should be an even number */
				764	if (size % sizeof(Py_UNICODE) != 0) {
				765	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				766	return NULL;
				767	/* The remaining input chars are ignored if we fall through
				768	here... */
				769	}
				770
				771	/* Note: size will always be longer than the resulting Unicode
				772	character count */
				773	unicode = _PyUnicode_New(size);
				774	if (!unicode)
				775	return NULL;
				776	if (size == 0)
				777	return (PyObject *)unicode;
				778
				779	/* Unpack UTF-16 encoded data */
				780	p = unicode->str;
				781	q = (Py_UNICODE *)s;
				782	e = q + (size / sizeof(Py_UNICODE));
				783
				784	if (byteorder)
				785	bo = *byteorder;
				786
				787	while (q < e) {
				788	register Py_UNICODE ch = *q++;
				789
				790	/* Check for BOM marks (U+FEFF) in the input and adjust
				791	current byte order setting accordingly. Swap input
				792	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				793	!) */
				794	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				795	if (ch == 0xFEFF) {
				796	bo = -1;
				797	continue;
				798	} else if (ch == 0xFFFE) {
				799	bo = 1;
				800	continue;
				801	}
				802	if (bo == 1)
				803	ch = (ch >> 8) \| (ch << 8);
				804	#else
				805	if (ch == 0xFEFF) {
				806	bo = 1;
				807	continue;
				808	} else if (ch == 0xFFFE) {
				809	bo = -1;
				810	continue;
				811	}
				812	if (bo == -1)
				813	ch = (ch >> 8) \| (ch << 8);
				814	#endif
				815	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				816	*p++ = ch;
				817	continue;
				818	}
				819
				820	/* UTF-16 code pair: */
				821	if (q >= e)
				822	UTF16_ERROR("unexpected end of data");
				823	if (0xDC00 <= q && q <= 0xDFFF) {
				824	q++;
				825	if (0xD800 <= q && q <= 0xDBFF)
				826	/* This is valid data (a UTF-16 surrogate pair), but
				827	we are not able to store this information since our
				828	Py_UNICODE type only has 16 bits... this might
				829	change someday, even though it's unlikely. */
				830	UTF16_ERROR("code pairs are not supported");
				831	else
				832	continue;
				833	}
				834	UTF16_ERROR("illegal encoding");
				835	}
				836
				837	if (byteorder)
				838	*byteorder = bo;
				839
				840	/* Adjust length */
				841	if (_PyUnicode_Resize(unicode, p - unicode->str))
				842	goto onError;
				843
				844	return (PyObject *)unicode;
				845
				846	onError:
				847	Py_DECREF(unicode);
				848	return NULL;
				849	}
				850
				851	#undef UTF16_ERROR
				852
				853	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				854	int size,
				855	const char *errors,
				856	int byteorder)
				857	{
				858	PyObject *v;
				859	Py_UNICODE *p;
				860	char *q;
				861
				862	/* We don't create UTF-16 pairs... */
				863	v = PyString_FromStringAndSize(NULL,
				864	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				865	if (v == NULL)
				866	return NULL;
				867	if (size == 0)
				868	goto done;
				869
				870	q = PyString_AS_STRING(v);
				871	p = (Py_UNICODE *)q;
				872
				873	if (byteorder == 0)
				874	*p++ = 0xFEFF;
				875	if (byteorder == 0 \|\|
				876	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				877	byteorder == -1
				878	#else
				879	byteorder == 1
				880	#endif
				881	)
				882	memcpy(p, s, size * sizeof(Py_UNICODE));
				883	else
				884	while (size-- > 0) {
				885	Py_UNICODE ch = *s++;
				886	*p++ = (ch >> 8) \| (ch << 8);
				887	}
				888	done:
				889	return v;
				890	}
				891
				892	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				893	{
				894	if (!PyUnicode_Check(unicode)) {
				895	PyErr_BadArgument();
				896	return NULL;
				897	}
				898	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				899	PyUnicode_GET_SIZE(unicode),
				900	NULL,
				901	0);
				902	}
				903
				904	/* --- Unicode Escape Codec ----------------------------------------------- */
				905
				906	static
				907	int unicodeescape_decoding_error(const char **source,
				908	unsigned int *x,
				909	const char *errors,
				910	const char *details)
				911	{
				912	if ((errors == NULL) \|\|
				913	(strcmp(errors,"strict") == 0)) {
				914	PyErr_Format(PyExc_UnicodeError,
				915	"Unicode-Escape decoding error: %s",
				916	details);
				917	return -1;
				918	}
				919	else if (strcmp(errors,"ignore") == 0) {
				920	return 0;
				921	}
				922	else if (strcmp(errors,"replace") == 0) {
				923	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				924	return 0;
				925	}
				926	else {
				927	PyErr_Format(PyExc_ValueError,
				928	"Unicode-Escape decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	929	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	930	errors);
				931	return -1;
				932	}
				933	}
				934
				935	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				936	int size,
				937	const char *errors)
				938	{
				939	PyUnicodeObject *v;
				940	Py_UNICODE p = NULL, buf = NULL;
				941	const char *end;
				942
				943	/* Escaped strings will always be longer than the resulting
				944	Unicode string, so we start with size here and then reduce the
				945	length after conversion to the true value. */
				946	v = _PyUnicode_New(size);
				947	if (v == NULL)
				948	goto onError;
				949	if (size == 0)
				950	return (PyObject *)v;
				951	p = buf = PyUnicode_AS_UNICODE(v);
				952	end = s + size;
				953	while (s < end) {
				954	unsigned char c;
				955	unsigned int x;
				956	int i;
				957
				958	/* Non-escape characters are interpreted as Unicode ordinals */
				959	if (*s != '\\') {
				960	p++ = (unsigned char)s++;
				961	continue;
				962	}
				963
				964	/* \ - Escapes */
				965	s++;
				966	switch (*s++) {
				967
				968	/* \x escapes */
				969	case '\n': break;
				970	case '\\': *p++ = '\\'; break;
				971	case '\'': *p++ = '\''; break;
				972	case '\"': *p++ = '\"'; break;
				973	case 'b': *p++ = '\b'; break;
				974	case 'f': p++ = '\014'; break; / FF */
				975	case 't': *p++ = '\t'; break;
				976	case 'n': *p++ = '\n'; break;
				977	case 'r': *p++ = '\r'; break;
				978	case 'v': p++ = '\013'; break; / VT */
				979	case 'a': p++ = '\007'; break; / BEL, not classic C */
				980
				981	/* \OOO (octal) escapes */
				982	case '0': case '1': case '2': case '3':
				983	case '4': case '5': case '6': case '7':
				984	c = s[-1] - '0';
				985	if ('0' <= s && s <= '7') {
				986	c = (c<<3) + *s++ - '0';
				987	if ('0' <= s && s <= '7')
				988	c = (c<<3) + *s++ - '0';
				989	}
				990	*p++ = c;
				991	break;
				992
				993	/* \xXXXX escape with 0-4 hex digits */
				994	case 'x':
				995	x = 0;
				996	c = (unsigned char)*s;
				997	if (isxdigit(c)) {
				998	do {
				999	x = (x<<4) & ~0xF;
				1000	if ('0' <= c && c <= '9')
				1001	x += c - '0';
				1002	else if ('a' <= c && c <= 'f')
				1003	x += 10 + c - 'a';
				1004	else
				1005	x += 10 + c - 'A';
				1006	c = (unsigned char)*++s;
				1007	} while (isxdigit(c));
				1008	*p++ = x;
				1009	} else {
				1010	*p++ = '\\';
				1011	*p++ = (unsigned char)s[-1];
				1012	}
				1013	break;
				1014
				1015	/* \uXXXX with 4 hex digits */
				1016	case 'u':
				1017	for (x = 0, i = 0; i < 4; i++) {
				1018	c = (unsigned char)s[i];
				1019	if (!isxdigit(c)) {
				1020	if (unicodeescape_decoding_error(&s, &x, errors,
				1021	"truncated \\uXXXX"))
				1022	goto onError;
				1023	i++;
				1024	break;
				1025	}
				1026	x = (x<<4) & ~0xF;
				1027	if (c >= '0' && c <= '9')
				1028	x += c - '0';
				1029	else if (c >= 'a' && c <= 'f')
				1030	x += 10 + c - 'a';
				1031	else
				1032	x += 10 + c - 'A';
				1033	}
				1034	s += i;
				1035	*p++ = x;
				1036	break;
				1037
				1038	default:
				1039	*p++ = '\\';
				1040	*p++ = (unsigned char)s[-1];
				1041	break;
				1042	}
				1043	}
				1044	_PyUnicode_Resize(v, (int)(p - buf));
				1045	return (PyObject *)v;
				1046
				1047	onError:
				1048	Py_XDECREF(v);
				1049	return NULL;
				1050	}
				1051
				1052	/* Return a Unicode-Escape string version of the Unicode object.
				1053
				1054	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1055	appropriate.
				1056
				1057	*/
				1058
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1059	static const Py_UNICODE findchar(const Py_UNICODE s,
				1060	int size,
				1061	Py_UNICODE ch);
				1062
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1063	static
				1064	PyObject unicodeescape_string(const Py_UNICODE s,
				1065	int size,
				1066	int quotes)
				1067	{
				1068	PyObject *repr;
				1069	char *p;
				1070	char *q;
				1071
				1072	static const char *hexdigit = "0123456789ABCDEF";
				1073
				1074	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1075	if (repr == NULL)
				1076	return NULL;
				1077
				1078	p = q = PyString_AS_STRING(repr);
				1079
				1080	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1081	*p++ = 'u';
				1082	*p++ = (findchar(s, size, '\'') &&
				1083	!findchar(s, size, '"')) ? '"' : '\'';
				1084	}
				1085	while (size-- > 0) {
				1086	Py_UNICODE ch = *s++;
				1087	/* Escape quotes */
				1088	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1089	*p++ = '\\';
				1090	*p++ = (char) ch;
				1091	}
				1092	/* Map 16-bit characters to '\uxxxx' */
				1093	else if (ch >= 256) {
				1094	*p++ = '\\';
				1095	*p++ = 'u';
				1096	*p++ = hexdigit[(ch >> 12) & 0xf];
				1097	*p++ = hexdigit[(ch >> 8) & 0xf];
				1098	*p++ = hexdigit[(ch >> 4) & 0xf];
				1099	*p++ = hexdigit[ch & 15];
				1100	}
				1101	/* Map non-printable US ASCII to '\ooo' */
				1102	else if (ch < ' ' \|\| ch >= 128) {
				1103	*p++ = '\\';
				1104	*p++ = hexdigit[(ch >> 6) & 7];
				1105	*p++ = hexdigit[(ch >> 3) & 7];
				1106	*p++ = hexdigit[ch & 7];
				1107	}
				1108	/* Copy everything else as-is */
				1109	else
				1110	*p++ = (char) ch;
				1111	}
				1112	if (quotes)
				1113	*p++ = q[1];
				1114
				1115	*p = '\0';
				1116	_PyString_Resize(&repr, p - q);
				1117
				1118	return repr;
				1119	}
				1120
				1121	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1122	int size)
				1123	{
				1124	return unicodeescape_string(s, size, 0);
				1125	}
				1126
				1127	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1128	{
				1129	if (!PyUnicode_Check(unicode)) {
				1130	PyErr_BadArgument();
				1131	return NULL;
				1132	}
				1133	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1134	PyUnicode_GET_SIZE(unicode));
				1135	}
				1136
				1137	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1138
				1139	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1140	int size,
				1141	const char *errors)
				1142	{
				1143	PyUnicodeObject *v;
				1144	Py_UNICODE p, buf;
				1145	const char *end;
				1146	const char *bs;
				1147
				1148	/* Escaped strings will always be longer than the resulting
				1149	Unicode string, so we start with size here and then reduce the
				1150	length after conversion to the true value. */
				1151	v = _PyUnicode_New(size);
				1152	if (v == NULL)
				1153	goto onError;
				1154	if (size == 0)
				1155	return (PyObject *)v;
				1156	p = buf = PyUnicode_AS_UNICODE(v);
				1157	end = s + size;
				1158	while (s < end) {
				1159	unsigned char c;
				1160	unsigned int x;
				1161	int i;
				1162
				1163	/* Non-escape characters are interpreted as Unicode ordinals */
				1164	if (*s != '\\') {
				1165	p++ = (unsigned char)s++;
				1166	continue;
				1167	}
				1168
				1169	/* \u-escapes are only interpreted iff the number of leading
				1170	backslashes if odd */
				1171	bs = s;
				1172	for (;s < end;) {
				1173	if (*s != '\\')
				1174	break;
				1175	p++ = (unsigned char)s++;
				1176	}
				1177	if (((s - bs) & 1) == 0 \|\|
				1178	s >= end \|\|
				1179	*s != 'u') {
				1180	continue;
				1181	}
				1182	p--;
				1183	s++;
				1184
				1185	/* \uXXXX with 4 hex digits */
				1186	for (x = 0, i = 0; i < 4; i++) {
				1187	c = (unsigned char)s[i];
				1188	if (!isxdigit(c)) {
				1189	if (unicodeescape_decoding_error(&s, &x, errors,
				1190	"truncated \\uXXXX"))
				1191	goto onError;
				1192	i++;
				1193	break;
				1194	}
				1195	x = (x<<4) & ~0xF;
				1196	if (c >= '0' && c <= '9')
				1197	x += c - '0';
				1198	else if (c >= 'a' && c <= 'f')
				1199	x += 10 + c - 'a';
				1200	else
				1201	x += 10 + c - 'A';
				1202	}
				1203	s += i;
				1204	*p++ = x;
				1205	}
				1206	_PyUnicode_Resize(v, (int)(p - buf));
				1207	return (PyObject *)v;
				1208
				1209	onError:
				1210	Py_XDECREF(v);
				1211	return NULL;
				1212	}
				1213
				1214	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1215	int size)
				1216	{
				1217	PyObject *repr;
				1218	char *p;
				1219	char *q;
				1220
				1221	static const char *hexdigit = "0123456789ABCDEF";
				1222
				1223	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1224	if (repr == NULL)
				1225	return NULL;
				1226
				1227	p = q = PyString_AS_STRING(repr);
				1228	while (size-- > 0) {
				1229	Py_UNICODE ch = *s++;
				1230	/* Map 16-bit characters to '\uxxxx' */
				1231	if (ch >= 256) {
				1232	*p++ = '\\';
				1233	*p++ = 'u';
				1234	*p++ = hexdigit[(ch >> 12) & 0xf];
				1235	*p++ = hexdigit[(ch >> 8) & 0xf];
				1236	*p++ = hexdigit[(ch >> 4) & 0xf];
				1237	*p++ = hexdigit[ch & 15];
				1238	}
				1239	/* Copy everything else as-is */
				1240	else
				1241	*p++ = (char) ch;
				1242	}
				1243	*p = '\0';
				1244	_PyString_Resize(&repr, p - q);
				1245
				1246	return repr;
				1247	}
				1248
				1249	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1250	{
				1251	if (!PyUnicode_Check(unicode)) {
				1252	PyErr_BadArgument();
				1253	return NULL;
				1254	}
				1255	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1256	PyUnicode_GET_SIZE(unicode));
				1257	}
				1258
				1259	/* --- Latin-1 Codec ------------------------------------------------------ */
				1260
				1261	PyObject PyUnicode_DecodeLatin1(const char s,
				1262	int size,
				1263	const char *errors)
				1264	{
				1265	PyUnicodeObject *v;
				1266	Py_UNICODE *p;
				1267
				1268	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1269	v = _PyUnicode_New(size);
				1270	if (v == NULL)
				1271	goto onError;
				1272	if (size == 0)
				1273	return (PyObject *)v;
				1274	p = PyUnicode_AS_UNICODE(v);
				1275	while (size-- > 0)
				1276	p++ = (unsigned char)s++;
				1277	return (PyObject *)v;
				1278
				1279	onError:
				1280	Py_XDECREF(v);
				1281	return NULL;
				1282	}
				1283
				1284	static
				1285	int latin1_encoding_error(const Py_UNICODE **source,
				1286	char **dest,
				1287	const char *errors,
				1288	const char *details)
				1289	{
				1290	if ((errors == NULL) \|\|
				1291	(strcmp(errors,"strict") == 0)) {
				1292	PyErr_Format(PyExc_UnicodeError,
				1293	"Latin-1 encoding error: %s",
				1294	details);
				1295	return -1;
				1296	}
				1297	else if (strcmp(errors,"ignore") == 0) {
				1298	return 0;
				1299	}
				1300	else if (strcmp(errors,"replace") == 0) {
				1301	**dest = '?';
				1302	return 0;
				1303	}
				1304	else {
				1305	PyErr_Format(PyExc_ValueError,
				1306	"Latin-1 encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1307	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1308	errors);
				1309	return -1;
				1310	}
				1311	}
				1312
				1313	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1314	int size,
				1315	const char *errors)
				1316	{
				1317	PyObject *repr;
				1318	char *s;
				1319	repr = PyString_FromStringAndSize(NULL, size);
				1320	if (repr == NULL)
				1321	return NULL;
				1322
				1323	s = PyString_AS_STRING(repr);
				1324	while (size-- > 0) {
				1325	Py_UNICODE ch = *p++;
				1326	if (ch >= 256) {
				1327	if (latin1_encoding_error(&p, &s, errors,
				1328	"ordinal not in range(256)"))
				1329	goto onError;
				1330	}
				1331	else
				1332	*s++ = (char)ch;
				1333	}
				1334	return repr;
				1335
				1336	onError:
				1337	Py_DECREF(repr);
				1338	return NULL;
				1339	}
				1340
				1341	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1342	{
				1343	if (!PyUnicode_Check(unicode)) {
				1344	PyErr_BadArgument();
				1345	return NULL;
				1346	}
				1347	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1348	PyUnicode_GET_SIZE(unicode),
				1349	NULL);
				1350	}
				1351
				1352	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1353
				1354	static
				1355	int ascii_decoding_error(const char **source,
				1356	Py_UNICODE **dest,
				1357	const char *errors,
				1358	const char *details)
				1359	{
				1360	if ((errors == NULL) \|\|
				1361	(strcmp(errors,"strict") == 0)) {
				1362	PyErr_Format(PyExc_UnicodeError,
				1363	"ASCII decoding error: %s",
				1364	details);
				1365	return -1;
				1366	}
				1367	else if (strcmp(errors,"ignore") == 0) {
				1368	return 0;
				1369	}
				1370	else if (strcmp(errors,"replace") == 0) {
				1371	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1372	(*dest)++;
				1373	return 0;
				1374	}
				1375	else {
				1376	PyErr_Format(PyExc_ValueError,
				1377	"ASCII decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1378	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1379	errors);
				1380	return -1;
				1381	}
				1382	}
				1383
				1384	PyObject PyUnicode_DecodeASCII(const char s,
				1385	int size,
				1386	const char *errors)
				1387	{
				1388	PyUnicodeObject *v;
				1389	Py_UNICODE *p;
				1390
				1391	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1392	v = _PyUnicode_New(size);
				1393	if (v == NULL)
				1394	goto onError;
				1395	if (size == 0)
				1396	return (PyObject *)v;
				1397	p = PyUnicode_AS_UNICODE(v);
				1398	while (size-- > 0) {
				1399	register unsigned char c;
				1400
				1401	c = (unsigned char)*s++;
				1402	if (c < 128)
				1403	*p++ = c;
				1404	else if (ascii_decoding_error(&s, &p, errors,
				1405	"ordinal not in range(128)"))
				1406	goto onError;
				1407	}
				1408	if (p - PyUnicode_AS_UNICODE(v) < size)
				1409	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1410	return (PyObject *)v;
				1411
				1412	onError:
				1413	Py_XDECREF(v);
				1414	return NULL;
				1415	}
				1416
				1417	static
				1418	int ascii_encoding_error(const Py_UNICODE **source,
				1419	char **dest,
				1420	const char *errors,
				1421	const char *details)
				1422	{
				1423	if ((errors == NULL) \|\|
				1424	(strcmp(errors,"strict") == 0)) {
				1425	PyErr_Format(PyExc_UnicodeError,
				1426	"ASCII encoding error: %s",
				1427	details);
				1428	return -1;
				1429	}
				1430	else if (strcmp(errors,"ignore") == 0) {
				1431	return 0;
				1432	}
				1433	else if (strcmp(errors,"replace") == 0) {
				1434	**dest = '?';
				1435	return 0;
				1436	}
				1437	else {
				1438	PyErr_Format(PyExc_ValueError,
				1439	"ASCII encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1440	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1441	errors);
				1442	return -1;
				1443	}
				1444	}
				1445
				1446	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1447	int size,
				1448	const char *errors)
				1449	{
				1450	PyObject *repr;
				1451	char *s;
				1452	repr = PyString_FromStringAndSize(NULL, size);
				1453	if (repr == NULL)
				1454	return NULL;
				1455
				1456	s = PyString_AS_STRING(repr);
				1457	while (size-- > 0) {
				1458	Py_UNICODE ch = *p++;
				1459	if (ch >= 128) {
				1460	if (ascii_encoding_error(&p, &s, errors,
				1461	"ordinal not in range(128)"))
				1462	goto onError;
				1463	}
				1464	else
				1465	*s++ = (char)ch;
				1466	}
				1467	return repr;
				1468
				1469	onError:
				1470	Py_DECREF(repr);
				1471	return NULL;
				1472	}
				1473
				1474	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1475	{
				1476	if (!PyUnicode_Check(unicode)) {
				1477	PyErr_BadArgument();
				1478	return NULL;
				1479	}
				1480	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1481	PyUnicode_GET_SIZE(unicode),
				1482	NULL);
				1483	}
				1484
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1485	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame^]	1486
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1487	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame^]	1488
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1489	PyObject PyUnicode_DecodeMBCS(const char s,
				1490	int size,
				1491	const char *errors)
				1492	{
				1493	PyUnicodeObject *v;
				1494	Py_UNICODE *p;
				1495
				1496	/* First get the size of the result */
				1497	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
				1498	if (usize==0)
				1499	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1500
				1501	v = _PyUnicode_New(usize);
				1502	if (v == NULL)
				1503	return NULL;
				1504	if (usize == 0)
				1505	return (PyObject *)v;
				1506	p = PyUnicode_AS_UNICODE(v);
				1507	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1508	Py_DECREF(v);
				1509	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1510	}
				1511
				1512	return (PyObject *)v;
				1513	}
				1514
				1515	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1516	int size,
				1517	const char *errors)
				1518	{
				1519	PyObject *repr;
				1520	char *s;
				1521
				1522	/* First get the size of the result */
				1523	DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
				1524	if (mbcssize==0)
				1525	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1526
				1527	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1528	if (repr == NULL)
				1529	return NULL;
				1530	if (mbcssize==0)
				1531	return repr;
				1532
				1533	/* Do the conversion */
				1534	s = PyString_AS_STRING(repr);
				1535	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1536	Py_DECREF(repr);
				1537	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1538	}
				1539	return repr;
				1540	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame^]	1541
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1542	#endif /* MS_WIN32 */
				1543
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1544	/* --- Character Mapping Codec -------------------------------------------- */
				1545
				1546	static
				1547	int charmap_decoding_error(const char **source,
				1548	Py_UNICODE **dest,
				1549	const char *errors,
				1550	const char *details)
				1551	{
				1552	if ((errors == NULL) \|\|
				1553	(strcmp(errors,"strict") == 0)) {
				1554	PyErr_Format(PyExc_UnicodeError,
				1555	"charmap decoding error: %s",
				1556	details);
				1557	return -1;
				1558	}
				1559	else if (strcmp(errors,"ignore") == 0) {
				1560	return 0;
				1561	}
				1562	else if (strcmp(errors,"replace") == 0) {
				1563	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1564	(*dest)++;
				1565	return 0;
				1566	}
				1567	else {
				1568	PyErr_Format(PyExc_ValueError,
				1569	"charmap decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1570	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1571	errors);
				1572	return -1;
				1573	}
				1574	}
				1575
				1576	PyObject PyUnicode_DecodeCharmap(const char s,
				1577	int size,
				1578	PyObject *mapping,
				1579	const char *errors)
				1580	{
				1581	PyUnicodeObject *v;
				1582	Py_UNICODE *p;
				1583
				1584	/* Default to Latin-1 */
				1585	if (mapping == NULL)
				1586	return PyUnicode_DecodeLatin1(s, size, errors);
				1587
				1588	v = _PyUnicode_New(size);
				1589	if (v == NULL)
				1590	goto onError;
				1591	if (size == 0)
				1592	return (PyObject *)v;
				1593	p = PyUnicode_AS_UNICODE(v);
				1594	while (size-- > 0) {
				1595	unsigned char ch = *s++;
				1596	PyObject w, x;
				1597
				1598	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1599	w = PyInt_FromLong((long)ch);
				1600	if (w == NULL)
				1601	goto onError;
				1602	x = PyObject_GetItem(mapping, w);
				1603	Py_DECREF(w);
				1604	if (x == NULL) {
				1605	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1606	/* No mapping found: default to Latin-1 mapping */
				1607	PyErr_Clear();
				1608	*p++ = (Py_UNICODE)ch;
				1609	continue;
				1610	}
				1611	goto onError;
				1612	}
				1613
				1614	/* Apply mapping */
				1615	if (PyInt_Check(x)) {
				1616	int value = PyInt_AS_LONG(x);
				1617	if (value < 0 \|\| value > 65535) {
				1618	PyErr_SetString(PyExc_TypeError,
				1619	"character mapping must be in range(65336)");
				1620	Py_DECREF(x);
				1621	goto onError;
				1622	}
				1623	*p++ = (Py_UNICODE)value;
				1624	}
				1625	else if (x == Py_None) {
				1626	/* undefined mapping */
				1627	if (charmap_decoding_error(&s, &p, errors,
				1628	"character maps to <undefined>")) {
				1629	Py_DECREF(x);
				1630	goto onError;
				1631	}
				1632	}
				1633	else if (PyUnicode_Check(x)) {
				1634	if (PyUnicode_GET_SIZE(x) != 1) {
				1635	/* 1-n mapping */
				1636	PyErr_SetString(PyExc_NotImplementedError,
				1637	"1-n mappings are currently not implemented");
				1638	Py_DECREF(x);
				1639	goto onError;
				1640	}
				1641	p++ = PyUnicode_AS_UNICODE(x);
				1642	}
				1643	else {
				1644	/* wrong return value */
				1645	PyErr_SetString(PyExc_TypeError,
				1646	"character mapping must return integer, None or unicode");
				1647	Py_DECREF(x);
				1648	goto onError;
				1649	}
				1650	Py_DECREF(x);
				1651	}
				1652	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1653	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1654	goto onError;
				1655	return (PyObject *)v;
				1656
				1657	onError:
				1658	Py_XDECREF(v);
				1659	return NULL;
				1660	}
				1661
				1662	static
				1663	int charmap_encoding_error(const Py_UNICODE **source,
				1664	char **dest,
				1665	const char *errors,
				1666	const char *details)
				1667	{
				1668	if ((errors == NULL) \|\|
				1669	(strcmp(errors,"strict") == 0)) {
				1670	PyErr_Format(PyExc_UnicodeError,
				1671	"charmap encoding error: %s",
				1672	details);
				1673	return -1;
				1674	}
				1675	else if (strcmp(errors,"ignore") == 0) {
				1676	return 0;
				1677	}
				1678	else if (strcmp(errors,"replace") == 0) {
				1679	**dest = '?';
				1680	(*dest)++;
				1681	return 0;
				1682	}
				1683	else {
				1684	PyErr_Format(PyExc_ValueError,
				1685	"charmap encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1686	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1687	errors);
				1688	return -1;
				1689	}
				1690	}
				1691
				1692	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1693	int size,
				1694	PyObject *mapping,
				1695	const char *errors)
				1696	{
				1697	PyObject *v;
				1698	char *s;
				1699
				1700	/* Default to Latin-1 */
				1701	if (mapping == NULL)
				1702	return PyUnicode_EncodeLatin1(p, size, errors);
				1703
				1704	v = PyString_FromStringAndSize(NULL, size);
				1705	if (v == NULL)
				1706	return NULL;
				1707	s = PyString_AS_STRING(v);
				1708	while (size-- > 0) {
				1709	Py_UNICODE ch = *p++;
				1710	PyObject w, x;
				1711
				1712	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1713	w = PyInt_FromLong((long)ch);
				1714	if (w == NULL)
				1715	goto onError;
				1716	x = PyObject_GetItem(mapping, w);
				1717	Py_DECREF(w);
				1718	if (x == NULL) {
				1719	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1720	/* No mapping found: default to Latin-1 mapping if possible */
				1721	PyErr_Clear();
				1722	if (ch < 256) {
				1723	*s++ = (char)ch;
				1724	continue;
				1725	}
				1726	else if (!charmap_encoding_error(&p, &s, errors,
				1727	"missing character mapping"))
				1728	continue;
				1729	}
				1730	goto onError;
				1731	}
				1732
				1733	/* Apply mapping */
				1734	if (PyInt_Check(x)) {
				1735	int value = PyInt_AS_LONG(x);
				1736	if (value < 0 \|\| value > 255) {
				1737	PyErr_SetString(PyExc_TypeError,
				1738	"character mapping must be in range(256)");
				1739	Py_DECREF(x);
				1740	goto onError;
				1741	}
				1742	*s++ = (char)value;
				1743	}
				1744	else if (x == Py_None) {
				1745	/* undefined mapping */
				1746	if (charmap_encoding_error(&p, &s, errors,
				1747	"character maps to <undefined>")) {
				1748	Py_DECREF(x);
				1749	goto onError;
				1750	}
				1751	}
				1752	else if (PyString_Check(x)) {
				1753	if (PyString_GET_SIZE(x) != 1) {
				1754	/* 1-n mapping */
				1755	PyErr_SetString(PyExc_NotImplementedError,
				1756	"1-n mappings are currently not implemented");
				1757	Py_DECREF(x);
				1758	goto onError;
				1759	}
				1760	s++ = PyString_AS_STRING(x);
				1761	}
				1762	else {
				1763	/* wrong return value */
				1764	PyErr_SetString(PyExc_TypeError,
				1765	"character mapping must return integer, None or unicode");
				1766	Py_DECREF(x);
				1767	goto onError;
				1768	}
				1769	Py_DECREF(x);
				1770	}
				1771	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1772	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1773	goto onError;
				1774	return v;
				1775
				1776	onError:
				1777	Py_DECREF(v);
				1778	return NULL;
				1779	}
				1780
				1781	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1782	PyObject *mapping)
				1783	{
				1784	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1785	PyErr_BadArgument();
				1786	return NULL;
				1787	}
				1788	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1789	PyUnicode_GET_SIZE(unicode),
				1790	mapping,
				1791	NULL);
				1792	}
				1793
				1794	static
				1795	int translate_error(const Py_UNICODE **source,
				1796	Py_UNICODE **dest,
				1797	const char *errors,
				1798	const char *details)
				1799	{
				1800	if ((errors == NULL) \|\|
				1801	(strcmp(errors,"strict") == 0)) {
				1802	PyErr_Format(PyExc_UnicodeError,
				1803	"translate error: %s",
				1804	details);
				1805	return -1;
				1806	}
				1807	else if (strcmp(errors,"ignore") == 0) {
				1808	return 0;
				1809	}
				1810	else if (strcmp(errors,"replace") == 0) {
				1811	**dest = '?';
				1812	(*dest)++;
				1813	return 0;
				1814	}
				1815	else {
				1816	PyErr_Format(PyExc_ValueError,
				1817	"translate error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1818	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1819	errors);
				1820	return -1;
				1821	}
				1822	}
				1823
				1824	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1825	int size,
				1826	PyObject *mapping,
				1827	const char *errors)
				1828	{
				1829	PyUnicodeObject *v;
				1830	Py_UNICODE *p;
				1831
				1832	if (mapping == NULL) {
				1833	PyErr_BadArgument();
				1834	return NULL;
				1835	}
				1836
				1837	/* Output will never be longer than input */
				1838	v = _PyUnicode_New(size);
				1839	if (v == NULL)
				1840	goto onError;
				1841	if (size == 0)
				1842	goto done;
				1843	p = PyUnicode_AS_UNICODE(v);
				1844	while (size-- > 0) {
				1845	Py_UNICODE ch = *s++;
				1846	PyObject w, x;
				1847
				1848	/* Get mapping */
				1849	w = PyInt_FromLong(ch);
				1850	if (w == NULL)
				1851	goto onError;
				1852	x = PyObject_GetItem(mapping, w);
				1853	Py_DECREF(w);
				1854	if (x == NULL) {
				1855	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1856	/* No mapping found: default to 1-1 mapping */
				1857	PyErr_Clear();
				1858	*p++ = ch;
				1859	continue;
				1860	}
				1861	goto onError;
				1862	}
				1863
				1864	/* Apply mapping */
				1865	if (PyInt_Check(x))
				1866	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1867	else if (x == Py_None) {
				1868	/* undefined mapping */
				1869	if (translate_error(&s, &p, errors,
				1870	"character maps to <undefined>")) {
				1871	Py_DECREF(x);
				1872	goto onError;
				1873	}
				1874	}
				1875	else if (PyUnicode_Check(x)) {
				1876	if (PyUnicode_GET_SIZE(x) != 1) {
				1877	/* 1-n mapping */
				1878	PyErr_SetString(PyExc_NotImplementedError,
				1879	"1-n mappings are currently not implemented");
				1880	Py_DECREF(x);
				1881	goto onError;
				1882	}
				1883	p++ = PyUnicode_AS_UNICODE(x);
				1884	}
				1885	else {
				1886	/* wrong return value */
				1887	PyErr_SetString(PyExc_TypeError,
				1888	"translate mapping must return integer, None or unicode");
				1889	Py_DECREF(x);
				1890	goto onError;
				1891	}
				1892	Py_DECREF(x);
				1893	}
				1894	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1895	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1896
				1897	done:
				1898	return (PyObject *)v;
				1899
				1900	onError:
				1901	Py_XDECREF(v);
				1902	return NULL;
				1903	}
				1904
				1905	PyObject PyUnicode_Translate(PyObject str,
				1906	PyObject *mapping,
				1907	const char *errors)
				1908	{
				1909	PyObject *result;
				1910
				1911	str = PyUnicode_FromObject(str);
				1912	if (str == NULL)
				1913	goto onError;
				1914	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				1915	PyUnicode_GET_SIZE(str),
				1916	mapping,
				1917	errors);
				1918	Py_DECREF(str);
				1919	return result;
				1920
				1921	onError:
				1922	Py_XDECREF(str);
				1923	return NULL;
				1924	}
				1925
				1926	/* --- Helpers ------------------------------------------------------------ */
				1927
				1928	static
				1929	int count(PyUnicodeObject *self,
				1930	int start,
				1931	int end,
				1932	PyUnicodeObject *substring)
				1933	{
				1934	int count = 0;
				1935
				1936	end -= substring->length;
				1937
				1938	while (start <= end)
				1939	if (Py_UNICODE_MATCH(self, start, substring)) {
				1940	count++;
				1941	start += substring->length;
				1942	} else
				1943	start++;
				1944
				1945	return count;
				1946	}
				1947
				1948	int PyUnicode_Count(PyObject *str,
				1949	PyObject *substr,
				1950	int start,
				1951	int end)
				1952	{
				1953	int result;
				1954
				1955	str = PyUnicode_FromObject(str);
				1956	if (str == NULL)
				1957	return -1;
				1958	substr = PyUnicode_FromObject(substr);
				1959	if (substr == NULL) {
				1960	Py_DECREF(substr);
				1961	return -1;
				1962	}
				1963
				1964	result = count((PyUnicodeObject *)str,
				1965	start, end,
				1966	(PyUnicodeObject *)substr);
				1967
				1968	Py_DECREF(str);
				1969	Py_DECREF(substr);
				1970	return result;
				1971	}
				1972
				1973	static
				1974	int findstring(PyUnicodeObject *self,
				1975	PyUnicodeObject *substring,
				1976	int start,
				1977	int end,
				1978	int direction)
				1979	{
				1980	if (start < 0)
				1981	start += self->length;
				1982	if (start < 0)
				1983	start = 0;
				1984
				1985	if (substring->length == 0)
				1986	return start;
				1987
				1988	if (end > self->length)
				1989	end = self->length;
				1990	if (end < 0)
				1991	end += self->length;
				1992	if (end < 0)
				1993	end = 0;
				1994
				1995	end -= substring->length;
				1996
				1997	if (direction < 0) {
				1998	for (; end >= start; end--)
				1999	if (Py_UNICODE_MATCH(self, end, substring))
				2000	return end;
				2001	} else {
				2002	for (; start <= end; start++)
				2003	if (Py_UNICODE_MATCH(self, start, substring))
				2004	return start;
				2005	}
				2006
				2007	return -1;
				2008	}
				2009
				2010	int PyUnicode_Find(PyObject *str,
				2011	PyObject *substr,
				2012	int start,
				2013	int end,
				2014	int direction)
				2015	{
				2016	int result;
				2017
				2018	str = PyUnicode_FromObject(str);
				2019	if (str == NULL)
				2020	return -1;
				2021	substr = PyUnicode_FromObject(substr);
				2022	if (substr == NULL) {
				2023	Py_DECREF(substr);
				2024	return -1;
				2025	}
				2026
				2027	result = findstring((PyUnicodeObject *)str,
				2028	(PyUnicodeObject *)substr,
				2029	start, end, direction);
				2030	Py_DECREF(str);
				2031	Py_DECREF(substr);
				2032	return result;
				2033	}
				2034
				2035	static
				2036	int tailmatch(PyUnicodeObject *self,
				2037	PyUnicodeObject *substring,
				2038	int start,
				2039	int end,
				2040	int direction)
				2041	{
				2042	if (start < 0)
				2043	start += self->length;
				2044	if (start < 0)
				2045	start = 0;
				2046
				2047	if (substring->length == 0)
				2048	return 1;
				2049
				2050	if (end > self->length)
				2051	end = self->length;
				2052	if (end < 0)
				2053	end += self->length;
				2054	if (end < 0)
				2055	end = 0;
				2056
				2057	end -= substring->length;
				2058	if (end < start)
				2059	return 0;
				2060
				2061	if (direction > 0) {
				2062	if (Py_UNICODE_MATCH(self, end, substring))
				2063	return 1;
				2064	} else {
				2065	if (Py_UNICODE_MATCH(self, start, substring))
				2066	return 1;
				2067	}
				2068
				2069	return 0;
				2070	}
				2071
				2072	int PyUnicode_Tailmatch(PyObject *str,
				2073	PyObject *substr,
				2074	int start,
				2075	int end,
				2076	int direction)
				2077	{
				2078	int result;
				2079
				2080	str = PyUnicode_FromObject(str);
				2081	if (str == NULL)
				2082	return -1;
				2083	substr = PyUnicode_FromObject(substr);
				2084	if (substr == NULL) {
				2085	Py_DECREF(substr);
				2086	return -1;
				2087	}
				2088
				2089	result = tailmatch((PyUnicodeObject *)str,
				2090	(PyUnicodeObject *)substr,
				2091	start, end, direction);
				2092	Py_DECREF(str);
				2093	Py_DECREF(substr);
				2094	return result;
				2095	}
				2096
				2097	static
				2098	const Py_UNICODE findchar(const Py_UNICODE s,
				2099	int size,
				2100	Py_UNICODE ch)
				2101	{
				2102	/* like wcschr, but doesn't stop at NULL characters */
				2103
				2104	while (size-- > 0) {
				2105	if (*s == ch)
				2106	return s;
				2107	s++;
				2108	}
				2109
				2110	return NULL;
				2111	}
				2112
				2113	/* Apply fixfct filter to the Unicode object self and return a
				2114	reference to the modified object */
				2115
				2116	static
				2117	PyObject fixup(PyUnicodeObject self,
				2118	int (fixfct)(PyUnicodeObject s))
				2119	{
				2120
				2121	PyUnicodeObject *u;
				2122
				2123	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2124	self->length);
				2125	if (u == NULL)
				2126	return NULL;
				2127	if (!fixfct(u)) {
				2128	/* fixfct should return TRUE if it modified the buffer. If
				2129	FALSE, return a reference to the original buffer instead
				2130	(to save space, not time) */
				2131	Py_INCREF(self);
				2132	Py_DECREF(u);
				2133	return (PyObject*) self;
				2134	}
				2135	return (PyObject*) u;
				2136	}
				2137
				2138	static
				2139	int fixupper(PyUnicodeObject *self)
				2140	{
				2141	int len = self->length;
				2142	Py_UNICODE *s = self->str;
				2143	int status = 0;
				2144
				2145	while (len-- > 0) {
				2146	register Py_UNICODE ch;
				2147
				2148	ch = Py_UNICODE_TOUPPER(*s);
				2149	if (ch != *s) {
				2150	status = 1;
				2151	*s = ch;
				2152	}
				2153	s++;
				2154	}
				2155
				2156	return status;
				2157	}
				2158
				2159	static
				2160	int fixlower(PyUnicodeObject *self)
				2161	{
				2162	int len = self->length;
				2163	Py_UNICODE *s = self->str;
				2164	int status = 0;
				2165
				2166	while (len-- > 0) {
				2167	register Py_UNICODE ch;
				2168
				2169	ch = Py_UNICODE_TOLOWER(*s);
				2170	if (ch != *s) {
				2171	status = 1;
				2172	*s = ch;
				2173	}
				2174	s++;
				2175	}
				2176
				2177	return status;
				2178	}
				2179
				2180	static
				2181	int fixswapcase(PyUnicodeObject *self)
				2182	{
				2183	int len = self->length;
				2184	Py_UNICODE *s = self->str;
				2185	int status = 0;
				2186
				2187	while (len-- > 0) {
				2188	if (Py_UNICODE_ISUPPER(*s)) {
				2189	s = Py_UNICODE_TOLOWER(s);
				2190	status = 1;
				2191	} else if (Py_UNICODE_ISLOWER(*s)) {
				2192	s = Py_UNICODE_TOUPPER(s);
				2193	status = 1;
				2194	}
				2195	s++;
				2196	}
				2197
				2198	return status;
				2199	}
				2200
				2201	static
				2202	int fixcapitalize(PyUnicodeObject *self)
				2203	{
				2204	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2205	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2206	return 1;
				2207	}
				2208	return 0;
				2209	}
				2210
				2211	static
				2212	int fixtitle(PyUnicodeObject *self)
				2213	{
				2214	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2215	register Py_UNICODE *e;
				2216	int previous_is_cased;
				2217
				2218	/* Shortcut for single character strings */
				2219	if (PyUnicode_GET_SIZE(self) == 1) {
				2220	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2221	if (*p != ch) {
				2222	*p = ch;
				2223	return 1;
				2224	}
				2225	else
				2226	return 0;
				2227	}
				2228
				2229	e = p + PyUnicode_GET_SIZE(self);
				2230	previous_is_cased = 0;
				2231	for (; p < e; p++) {
				2232	register const Py_UNICODE ch = *p;
				2233
				2234	if (previous_is_cased)
				2235	*p = Py_UNICODE_TOLOWER(ch);
				2236	else
				2237	*p = Py_UNICODE_TOTITLE(ch);
				2238
				2239	if (Py_UNICODE_ISLOWER(ch) \|\|
				2240	Py_UNICODE_ISUPPER(ch) \|\|
				2241	Py_UNICODE_ISTITLE(ch))
				2242	previous_is_cased = 1;
				2243	else
				2244	previous_is_cased = 0;
				2245	}
				2246	return 1;
				2247	}
				2248
				2249	PyObject PyUnicode_Join(PyObject separator,
				2250	PyObject *seq)
				2251	{
				2252	Py_UNICODE *sep;
				2253	int seplen;
				2254	PyUnicodeObject *res = NULL;
				2255	int reslen = 0;
				2256	Py_UNICODE *p;
				2257	int seqlen = 0;
				2258	int sz = 100;
				2259	int i;
				2260
				2261	seqlen = PySequence_Length(seq);
				2262	if (seqlen < 0 && PyErr_Occurred())
				2263	return NULL;
				2264
				2265	if (separator == NULL) {
				2266	Py_UNICODE blank = ' ';
				2267	sep = &blank;
				2268	seplen = 1;
				2269	}
				2270	else {
				2271	separator = PyUnicode_FromObject(separator);
				2272	if (separator == NULL)
				2273	return NULL;
				2274	sep = PyUnicode_AS_UNICODE(separator);
				2275	seplen = PyUnicode_GET_SIZE(separator);
				2276	}
				2277
				2278	res = _PyUnicode_New(sz);
				2279	if (res == NULL)
				2280	goto onError;
				2281	p = PyUnicode_AS_UNICODE(res);
				2282	reslen = 0;
				2283
				2284	for (i = 0; i < seqlen; i++) {
				2285	int itemlen;
				2286	PyObject *item;
				2287
				2288	item = PySequence_GetItem(seq, i);
				2289	if (item == NULL)
				2290	goto onError;
				2291	if (!PyUnicode_Check(item)) {
				2292	PyObject *v;
				2293	v = PyUnicode_FromObject(item);
				2294	Py_DECREF(item);
				2295	item = v;
				2296	if (item == NULL)
				2297	goto onError;
				2298	}
				2299	itemlen = PyUnicode_GET_SIZE(item);
				2300	while (reslen + itemlen + seplen >= sz) {
				2301	if (_PyUnicode_Resize(res, sz*2))
				2302	goto onError;
				2303	sz *= 2;
				2304	p = PyUnicode_AS_UNICODE(res) + reslen;
				2305	}
				2306	if (i > 0) {
				2307	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2308	p += seplen;
				2309	reslen += seplen;
				2310	}
				2311	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2312	p += itemlen;
				2313	reslen += itemlen;
				2314	Py_DECREF(item);
				2315	}
				2316	if (_PyUnicode_Resize(res, reslen))
				2317	goto onError;
				2318
				2319	Py_XDECREF(separator);
				2320	return (PyObject *)res;
				2321
				2322	onError:
				2323	Py_XDECREF(separator);
				2324	Py_DECREF(res);
				2325	return NULL;
				2326	}
				2327
				2328	static
				2329	PyUnicodeObject pad(PyUnicodeObject self,
				2330	int left,
				2331	int right,
				2332	Py_UNICODE fill)
				2333	{
				2334	PyUnicodeObject *u;
				2335
				2336	if (left < 0)
				2337	left = 0;
				2338	if (right < 0)
				2339	right = 0;
				2340
				2341	if (left == 0 && right == 0) {
				2342	Py_INCREF(self);
				2343	return self;
				2344	}
				2345
				2346	u = _PyUnicode_New(left + self->length + right);
				2347	if (u) {
				2348	if (left)
				2349	Py_UNICODE_FILL(u->str, fill, left);
				2350	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2351	if (right)
				2352	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2353	}
				2354
				2355	return u;
				2356	}
				2357
				2358	#define SPLIT_APPEND(data, left, right) \
				2359	str = PyUnicode_FromUnicode(data + left, right - left); \
				2360	if (!str) \
				2361	goto onError; \
				2362	if (PyList_Append(list, str)) { \
				2363	Py_DECREF(str); \
				2364	goto onError; \
				2365	} \
				2366	else \
				2367	Py_DECREF(str);
				2368
				2369	static
				2370	PyObject split_whitespace(PyUnicodeObject self,
				2371	PyObject *list,
				2372	int maxcount)
				2373	{
				2374	register int i;
				2375	register int j;
				2376	int len = self->length;
				2377	PyObject *str;
				2378
				2379	for (i = j = 0; i < len; ) {
				2380	/* find a token */
				2381	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2382	i++;
				2383	j = i;
				2384	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2385	i++;
				2386	if (j < i) {
				2387	if (maxcount-- <= 0)
				2388	break;
				2389	SPLIT_APPEND(self->str, j, i);
				2390	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2391	i++;
				2392	j = i;
				2393	}
				2394	}
				2395	if (j < len) {
				2396	SPLIT_APPEND(self->str, j, len);
				2397	}
				2398	return list;
				2399
				2400	onError:
				2401	Py_DECREF(list);
				2402	return NULL;
				2403	}
				2404
				2405	PyObject PyUnicode_Splitlines(PyObject string,
				2406	int maxcount)
				2407	{
				2408	register int i;
				2409	register int j;
				2410	int len;
				2411	PyObject *list;
				2412	PyObject *str;
				2413	Py_UNICODE *data;
				2414
				2415	string = PyUnicode_FromObject(string);
				2416	if (string == NULL)
				2417	return NULL;
				2418	data = PyUnicode_AS_UNICODE(string);
				2419	len = PyUnicode_GET_SIZE(string);
				2420
				2421	if (maxcount < 0)
				2422	maxcount = INT_MAX;
				2423
				2424	list = PyList_New(0);
				2425	if (!list)
				2426	goto onError;
				2427
				2428	for (i = j = 0; i < len; ) {
				2429	/* Find a line and append it */
				2430	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2431	i++;
				2432	if (maxcount-- <= 0)
				2433	break;
				2434	SPLIT_APPEND(data, j, i);
				2435
				2436	/* Skip the line break reading CRLF as one line break */
				2437	if (i < len) {
				2438	if (data[i] == '\r' && i + 1 < len &&
				2439	data[i+1] == '\n')
				2440	i += 2;
				2441	else
				2442	i++;
				2443	}
				2444	j = i;
				2445	}
				2446	if (j < len) {
				2447	SPLIT_APPEND(data, j, len);
				2448	}
				2449
				2450	Py_DECREF(string);
				2451	return list;
				2452
				2453	onError:
				2454	Py_DECREF(list);
				2455	Py_DECREF(string);
				2456	return NULL;
				2457	}
				2458
				2459	static
				2460	PyObject split_char(PyUnicodeObject self,
				2461	PyObject *list,
				2462	Py_UNICODE ch,
				2463	int maxcount)
				2464	{
				2465	register int i;
				2466	register int j;
				2467	int len = self->length;
				2468	PyObject *str;
				2469
				2470	for (i = j = 0; i < len; ) {
				2471	if (self->str[i] == ch) {
				2472	if (maxcount-- <= 0)
				2473	break;
				2474	SPLIT_APPEND(self->str, j, i);
				2475	i = j = i + 1;
				2476	} else
				2477	i++;
				2478	}
				2479	if (j <= len) {
				2480	SPLIT_APPEND(self->str, j, len);
				2481	}
				2482	return list;
				2483
				2484	onError:
				2485	Py_DECREF(list);
				2486	return NULL;
				2487	}
				2488
				2489	static
				2490	PyObject split_substring(PyUnicodeObject self,
				2491	PyObject *list,
				2492	PyUnicodeObject *substring,
				2493	int maxcount)
				2494	{
				2495	register int i;
				2496	register int j;
				2497	int len = self->length;
				2498	int sublen = substring->length;
				2499	PyObject *str;
				2500
				2501	for (i = j = 0; i < len - sublen; ) {
				2502	if (Py_UNICODE_MATCH(self, i, substring)) {
				2503	if (maxcount-- <= 0)
				2504	break;
				2505	SPLIT_APPEND(self->str, j, i);
				2506	i = j = i + sublen;
				2507	} else
				2508	i++;
				2509	}
				2510	if (j <= len) {
				2511	SPLIT_APPEND(self->str, j, len);
				2512	}
				2513	return list;
				2514
				2515	onError:
				2516	Py_DECREF(list);
				2517	return NULL;
				2518	}
				2519
				2520	#undef SPLIT_APPEND
				2521
				2522	static
				2523	PyObject split(PyUnicodeObject self,
				2524	PyUnicodeObject *substring,
				2525	int maxcount)
				2526	{
				2527	PyObject *list;
				2528
				2529	if (maxcount < 0)
				2530	maxcount = INT_MAX;
				2531
				2532	list = PyList_New(0);
				2533	if (!list)
				2534	return NULL;
				2535
				2536	if (substring == NULL)
				2537	return split_whitespace(self,list,maxcount);
				2538
				2539	else if (substring->length == 1)
				2540	return split_char(self,list,substring->str[0],maxcount);
				2541
				2542	else if (substring->length == 0) {
				2543	Py_DECREF(list);
				2544	PyErr_SetString(PyExc_ValueError, "empty separator");
				2545	return NULL;
				2546	}
				2547	else
				2548	return split_substring(self,list,substring,maxcount);
				2549	}
				2550
				2551	static
				2552	PyObject strip(PyUnicodeObject self,
				2553	int left,
				2554	int right)
				2555	{
				2556	Py_UNICODE *p = self->str;
				2557	int start = 0;
				2558	int end = self->length;
				2559
				2560	if (left)
				2561	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2562	start++;
				2563
				2564	if (right)
				2565	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2566	end--;
				2567
				2568	if (start == 0 && end == self->length) {
				2569	/* couldn't strip anything off, return original string */
				2570	Py_INCREF(self);
				2571	return (PyObject*) self;
				2572	}
				2573
				2574	return (PyObject*) PyUnicode_FromUnicode(
				2575	self->str + start,
				2576	end - start
				2577	);
				2578	}
				2579
				2580	static
				2581	PyObject replace(PyUnicodeObject self,
				2582	PyUnicodeObject *str1,
				2583	PyUnicodeObject *str2,
				2584	int maxcount)
				2585	{
				2586	PyUnicodeObject *u;
				2587
				2588	if (maxcount < 0)
				2589	maxcount = INT_MAX;
				2590
				2591	if (str1->length == 1 && str2->length == 1) {
				2592	int i;
				2593
				2594	/* replace characters */
				2595	if (!findchar(self->str, self->length, str1->str[0])) {
				2596	/* nothing to replace, return original string */
				2597	Py_INCREF(self);
				2598	u = self;
				2599	} else {
				2600	Py_UNICODE u1 = str1->str[0];
				2601	Py_UNICODE u2 = str2->str[0];
				2602
				2603	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2604	self->str,
				2605	self->length
				2606	);
				2607	if (u)
				2608	for (i = 0; i < u->length; i++)
				2609	if (u->str[i] == u1) {
				2610	if (--maxcount < 0)
				2611	break;
				2612	u->str[i] = u2;
				2613	}
				2614	}
				2615
				2616	} else {
				2617	int n, i;
				2618	Py_UNICODE *p;
				2619
				2620	/* replace strings */
				2621	n = count(self, 0, self->length, str1);
				2622	if (n > maxcount)
				2623	n = maxcount;
				2624	if (n == 0) {
				2625	/* nothing to replace, return original string */
				2626	Py_INCREF(self);
				2627	u = self;
				2628	} else {
				2629	u = _PyUnicode_New(
				2630	self->length + n * (str2->length - str1->length));
				2631	if (u) {
				2632	i = 0;
				2633	p = u->str;
				2634	while (i <= self->length - str1->length)
				2635	if (Py_UNICODE_MATCH(self, i, str1)) {
				2636	/* replace string segment */
				2637	Py_UNICODE_COPY(p, str2->str, str2->length);
				2638	p += str2->length;
				2639	i += str1->length;
				2640	if (--n <= 0) {
				2641	/* copy remaining part */
				2642	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2643	break;
				2644	}
				2645	} else
				2646	*p++ = self->str[i++];
				2647	}
				2648	}
				2649	}
				2650
				2651	return (PyObject *) u;
				2652	}
				2653
				2654	/* --- Unicode Object Methods --------------------------------------------- */
				2655
				2656	static char title__doc__[] =
				2657	"S.title() -> unicode\n\
				2658	\n\
				2659	Return a titlecased version of S, i.e. words start with title case\n\
				2660	characters, all remaining cased characters have lower case.";
				2661
				2662	static PyObject*
				2663	unicode_title(PyUnicodeObject self, PyObject args)
				2664	{
				2665	if (!PyArg_NoArgs(args))
				2666	return NULL;
				2667	return fixup(self, fixtitle);
				2668	}
				2669
				2670	static char capitalize__doc__[] =
				2671	"S.capitalize() -> unicode\n\
				2672	\n\
				2673	Return a capitalized version of S, i.e. make the first character\n\
				2674	have upper case.";
				2675
				2676	static PyObject*
				2677	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2678	{
				2679	if (!PyArg_NoArgs(args))
				2680	return NULL;
				2681	return fixup(self, fixcapitalize);
				2682	}
				2683
				2684	#if 0
				2685	static char capwords__doc__[] =
				2686	"S.capwords() -> unicode\n\
				2687	\n\
				2688	Apply .capitalize() to all words in S and return the result with\n\
				2689	normalized whitespace (all whitespace strings are replaced by ' ').";
				2690
				2691	static PyObject*
				2692	unicode_capwords(PyUnicodeObject self, PyObject args)
				2693	{
				2694	PyObject *list;
				2695	PyObject *item;
				2696	int i;
				2697
				2698	if (!PyArg_NoArgs(args))
				2699	return NULL;
				2700
				2701	/* Split into words */
				2702	list = split(self, NULL, -1);
				2703	if (!list)
				2704	return NULL;
				2705
				2706	/* Capitalize each word */
				2707	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2708	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2709	fixcapitalize);
				2710	if (item == NULL)
				2711	goto onError;
				2712	Py_DECREF(PyList_GET_ITEM(list, i));
				2713	PyList_SET_ITEM(list, i, item);
				2714	}
				2715
				2716	/* Join the words to form a new string */
				2717	item = PyUnicode_Join(NULL, list);
				2718
				2719	onError:
				2720	Py_DECREF(list);
				2721	return (PyObject *)item;
				2722	}
				2723	#endif
				2724
				2725	static char center__doc__[] =
				2726	"S.center(width) -> unicode\n\
				2727	\n\
				2728	Return S centered in a Unicode string of length width. Padding is done\n\
				2729	using spaces.";
				2730
				2731	static PyObject *
				2732	unicode_center(PyUnicodeObject self, PyObject args)
				2733	{
				2734	int marg, left;
				2735	int width;
				2736
				2737	if (!PyArg_ParseTuple(args, "i:center", &width))
				2738	return NULL;
				2739
				2740	if (self->length >= width) {
				2741	Py_INCREF(self);
				2742	return (PyObject*) self;
				2743	}
				2744
				2745	marg = width - self->length;
				2746	left = marg / 2 + (marg & width & 1);
				2747
				2748	return (PyObject*) pad(self, left, marg - left, ' ');
				2749	}
				2750
				2751	static int
				2752	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2753	{
				2754	int len1, len2;
				2755	Py_UNICODE *s1 = str1->str;
				2756	Py_UNICODE *s2 = str2->str;
				2757
				2758	len1 = str1->length;
				2759	len2 = str2->length;
				2760
				2761	while (len1 > 0 && len2 > 0) {
				2762	int cmp = (s1++) - (s2++);
				2763	if (cmp)
				2764	/* This should make Christian happy! */
				2765	return (cmp < 0) ? -1 : (cmp != 0);
				2766	len1--, len2--;
				2767	}
				2768
				2769	return (len1 < len2) ? -1 : (len1 != len2);
				2770	}
				2771
				2772	int PyUnicode_Compare(PyObject *left,
				2773	PyObject *right)
				2774	{
				2775	PyUnicodeObject u = NULL, v = NULL;
				2776	int result;
				2777
				2778	/* Coerce the two arguments */
				2779	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2780	if (u == NULL)
				2781	goto onError;
				2782	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2783	if (v == NULL)
				2784	goto onError;
				2785
				2786	/* Shortcut for emtpy or interned objects */
				2787	if (v == u) {
				2788	Py_DECREF(u);
				2789	Py_DECREF(v);
				2790	return 0;
				2791	}
				2792
				2793	result = unicode_compare(u, v);
				2794
				2795	Py_DECREF(u);
				2796	Py_DECREF(v);
				2797	return result;
				2798
				2799	onError:
				2800	Py_XDECREF(u);
				2801	Py_XDECREF(v);
				2802	return -1;
				2803	}
				2804
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2805	int PyUnicode_Contains(PyObject *container,
				2806	PyObject *element)
				2807	{
				2808	PyUnicodeObject u = NULL, v = NULL;
				2809	int result;
				2810	register const Py_UNICODE p, e;
				2811	register Py_UNICODE ch;
				2812
				2813	/* Coerce the two arguments */
				2814	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2815	if (u == NULL)
				2816	goto onError;
				2817	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2818	if (v == NULL)
				2819	goto onError;
				2820
				2821	/* Check v in u */
				2822	if (PyUnicode_GET_SIZE(v) != 1) {
				2823	PyErr_SetString(PyExc_TypeError,
				2824	"string member test needs char left operand");
				2825	goto onError;
				2826	}
				2827	ch = *PyUnicode_AS_UNICODE(v);
				2828	p = PyUnicode_AS_UNICODE(u);
				2829	e = p + PyUnicode_GET_SIZE(u);
				2830	result = 0;
				2831	while (p < e) {
				2832	if (*p++ == ch) {
				2833	result = 1;
				2834	break;
				2835	}
				2836	}
				2837
				2838	Py_DECREF(u);
				2839	Py_DECREF(v);
				2840	return result;
				2841
				2842	onError:
				2843	Py_XDECREF(u);
				2844	Py_XDECREF(v);
				2845	return -1;
				2846	}
				2847
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2848	/* Concat to string or Unicode object giving a new Unicode object. */
				2849
				2850	PyObject PyUnicode_Concat(PyObject left,
				2851	PyObject *right)
				2852	{
				2853	PyUnicodeObject u = NULL, v = NULL, *w;
				2854
				2855	/* Coerce the two arguments */
				2856	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2857	if (u == NULL)
				2858	goto onError;
				2859	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2860	if (v == NULL)
				2861	goto onError;
				2862
				2863	/* Shortcuts */
				2864	if (v == unicode_empty) {
				2865	Py_DECREF(v);
				2866	return (PyObject *)u;
				2867	}
				2868	if (u == unicode_empty) {
				2869	Py_DECREF(u);
				2870	return (PyObject *)v;
				2871	}
				2872
				2873	/* Concat the two Unicode strings */
				2874	w = _PyUnicode_New(u->length + v->length);
				2875	if (w == NULL)
				2876	goto onError;
				2877	Py_UNICODE_COPY(w->str, u->str, u->length);
				2878	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				2879
				2880	Py_DECREF(u);
				2881	Py_DECREF(v);
				2882	return (PyObject *)w;
				2883
				2884	onError:
				2885	Py_XDECREF(u);
				2886	Py_XDECREF(v);
				2887	return NULL;
				2888	}
				2889
				2890	static char count__doc__[] =
				2891	"S.count(sub[, start[, end]]) -> int\n\
				2892	\n\
				2893	Return the number of occurrences of substring sub in Unicode string\n\
				2894	S[start:end]. Optional arguments start and end are\n\
				2895	interpreted as in slice notation.";
				2896
				2897	static PyObject *
				2898	unicode_count(PyUnicodeObject self, PyObject args)
				2899	{
				2900	PyUnicodeObject *substring;
				2901	int start = 0;
				2902	int end = INT_MAX;
				2903	PyObject *result;
				2904
				2905	if (!PyArg_ParseTuple(args, "O\|ii:count", &substring, &start, &end))
				2906	return NULL;
				2907
				2908	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				2909	(PyObject *)substring);
				2910	if (substring == NULL)
				2911	return NULL;
				2912
				2913	if (substring->length == 0) {
				2914	Py_DECREF(substring);
				2915	return PyInt_FromLong((long) 0);
				2916	}
				2917
				2918	if (start < 0)
				2919	start += self->length;
				2920	if (start < 0)
				2921	start = 0;
				2922	if (end > self->length)
				2923	end = self->length;
				2924	if (end < 0)
				2925	end += self->length;
				2926	if (end < 0)
				2927	end = 0;
				2928
				2929	result = PyInt_FromLong((long) count(self, start, end, substring));
				2930
				2931	Py_DECREF(substring);
				2932	return result;
				2933	}
				2934
				2935	static char encode__doc__[] =
				2936	"S.encode([encoding[,errors]]) -> string\n\
				2937	\n\
				2938	Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
				2939	errors may be given to set a different error handling scheme. Default\n\
				2940	is 'strict' meaning that encoding errors raise a ValueError. Other\n\
				2941	possible values are 'ignore' and 'replace'.";
				2942
				2943	static PyObject *
				2944	unicode_encode(PyUnicodeObject self, PyObject args)
				2945	{
				2946	char *encoding = NULL;
				2947	char *errors = NULL;
				2948	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				2949	return NULL;
				2950	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				2951	}
				2952
				2953	static char expandtabs__doc__[] =
				2954	"S.expandtabs([tabsize]) -> unicode\n\
				2955	\n\
				2956	Return a copy of S where all tab characters are expanded using spaces.\n\
				2957	If tabsize is not given, a tab size of 8 characters is assumed.";
				2958
				2959	static PyObject*
				2960	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				2961	{
				2962	Py_UNICODE *e;
				2963	Py_UNICODE *p;
				2964	Py_UNICODE *q;
				2965	int i, j;
				2966	PyUnicodeObject *u;
				2967	int tabsize = 8;
				2968
				2969	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				2970	return NULL;
				2971
				2972	/* First pass: determine size of ouput string */
				2973	i = j = 0;
				2974	e = self->str + self->length;
				2975	for (p = self->str; p < e; p++)
				2976	if (*p == '\t') {
				2977	if (tabsize > 0)
				2978	j += tabsize - (j % tabsize);
				2979	}
				2980	else {
				2981	j++;
				2982	if (p == '\n' \|\| p == '\r') {
				2983	i += j;
				2984	j = 0;
				2985	}
				2986	}
				2987
				2988	/* Second pass: create output string and fill it */
				2989	u = _PyUnicode_New(i + j);
				2990	if (!u)
				2991	return NULL;
				2992
				2993	j = 0;
				2994	q = u->str;
				2995
				2996	for (p = self->str; p < e; p++)
				2997	if (*p == '\t') {
				2998	if (tabsize > 0) {
				2999	i = tabsize - (j % tabsize);
				3000	j += i;
				3001	while (i--)
				3002	*q++ = ' ';
				3003	}
				3004	}
				3005	else {
				3006	j++;
				3007	q++ = p;
				3008	if (p == '\n' \|\| p == '\r')
				3009	j = 0;
				3010	}
				3011
				3012	return (PyObject*) u;
				3013	}
				3014
				3015	static char find__doc__[] =
				3016	"S.find(sub [,start [,end]]) -> int\n\
				3017	\n\
				3018	Return the lowest index in S where substring sub is found,\n\
				3019	such that sub is contained within s[start,end]. Optional\n\
				3020	arguments start and end are interpreted as in slice notation.\n\
				3021	\n\
				3022	Return -1 on failure.";
				3023
				3024	static PyObject *
				3025	unicode_find(PyUnicodeObject self, PyObject args)
				3026	{
				3027	PyUnicodeObject *substring;
				3028	int start = 0;
				3029	int end = INT_MAX;
				3030	PyObject *result;
				3031
				3032	if (!PyArg_ParseTuple(args, "O\|ii:find", &substring, &start, &end))
				3033	return NULL;
				3034	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3035	(PyObject *)substring);
				3036	if (substring == NULL)
				3037	return NULL;
				3038
				3039	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3040
				3041	Py_DECREF(substring);
				3042	return result;
				3043	}
				3044
				3045	static PyObject *
				3046	unicode_getitem(PyUnicodeObject *self, int index)
				3047	{
				3048	if (index < 0 \|\| index >= self->length) {
				3049	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3050	return NULL;
				3051	}
				3052
				3053	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3054	}
				3055
				3056	static long
				3057	unicode_hash(PyUnicodeObject *self)
				3058	{
				3059	long hash;
				3060	PyObject *utf8;
				3061
				3062	/* Since Unicode objects compare equal to their UTF-8 string
				3063	counterparts, they should also use the UTF-8 strings as basis
				3064	for their hash value. This is needed to assure that strings and
				3065	Unicode objects behave in the same way as dictionary
				3066	keys. Unfortunately, this costs some performance and also some
				3067	memory if the cached UTF-8 representation is not used later
				3068	on. */
				3069	if (self->hash != -1)
				3070	return self->hash;
				3071	utf8 = utf8_string(self, NULL);
				3072	if (utf8 == NULL)
				3073	return -1;
				3074	hash = PyObject_Hash(utf8);
				3075	if (hash == -1)
				3076	return -1;
				3077	self->hash = hash;
				3078	return hash;
				3079	}
				3080
				3081	static char index__doc__[] =
				3082	"S.index(sub [,start [,end]]) -> int\n\
				3083	\n\
				3084	Like S.find() but raise ValueError when the substring is not found.";
				3085
				3086	static PyObject *
				3087	unicode_index(PyUnicodeObject self, PyObject args)
				3088	{
				3089	int result;
				3090	PyUnicodeObject *substring;
				3091	int start = 0;
				3092	int end = INT_MAX;
				3093
				3094	if (!PyArg_ParseTuple(args, "O\|ii:index", &substring, &start, &end))
				3095	return NULL;
				3096
				3097	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3098	(PyObject *)substring);
				3099	if (substring == NULL)
				3100	return NULL;
				3101
				3102	result = findstring(self, substring, start, end, 1);
				3103
				3104	Py_DECREF(substring);
				3105	if (result < 0) {
				3106	PyErr_SetString(PyExc_ValueError, "substring not found");
				3107	return NULL;
				3108	}
				3109	return PyInt_FromLong(result);
				3110	}
				3111
				3112	static char islower__doc__[] =
				3113	"S.islower() -> int\n\
				3114	\n\
				3115	Return 1 if all cased characters in S are lowercase and there is\n\
				3116	at least one cased character in S, 0 otherwise.";
				3117
				3118	static PyObject*
				3119	unicode_islower(PyUnicodeObject self, PyObject args)
				3120	{
				3121	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3122	register const Py_UNICODE *e;
				3123	int cased;
				3124
				3125	if (!PyArg_NoArgs(args))
				3126	return NULL;
				3127
				3128	/* Shortcut for single character strings */
				3129	if (PyUnicode_GET_SIZE(self) == 1)
				3130	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3131
				3132	e = p + PyUnicode_GET_SIZE(self);
				3133	cased = 0;
				3134	for (; p < e; p++) {
				3135	register const Py_UNICODE ch = *p;
				3136
				3137	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3138	return PyInt_FromLong(0);
				3139	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3140	cased = 1;
				3141	}
				3142	return PyInt_FromLong(cased);
				3143	}
				3144
				3145	static char isupper__doc__[] =
				3146	"S.isupper() -> int\n\
				3147	\n\
				3148	Return 1 if all cased characters in S are uppercase and there is\n\
				3149	at least one cased character in S, 0 otherwise.";
				3150
				3151	static PyObject*
				3152	unicode_isupper(PyUnicodeObject self, PyObject args)
				3153	{
				3154	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3155	register const Py_UNICODE *e;
				3156	int cased;
				3157
				3158	if (!PyArg_NoArgs(args))
				3159	return NULL;
				3160
				3161	/* Shortcut for single character strings */
				3162	if (PyUnicode_GET_SIZE(self) == 1)
				3163	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3164
				3165	e = p + PyUnicode_GET_SIZE(self);
				3166	cased = 0;
				3167	for (; p < e; p++) {
				3168	register const Py_UNICODE ch = *p;
				3169
				3170	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3171	return PyInt_FromLong(0);
				3172	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3173	cased = 1;
				3174	}
				3175	return PyInt_FromLong(cased);
				3176	}
				3177
				3178	static char istitle__doc__[] =
				3179	"S.istitle() -> int\n\
				3180	\n\
				3181	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3182	may only follow uncased characters and lowercase characters only cased\n\
				3183	ones. Return 0 otherwise.";
				3184
				3185	static PyObject*
				3186	unicode_istitle(PyUnicodeObject self, PyObject args)
				3187	{
				3188	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3189	register const Py_UNICODE *e;
				3190	int cased, previous_is_cased;
				3191
				3192	if (!PyArg_NoArgs(args))
				3193	return NULL;
				3194
				3195	/* Shortcut for single character strings */
				3196	if (PyUnicode_GET_SIZE(self) == 1)
				3197	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3198	(Py_UNICODE_ISUPPER(*p) != 0));
				3199
				3200	e = p + PyUnicode_GET_SIZE(self);
				3201	cased = 0;
				3202	previous_is_cased = 0;
				3203	for (; p < e; p++) {
				3204	register const Py_UNICODE ch = *p;
				3205
				3206	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3207	if (previous_is_cased)
				3208	return PyInt_FromLong(0);
				3209	previous_is_cased = 1;
				3210	cased = 1;
				3211	}
				3212	else if (Py_UNICODE_ISLOWER(ch)) {
				3213	if (!previous_is_cased)
				3214	return PyInt_FromLong(0);
				3215	previous_is_cased = 1;
				3216	cased = 1;
				3217	}
				3218	else
				3219	previous_is_cased = 0;
				3220	}
				3221	return PyInt_FromLong(cased);
				3222	}
				3223
				3224	static char isspace__doc__[] =
				3225	"S.isspace() -> int\n\
				3226	\n\
				3227	Return 1 if there are only whitespace characters in S,\n\
				3228	0 otherwise.";
				3229
				3230	static PyObject*
				3231	unicode_isspace(PyUnicodeObject self, PyObject args)
				3232	{
				3233	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3234	register const Py_UNICODE *e;
				3235
				3236	if (!PyArg_NoArgs(args))
				3237	return NULL;
				3238
				3239	/* Shortcut for single character strings */
				3240	if (PyUnicode_GET_SIZE(self) == 1 &&
				3241	Py_UNICODE_ISSPACE(*p))
				3242	return PyInt_FromLong(1);
				3243
				3244	e = p + PyUnicode_GET_SIZE(self);
				3245	for (; p < e; p++) {
				3246	if (!Py_UNICODE_ISSPACE(*p))
				3247	return PyInt_FromLong(0);
				3248	}
				3249	return PyInt_FromLong(1);
				3250	}
				3251
				3252	static char isdecimal__doc__[] =
				3253	"S.isdecimal() -> int\n\
				3254	\n\
				3255	Return 1 if there are only decimal characters in S,\n\
				3256	0 otherwise.";
				3257
				3258	static PyObject*
				3259	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3260	{
				3261	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3262	register const Py_UNICODE *e;
				3263
				3264	if (!PyArg_NoArgs(args))
				3265	return NULL;
				3266
				3267	/* Shortcut for single character strings */
				3268	if (PyUnicode_GET_SIZE(self) == 1 &&
				3269	Py_UNICODE_ISDECIMAL(*p))
				3270	return PyInt_FromLong(1);
				3271
				3272	e = p + PyUnicode_GET_SIZE(self);
				3273	for (; p < e; p++) {
				3274	if (!Py_UNICODE_ISDECIMAL(*p))
				3275	return PyInt_FromLong(0);
				3276	}
				3277	return PyInt_FromLong(1);
				3278	}
				3279
				3280	static char isdigit__doc__[] =
				3281	"S.isdigit() -> int\n\
				3282	\n\
				3283	Return 1 if there are only digit characters in S,\n\
				3284	0 otherwise.";
				3285
				3286	static PyObject*
				3287	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3288	{
				3289	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3290	register const Py_UNICODE *e;
				3291
				3292	if (!PyArg_NoArgs(args))
				3293	return NULL;
				3294
				3295	/* Shortcut for single character strings */
				3296	if (PyUnicode_GET_SIZE(self) == 1 &&
				3297	Py_UNICODE_ISDIGIT(*p))
				3298	return PyInt_FromLong(1);
				3299
				3300	e = p + PyUnicode_GET_SIZE(self);
				3301	for (; p < e; p++) {
				3302	if (!Py_UNICODE_ISDIGIT(*p))
				3303	return PyInt_FromLong(0);
				3304	}
				3305	return PyInt_FromLong(1);
				3306	}
				3307
				3308	static char isnumeric__doc__[] =
				3309	"S.isnumeric() -> int\n\
				3310	\n\
				3311	Return 1 if there are only numeric characters in S,\n\
				3312	0 otherwise.";
				3313
				3314	static PyObject*
				3315	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3316	{
				3317	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3318	register const Py_UNICODE *e;
				3319
				3320	if (!PyArg_NoArgs(args))
				3321	return NULL;
				3322
				3323	/* Shortcut for single character strings */
				3324	if (PyUnicode_GET_SIZE(self) == 1 &&
				3325	Py_UNICODE_ISNUMERIC(*p))
				3326	return PyInt_FromLong(1);
				3327
				3328	e = p + PyUnicode_GET_SIZE(self);
				3329	for (; p < e; p++) {
				3330	if (!Py_UNICODE_ISNUMERIC(*p))
				3331	return PyInt_FromLong(0);
				3332	}
				3333	return PyInt_FromLong(1);
				3334	}
				3335
				3336	static char join__doc__[] =
				3337	"S.join(sequence) -> unicode\n\
				3338	\n\
				3339	Return a string which is the concatenation of the strings in the\n\
				3340	sequence. The separator between elements is S.";
				3341
				3342	static PyObject*
				3343	unicode_join(PyUnicodeObject self, PyObject args)
				3344	{
				3345	PyObject *data;
				3346	if (!PyArg_ParseTuple(args, "O:join", &data))
				3347	return NULL;
				3348
				3349	return PyUnicode_Join((PyObject *)self, data);
				3350	}
				3351
				3352	static int
				3353	unicode_length(PyUnicodeObject *self)
				3354	{
				3355	return self->length;
				3356	}
				3357
				3358	static char ljust__doc__[] =
				3359	"S.ljust(width) -> unicode\n\
				3360	\n\
				3361	Return S left justified in a Unicode string of length width. Padding is\n\
				3362	done using spaces.";
				3363
				3364	static PyObject *
				3365	unicode_ljust(PyUnicodeObject self, PyObject args)
				3366	{
				3367	int width;
				3368	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3369	return NULL;
				3370
				3371	if (self->length >= width) {
				3372	Py_INCREF(self);
				3373	return (PyObject*) self;
				3374	}
				3375
				3376	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3377	}
				3378
				3379	static char lower__doc__[] =
				3380	"S.lower() -> unicode\n\
				3381	\n\
				3382	Return a copy of the string S converted to lowercase.";
				3383
				3384	static PyObject*
				3385	unicode_lower(PyUnicodeObject self, PyObject args)
				3386	{
				3387	if (!PyArg_NoArgs(args))
				3388	return NULL;
				3389	return fixup(self, fixlower);
				3390	}
				3391
				3392	static char lstrip__doc__[] =
				3393	"S.lstrip() -> unicode\n\
				3394	\n\
				3395	Return a copy of the string S with leading whitespace removed.";
				3396
				3397	static PyObject *
				3398	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3399	{
				3400	if (!PyArg_NoArgs(args))
				3401	return NULL;
				3402	return strip(self, 1, 0);
				3403	}
				3404
				3405	static PyObject*
				3406	unicode_repeat(PyUnicodeObject *str, int len)
				3407	{
				3408	PyUnicodeObject *u;
				3409	Py_UNICODE *p;
				3410
				3411	if (len < 0)
				3412	len = 0;
				3413
				3414	if (len == 1) {
				3415	/* no repeat, return original string */
				3416	Py_INCREF(str);
				3417	return (PyObject*) str;
				3418	}
				3419
				3420	u = _PyUnicode_New(len * str->length);
				3421	if (!u)
				3422	return NULL;
				3423
				3424	p = u->str;
				3425
				3426	while (len-- > 0) {
				3427	Py_UNICODE_COPY(p, str->str, str->length);
				3428	p += str->length;
				3429	}
				3430
				3431	return (PyObject*) u;
				3432	}
				3433
				3434	PyObject PyUnicode_Replace(PyObject obj,
				3435	PyObject *subobj,
				3436	PyObject *replobj,
				3437	int maxcount)
				3438	{
				3439	PyObject *self;
				3440	PyObject *str1;
				3441	PyObject *str2;
				3442	PyObject *result;
				3443
				3444	self = PyUnicode_FromObject(obj);
				3445	if (self == NULL)
				3446	return NULL;
				3447	str1 = PyUnicode_FromObject(subobj);
				3448	if (str1 == NULL) {
				3449	Py_DECREF(self);
				3450	return NULL;
				3451	}
				3452	str2 = PyUnicode_FromObject(replobj);
				3453	if (str2 == NULL) {
				3454	Py_DECREF(self);
				3455	Py_DECREF(str1);
				3456	return NULL;
				3457	}
				3458	result = replace((PyUnicodeObject *)self,
				3459	(PyUnicodeObject *)str1,
				3460	(PyUnicodeObject *)str2,
				3461	maxcount);
				3462	Py_DECREF(self);
				3463	Py_DECREF(str1);
				3464	Py_DECREF(str2);
				3465	return result;
				3466	}
				3467
				3468	static char replace__doc__[] =
				3469	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3470	\n\
				3471	Return a copy of S with all occurrences of substring\n\
				3472	old replaced by new. If the optional argument maxsplit is\n\
				3473	given, only the first maxsplit occurrences are replaced.";
				3474
				3475	static PyObject*
				3476	unicode_replace(PyUnicodeObject self, PyObject args)
				3477	{
				3478	PyUnicodeObject *str1;
				3479	PyUnicodeObject *str2;
				3480	int maxcount = -1;
				3481	PyObject *result;
				3482
				3483	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3484	return NULL;
				3485	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3486	if (str1 == NULL)
				3487	return NULL;
				3488	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3489	if (str2 == NULL)
				3490	return NULL;
				3491
				3492	result = replace(self, str1, str2, maxcount);
				3493
				3494	Py_DECREF(str1);
				3495	Py_DECREF(str2);
				3496	return result;
				3497	}
				3498
				3499	static
				3500	PyObject unicode_repr(PyObject unicode)
				3501	{
				3502	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3503	PyUnicode_GET_SIZE(unicode),
				3504	1);
				3505	}
				3506
				3507	static char rfind__doc__[] =
				3508	"S.rfind(sub [,start [,end]]) -> int\n\
				3509	\n\
				3510	Return the highest index in S where substring sub is found,\n\
				3511	such that sub is contained within s[start,end]. Optional\n\
				3512	arguments start and end are interpreted as in slice notation.\n\
				3513	\n\
				3514	Return -1 on failure.";
				3515
				3516	static PyObject *
				3517	unicode_rfind(PyUnicodeObject self, PyObject args)
				3518	{
				3519	PyUnicodeObject *substring;
				3520	int start = 0;
				3521	int end = INT_MAX;
				3522	PyObject *result;
				3523
				3524	if (!PyArg_ParseTuple(args, "O\|ii:rfind", &substring, &start, &end))
				3525	return NULL;
				3526	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3527	(PyObject *)substring);
				3528	if (substring == NULL)
				3529	return NULL;
				3530
				3531	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3532
				3533	Py_DECREF(substring);
				3534	return result;
				3535	}
				3536
				3537	static char rindex__doc__[] =
				3538	"S.rindex(sub [,start [,end]]) -> int\n\
				3539	\n\
				3540	Like S.rfind() but raise ValueError when the substring is not found.";
				3541
				3542	static PyObject *
				3543	unicode_rindex(PyUnicodeObject self, PyObject args)
				3544	{
				3545	int result;
				3546	PyUnicodeObject *substring;
				3547	int start = 0;
				3548	int end = INT_MAX;
				3549
				3550	if (!PyArg_ParseTuple(args, "O\|ii:rindex", &substring, &start, &end))
				3551	return NULL;
				3552	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3553	(PyObject *)substring);
				3554	if (substring == NULL)
				3555	return NULL;
				3556
				3557	result = findstring(self, substring, start, end, -1);
				3558
				3559	Py_DECREF(substring);
				3560	if (result < 0) {
				3561	PyErr_SetString(PyExc_ValueError, "substring not found");
				3562	return NULL;
				3563	}
				3564	return PyInt_FromLong(result);
				3565	}
				3566
				3567	static char rjust__doc__[] =
				3568	"S.rjust(width) -> unicode\n\
				3569	\n\
				3570	Return S right justified in a Unicode string of length width. Padding is\n\
				3571	done using spaces.";
				3572
				3573	static PyObject *
				3574	unicode_rjust(PyUnicodeObject self, PyObject args)
				3575	{
				3576	int width;
				3577	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3578	return NULL;
				3579
				3580	if (self->length >= width) {
				3581	Py_INCREF(self);
				3582	return (PyObject*) self;
				3583	}
				3584
				3585	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3586	}
				3587
				3588	static char rstrip__doc__[] =
				3589	"S.rstrip() -> unicode\n\
				3590	\n\
				3591	Return a copy of the string S with trailing whitespace removed.";
				3592
				3593	static PyObject *
				3594	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3595	{
				3596	if (!PyArg_NoArgs(args))
				3597	return NULL;
				3598	return strip(self, 0, 1);
				3599	}
				3600
				3601	static PyObject*
				3602	unicode_slice(PyUnicodeObject *self, int start, int end)
				3603	{
				3604	/* standard clamping */
				3605	if (start < 0)
				3606	start = 0;
				3607	if (end < 0)
				3608	end = 0;
				3609	if (end > self->length)
				3610	end = self->length;
				3611	if (start == 0 && end == self->length) {
				3612	/* full slice, return original string */
				3613	Py_INCREF(self);
				3614	return (PyObject*) self;
				3615	}
				3616	if (start > end)
				3617	start = end;
				3618	/* copy slice */
				3619	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3620	end - start);
				3621	}
				3622
				3623	PyObject PyUnicode_Split(PyObject s,
				3624	PyObject *sep,
				3625	int maxsplit)
				3626	{
				3627	PyObject *result;
				3628
				3629	s = PyUnicode_FromObject(s);
				3630	if (s == NULL)
				3631	return NULL;
				3632	if (sep != NULL) {
				3633	sep = PyUnicode_FromObject(sep);
				3634	if (sep == NULL) {
				3635	Py_DECREF(s);
				3636	return NULL;
				3637	}
				3638	}
				3639
				3640	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3641
				3642	Py_DECREF(s);
				3643	Py_XDECREF(sep);
				3644	return result;
				3645	}
				3646
				3647	static char split__doc__[] =
				3648	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3649	\n\
				3650	Return a list of the words in S, using sep as the\n\
				3651	delimiter string. If maxsplit is given, at most maxsplit\n\
				3652	splits are done. If sep is not specified, any whitespace string\n\
				3653	is a separator.";
				3654
				3655	static PyObject*
				3656	unicode_split(PyUnicodeObject self, PyObject args)
				3657	{
				3658	PyObject *substring = Py_None;
				3659	int maxcount = -1;
				3660
				3661	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3662	return NULL;
				3663
				3664	if (substring == Py_None)
				3665	return split(self, NULL, maxcount);
				3666	else if (PyUnicode_Check(substring))
				3667	return split(self, (PyUnicodeObject *)substring, maxcount);
				3668	else
				3669	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3670	}
				3671
				3672	static char splitlines__doc__[] =
				3673	"S.splitlines([maxsplit]]) -> list of strings\n\
				3674	\n\
				3675	Return a list of the lines in S, breaking at line boundaries.\n\
				3676	If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
				3677	included in the resulting list.";
				3678
				3679	static PyObject*
				3680	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3681	{
				3682	int maxcount = -1;
				3683
				3684	if (!PyArg_ParseTuple(args, "\|i:splitlines", &maxcount))
				3685	return NULL;
				3686
				3687	return PyUnicode_Splitlines((PyObject *)self, maxcount);
				3688	}
				3689
				3690	static
				3691	PyObject unicode_str(PyUnicodeObject self)
				3692	{
				3693	return PyUnicode_AsUTF8String((PyObject *)self);
				3694	}
				3695
				3696	static char strip__doc__[] =
				3697	"S.strip() -> unicode\n\
				3698	\n\
				3699	Return a copy of S with leading and trailing whitespace removed.";
				3700
				3701	static PyObject *
				3702	unicode_strip(PyUnicodeObject self, PyObject args)
				3703	{
				3704	if (!PyArg_NoArgs(args))
				3705	return NULL;
				3706	return strip(self, 1, 1);
				3707	}
				3708
				3709	static char swapcase__doc__[] =
				3710	"S.swapcase() -> unicode\n\
				3711	\n\
				3712	Return a copy of S with uppercase characters converted to lowercase\n\
				3713	and vice versa.";
				3714
				3715	static PyObject*
				3716	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3717	{
				3718	if (!PyArg_NoArgs(args))
				3719	return NULL;
				3720	return fixup(self, fixswapcase);
				3721	}
				3722
				3723	static char translate__doc__[] =
				3724	"S.translate(table) -> unicode\n\
				3725	\n\
				3726	Return a copy of the string S, where all characters have been mapped\n\
				3727	through the given translation table, which must be a mapping of\n\
				3728	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3729	are left untouched. Characters mapped to None are deleted.";
				3730
				3731	static PyObject*
				3732	unicode_translate(PyUnicodeObject self, PyObject args)
				3733	{
				3734	PyObject *table;
				3735
				3736	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3737	return NULL;
				3738	return PyUnicode_TranslateCharmap(self->str,
				3739	self->length,
				3740	table,
				3741	"ignore");
				3742	}
				3743
				3744	static char upper__doc__[] =
				3745	"S.upper() -> unicode\n\
				3746	\n\
				3747	Return a copy of S converted to uppercase.";
				3748
				3749	static PyObject*
				3750	unicode_upper(PyUnicodeObject self, PyObject args)
				3751	{
				3752	if (!PyArg_NoArgs(args))
				3753	return NULL;
				3754	return fixup(self, fixupper);
				3755	}
				3756
				#if 0
				/* Docstring for the (currently compiled-out) unicode.zfill(). */
				static char zfill__doc__[] =
				    "S.zfill(width) -> unicode\n"
				    "\n"
				    "Pad a numeric string x with zeros on the left, to fill a field\n"
				    "of the specified width. The string x is never truncated.";

				/* unicode.zfill(width): when self is already at least width long, return
				   self with a new reference.  Otherwise left-pad with '0' via pad() and,
				   if the original first character was '+' or '-', move that sign to the
				   front of the padded result.  Returns NULL when argument parsing fails
				   or pad() returns NULL. */
				static PyObject *
				unicode_zfill(PyUnicodeObject *self, PyObject *args)
				{
				    int fill;
				    PyUnicodeObject *u;

				    int width;
				    if (!PyArg_ParseTuple(args, "i:zfill", &width))
				        return NULL;

				    if (self->length >= width) {
				        Py_INCREF(self);
				        return (PyObject *) self;
				    }

				    fill = width - self->length;

				    u = pad(self, fill, 0, '0');
				    /* NOTE(review): assumes pad() reports failure by returning NULL;
				       without this check u->str below would dereference NULL. */
				    if (u == NULL)
				        return NULL;

				    if (u->str[fill] == '+' || u->str[fill] == '-') {
				        /* move sign to beginning of string */
				        u->str[0] = u->str[fill];
				        u->str[fill] = '0';
				    }

				    return (PyObject *) u;
				}
				#endif
				3792
				#if 0
				/* Debug-only method (compiled out): return unicode_freelist_size as a
				   Python int.  Returns NULL when PyArg_NoArgs() rejects args. */
				static PyObject *
				unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
				{
				    if (!PyArg_NoArgs(args))
				        return NULL;
				    return PyInt_FromLong(unicode_freelist_size);
				}
				#endif
				3802
				3803	static char startswith__doc__[] =
				3804	"S.startswith(prefix[, start[, end]]) -> int\n\
				3805	\n\
				3806	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3807	optional start, test S beginning at that position. With optional end, stop\n\
				3808	comparing S at that position.";
				3809
				3810	static PyObject *
				3811	unicode_startswith(PyUnicodeObject *self,
				3812	PyObject *args)
				3813	{
				3814	PyUnicodeObject *substring;
				3815	int start = 0;
				3816	int end = INT_MAX;
				3817	PyObject *result;
				3818
				3819	if (!PyArg_ParseTuple(args, "O\|ii:startswith", &substring, &start, &end))
				3820	return NULL;
				3821	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3822	(PyObject *)substring);
				3823	if (substring == NULL)
				3824	return NULL;
				3825
				3826	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				3827
				3828	Py_DECREF(substring);
				3829	return result;
				3830	}
				3831
				3832
				3833	static char endswith__doc__[] =
				3834	"S.endswith(suffix[, start[, end]]) -> int\n\
				3835	\n\
				3836	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				3837	optional start, test S beginning at that position. With optional end, stop\n\
				3838	comparing S at that position.";
				3839
				3840	static PyObject *
				3841	unicode_endswith(PyUnicodeObject *self,
				3842	PyObject *args)
				3843	{
				3844	PyUnicodeObject *substring;
				3845	int start = 0;
				3846	int end = INT_MAX;
				3847	PyObject *result;
				3848
				3849	if (!PyArg_ParseTuple(args, "O\|ii:endswith", &substring, &start, &end))
				3850	return NULL;
				3851	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3852	(PyObject *)substring);
				3853	if (substring == NULL)
				3854	return NULL;
				3855
				3856	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				3857
				3858	Py_DECREF(substring);
				3859	return result;
				3860	}
				3861
				3862
				3863	static PyMethodDef unicode_methods[] = {
				3864
				3865	/* Order is according to common usage: often used methods should
				3866	appear first, since lookup is done sequentially. */
				3867
				3868	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				3869	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				3870	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				3871	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				3872	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				3873	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				3874	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				3875	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				3876	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				3877	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				3878	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				3879	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				3880	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				3881	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				3882	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				3883	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				3884	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				3885	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				3886	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				3887	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				3888	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				3889	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				3890	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				3891	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				3892	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				3893	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				3894	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				3895	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				3896	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				3897	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				3898	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				3899	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				3900	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				3901	#if 0
				3902	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				3903	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				3904	#endif
				3905
				3906	#if 0
				3907	/* This one is just used for debugging the implementation. */
				3908	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				3909	#endif
				3910
				3911	{NULL, NULL}
				3912	};
				3913
				3914	static PyObject *
				3915	unicode_getattr(PyUnicodeObject self, char name)
				3916	{
				3917	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				3918	}
				3919
				3920	static PySequenceMethods unicode_as_sequence = {
				3921	(inquiry) unicode_length, /* sq_length */
				3922	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				3923	(intargfunc) unicode_repeat, /* sq_repeat */
				3924	(intargfunc) unicode_getitem, /* sq_item */
				3925	(intintargfunc) unicode_slice, /* sq_slice */
				3926	0, /* sq_ass_item */
				3927	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3928	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3929	};
				3930
				3931	static int
				3932	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				3933	int index,
				3934	const void **ptr)
				3935	{
				3936	if (index != 0) {
				3937	PyErr_SetString(PyExc_SystemError,
				3938	"accessing non-existent unicode segment");
				3939	return -1;
				3940	}
				3941	ptr = (void ) self->str;
				3942	return PyUnicode_GET_DATA_SIZE(self);
				3943	}
				3944
				3945	static int
				3946	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				3947	const void **ptr)
				3948	{
				3949	PyErr_SetString(PyExc_TypeError,
				3950	"cannot use unicode as modifyable buffer");
				3951	return -1;
				3952	}
				3953
				3954	static int
				3955	unicode_buffer_getsegcount(PyUnicodeObject *self,
				3956	int *lenp)
				3957	{
				3958	if (lenp)
				3959	*lenp = PyUnicode_GET_DATA_SIZE(self);
				3960	return 1;
				3961	}
				3962
				3963	static int
				3964	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				3965	int index,
				3966	const void **ptr)
				3967	{
				3968	PyObject *str;
				3969
				3970	if (index != 0) {
				3971	PyErr_SetString(PyExc_SystemError,
				3972	"accessing non-existent unicode segment");
				3973	return -1;
				3974	}
				3975	str = utf8_string(self, NULL);
				3976	if (str == NULL)
				3977	return -1;
				3978	ptr = (void ) PyString_AS_STRING(str);
				3979	return PyString_GET_SIZE(str);
				3980	}
				3981
				3982	/* Helpers for PyUnicode_Format() */
				3983
				3984	static PyObject *
				3985	getnextarg(args, arglen, p_argidx)
				3986	PyObject *args;
				3987	int arglen;
				3988	int *p_argidx;
				3989	{
				3990	int argidx = *p_argidx;
				3991	if (argidx < arglen) {
				3992	(*p_argidx)++;
				3993	if (arglen < 0)
				3994	return args;
				3995	else
				3996	return PyTuple_GetItem(args, argidx);
				3997	}
				3998	PyErr_SetString(PyExc_TypeError,
				3999	"not enough arguments for format string");
				4000	return NULL;
				4001	}
				4002
				/* Bit flags collected while parsing a format specifier; the character
				   that sets each one in PyUnicode_Format() is shown alongside. */
				#define F_LJUST 0x01    /* '-' */
				#define F_SIGN  0x02    /* '+' */
				#define F_BLANK 0x04    /* ' ' */
				#define F_ALT   0x08    /* '#' */
				#define F_ZERO  0x10    /* '0' */
				4008
				4009	static
				4010	#ifdef HAVE_STDARG_PROTOTYPES
				4011	int usprintf(register Py_UNICODE buffer, char format, ...)
				4012	#else
				4013	int usprintf(va_alist) va_dcl
				4014	#endif
				4015	{
				4016	register int i;
				4017	int len;
				4018	va_list va;
				4019	char *charbuffer;
				4020	#ifdef HAVE_STDARG_PROTOTYPES
				4021	va_start(va, format);
				4022	#else
				4023	Py_UNICODE *args;
				4024	char *format;
				4025
				4026	va_start(va);
				4027	buffer = va_arg(va, Py_UNICODE *);
				4028	format = va_arg(va, char *);
				4029	#endif
				4030
				4031	/* First, format the string as char array, then expand to Py_UNICODE
				4032	array. */
				4033	charbuffer = (char *)buffer;
				4034	len = vsprintf(charbuffer, format, va);
				4035	for (i = len - 1; i >= 0; i--)
				4036	buffer[i] = (Py_UNICODE) charbuffer[i];
				4037
				4038	va_end(va);
				4039	return len;
				4040	}
				4041
				4042	static int
				4043	formatfloat(Py_UNICODE *buf,
				4044	int flags,
				4045	int prec,
				4046	int type,
				4047	PyObject *v)
				4048	{
				4049	char fmt[20];
				4050	double x;
				4051
				4052	x = PyFloat_AsDouble(v);
				4053	if (x == -1.0 && PyErr_Occurred())
				4054	return -1;
				4055	if (prec < 0)
				4056	prec = 6;
				4057	if (prec > 50)
				4058	prec = 50; /* Arbitrary limitation */
				4059	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4060	type = 'g';
				4061	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4062	return usprintf(buf, fmt, x);
				4063	}
				4064
				4065	static int
				4066	formatint(Py_UNICODE *buf,
				4067	int flags,
				4068	int prec,
				4069	int type,
				4070	PyObject *v)
				4071	{
				4072	char fmt[20];
				4073	long x;
				4074
				4075	x = PyInt_AsLong(v);
				4076	if (x == -1 && PyErr_Occurred())
				4077	return -1;
				4078	if (prec < 0)
				4079	prec = 1;
				4080	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4081	return usprintf(buf, fmt, x);
				4082	}
				4083
				4084	static int
				4085	formatchar(Py_UNICODE *buf,
				4086	PyObject *v)
				4087	{
				4088	if (PyUnicode_Check(v))
				4089	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4090
				4091	else if (PyString_Check(v))
				4092	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4093
				4094	else {
				4095	/* Integer input truncated to a character */
				4096	long x;
				4097	x = PyInt_AsLong(v);
				4098	if (x == -1 && PyErr_Occurred())
				4099	return -1;
				4100	buf[0] = (char) x;
				4101	}
				4102	buf[1] = '\0';
				4103	return 1;
				4104	}
				4105
				4106	PyObject PyUnicode_Format(PyObject format,
				4107	PyObject *args)
				4108	{
				4109	Py_UNICODE fmt, res;
				4110	int fmtcnt, rescnt, reslen, arglen, argidx;
				4111	int args_owned = 0;
				4112	PyUnicodeObject *result = NULL;
				4113	PyObject *dict = NULL;
				4114	PyObject *uformat;
				4115
				4116	if (format == NULL \|\| args == NULL) {
				4117	PyErr_BadInternalCall();
				4118	return NULL;
				4119	}
				4120	uformat = PyUnicode_FromObject(format);
				4121	fmt = PyUnicode_AS_UNICODE(uformat);
				4122	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4123
				4124	reslen = rescnt = fmtcnt + 100;
				4125	result = _PyUnicode_New(reslen);
				4126	if (result == NULL)
				4127	goto onError;
				4128	res = PyUnicode_AS_UNICODE(result);
				4129
				4130	if (PyTuple_Check(args)) {
				4131	arglen = PyTuple_Size(args);
				4132	argidx = 0;
				4133	}
				4134	else {
				4135	arglen = -1;
				4136	argidx = -2;
				4137	}
				4138	if (args->ob_type->tp_as_mapping)
				4139	dict = args;
				4140
				4141	while (--fmtcnt >= 0) {
				4142	if (*fmt != '%') {
				4143	if (--rescnt < 0) {
				4144	rescnt = fmtcnt + 100;
				4145	reslen += rescnt;
				4146	if (_PyUnicode_Resize(result, reslen) < 0)
				4147	return NULL;
				4148	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4149	--rescnt;
				4150	}
				4151	res++ = fmt++;
				4152	}
				4153	else {
				4154	/* Got a format specifier */
				4155	int flags = 0;
				4156	int width = -1;
				4157	int prec = -1;
				4158	int size = 0;
				4159	Py_UNICODE c = '\0';
				4160	Py_UNICODE fill;
				4161	PyObject *v = NULL;
				4162	PyObject *temp = NULL;
				4163	Py_UNICODE *buf;
				4164	Py_UNICODE sign;
				4165	int len;
				4166	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4167
				4168	fmt++;
				4169	if (*fmt == '(') {
				4170	Py_UNICODE *keystart;
				4171	int keylen;
				4172	PyObject *key;
				4173	int pcount = 1;
				4174
				4175	if (dict == NULL) {
				4176	PyErr_SetString(PyExc_TypeError,
				4177	"format requires a mapping");
				4178	goto onError;
				4179	}
				4180	++fmt;
				4181	--fmtcnt;
				4182	keystart = fmt;
				4183	/* Skip over balanced parentheses */
				4184	while (pcount > 0 && --fmtcnt >= 0) {
				4185	if (*fmt == ')')
				4186	--pcount;
				4187	else if (*fmt == '(')
				4188	++pcount;
				4189	fmt++;
				4190	}
				4191	keylen = fmt - keystart - 1;
				4192	if (fmtcnt < 0 \|\| pcount > 0) {
				4193	PyErr_SetString(PyExc_ValueError,
				4194	"incomplete format key");
				4195	goto onError;
				4196	}
				4197	/* keys are converted to strings (using UTF-8) and
				4198	then looked up since Python uses strings to hold
				4199	variables names etc. in its namespaces and we
				4200	wouldn't want to break common idioms. The
				4201	alternative would be using Unicode objects for the
				4202	lookup but u"abc" and "abc" have different hash
				4203	values (on purpose). */
				4204	key = PyUnicode_EncodeUTF8(keystart,
				4205	keylen,
				4206	NULL);
				4207	if (key == NULL)
				4208	goto onError;
				4209	if (args_owned) {
				4210	Py_DECREF(args);
				4211	args_owned = 0;
				4212	}
				4213	args = PyObject_GetItem(dict, key);
				4214	Py_DECREF(key);
				4215	if (args == NULL) {
				4216	goto onError;
				4217	}
				4218	args_owned = 1;
				4219	arglen = -1;
				4220	argidx = -2;
				4221	}
				4222	while (--fmtcnt >= 0) {
				4223	switch (c = *fmt++) {
				4224	case '-': flags \|= F_LJUST; continue;
				4225	case '+': flags \|= F_SIGN; continue;
				4226	case ' ': flags \|= F_BLANK; continue;
				4227	case '#': flags \|= F_ALT; continue;
				4228	case '0': flags \|= F_ZERO; continue;
				4229	}
				4230	break;
				4231	}
				4232	if (c == '*') {
				4233	v = getnextarg(args, arglen, &argidx);
				4234	if (v == NULL)
				4235	goto onError;
				4236	if (!PyInt_Check(v)) {
				4237	PyErr_SetString(PyExc_TypeError,
				4238	"* wants int");
				4239	goto onError;
				4240	}
				4241	width = PyInt_AsLong(v);
				4242	if (width < 0) {
				4243	flags \|= F_LJUST;
				4244	width = -width;
				4245	}
				4246	if (--fmtcnt >= 0)
				4247	c = *fmt++;
				4248	}
				4249	else if (c >= '0' && c <= '9') {
				4250	width = c - '0';
				4251	while (--fmtcnt >= 0) {
				4252	c = *fmt++;
				4253	if (c < '0' \|\| c > '9')
				4254	break;
				4255	if ((width*10) / 10 != width) {
				4256	PyErr_SetString(PyExc_ValueError,
				4257	"width too big");
				4258	goto onError;
				4259	}
				4260	width = width*10 + (c - '0');
				4261	}
				4262	}
				4263	if (c == '.') {
				4264	prec = 0;
				4265	if (--fmtcnt >= 0)
				4266	c = *fmt++;
				4267	if (c == '*') {
				4268	v = getnextarg(args, arglen, &argidx);
				4269	if (v == NULL)
				4270	goto onError;
				4271	if (!PyInt_Check(v)) {
				4272	PyErr_SetString(PyExc_TypeError,
				4273	"* wants int");
				4274	goto onError;
				4275	}
				4276	prec = PyInt_AsLong(v);
				4277	if (prec < 0)
				4278	prec = 0;
				4279	if (--fmtcnt >= 0)
				4280	c = *fmt++;
				4281	}
				4282	else if (c >= '0' && c <= '9') {
				4283	prec = c - '0';
				4284	while (--fmtcnt >= 0) {
				4285	c = Py_CHARMASK(*fmt++);
				4286	if (c < '0' \|\| c > '9')
				4287	break;
				4288	if ((prec*10) / 10 != prec) {
				4289	PyErr_SetString(PyExc_ValueError,
				4290	"prec too big");
				4291	goto onError;
				4292	}
				4293	prec = prec*10 + (c - '0');
				4294	}
				4295	}
				4296	} /* prec */
				4297	if (fmtcnt >= 0) {
				4298	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4299	size = c;
				4300	if (--fmtcnt >= 0)
				4301	c = *fmt++;
				4302	}
				4303	}
				4304	if (fmtcnt < 0) {
				4305	PyErr_SetString(PyExc_ValueError,
				4306	"incomplete format");
				4307	goto onError;
				4308	}
				4309	if (c != '%') {
				4310	v = getnextarg(args, arglen, &argidx);
				4311	if (v == NULL)
				4312	goto onError;
				4313	}
				4314	sign = 0;
				4315	fill = ' ';
				4316	switch (c) {
				4317
				4318	case '%':
				4319	buf = tmpbuf;
				4320	buf[0] = '%';
				4321	len = 1;
				4322	break;
				4323
				4324	case 's':
				4325	case 'r':
				4326	if (PyUnicode_Check(v) && c == 's') {
				4327	temp = v;
				4328	Py_INCREF(temp);
				4329	}
				4330	else {
				4331	PyObject *unicode;
				4332	if (c == 's')
				4333	temp = PyObject_Str(v);
				4334	else
				4335	temp = PyObject_Repr(v);
				4336	if (temp == NULL)
				4337	goto onError;
				4338	if (!PyString_Check(temp)) {
				4339	/* XXX Note: this should never happen, since
				4340	PyObject_Repr() and PyObject_Str() assure
				4341	this */
				4342	Py_DECREF(temp);
				4343	PyErr_SetString(PyExc_TypeError,
				4344	"%s argument has non-string str()");
				4345	goto onError;
				4346	}
				4347	unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
				4348	PyString_GET_SIZE(temp),
				4349	"strict");
				4350	Py_DECREF(temp);
				4351	temp = unicode;
				4352	if (temp == NULL)
				4353	goto onError;
				4354	}
				4355	buf = PyUnicode_AS_UNICODE(temp);
				4356	len = PyUnicode_GET_SIZE(temp);
				4357	if (prec >= 0 && len > prec)
				4358	len = prec;
				4359	break;
				4360
				4361	case 'i':
				4362	case 'd':
				4363	case 'u':
				4364	case 'o':
				4365	case 'x':
				4366	case 'X':
				4367	if (c == 'i')
				4368	c = 'd';
				4369	buf = tmpbuf;
				4370	len = formatint(buf, flags, prec, c, v);
				4371	if (len < 0)
				4372	goto onError;
				4373	sign = (c == 'd');
				4374	if (flags & F_ZERO) {
				4375	fill = '0';
				4376	if ((flags&F_ALT) &&
				4377	(c == 'x' \|\| c == 'X') &&
				4378	buf[0] == '0' && buf[1] == c) {
				4379	res++ = buf++;
				4380	res++ = buf++;
				4381	rescnt -= 2;
				4382	len -= 2;
				4383	width -= 2;
				4384	if (width < 0)
				4385	width = 0;
				4386	}
				4387	}
				4388	break;
				4389
				4390	case 'e':
				4391	case 'E':
				4392	case 'f':
				4393	case 'g':
				4394	case 'G':
				4395	buf = tmpbuf;
				4396	len = formatfloat(buf, flags, prec, c, v);
				4397	if (len < 0)
				4398	goto onError;
				4399	sign = 1;
				4400	if (flags&F_ZERO)
				4401	fill = '0';
				4402	break;
				4403
				4404	case 'c':
				4405	buf = tmpbuf;
				4406	len = formatchar(buf, v);
				4407	if (len < 0)
				4408	goto onError;
				4409	break;
				4410
				4411	default:
				4412	PyErr_Format(PyExc_ValueError,
				4413	"unsupported format character '%c' (0x%x)",
				4414	c, c);
				4415	goto onError;
				4416	}
				4417	if (sign) {
				4418	if (buf == '-' \|\| buf == '+') {
				4419	sign = *buf++;
				4420	len--;
				4421	}
				4422	else if (flags & F_SIGN)
				4423	sign = '+';
				4424	else if (flags & F_BLANK)
				4425	sign = ' ';
				4426	else
				4427	sign = 0;
				4428	}
				4429	if (width < len)
				4430	width = len;
				4431	if (rescnt < width + (sign != 0)) {
				4432	reslen -= rescnt;
				4433	rescnt = width + fmtcnt + 100;
				4434	reslen += rescnt;
				4435	if (_PyUnicode_Resize(result, reslen) < 0)
				4436	return NULL;
				4437	res = PyUnicode_AS_UNICODE(result)
				4438	+ reslen - rescnt;
				4439	}
				4440	if (sign) {
				4441	if (fill != ' ')
				4442	*res++ = sign;
				4443	rescnt--;
				4444	if (width > len)
				4445	width--;
				4446	}
				4447	if (width > len && !(flags & F_LJUST)) {
				4448	do {
				4449	--rescnt;
				4450	*res++ = fill;
				4451	} while (--width > len);
				4452	}
				4453	if (sign && fill == ' ')
				4454	*res++ = sign;
				4455	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4456	res += len;
				4457	rescnt -= len;
				4458	while (--width >= len) {
				4459	--rescnt;
				4460	*res++ = ' ';
				4461	}
				4462	if (dict && (argidx < arglen) && c != '%') {
				4463	PyErr_SetString(PyExc_TypeError,
				4464	"not all arguments converted");
				4465	goto onError;
				4466	}
				4467	Py_XDECREF(temp);
				4468	} /* '%' */
				4469	} /* until end */
				4470	if (argidx < arglen && !dict) {
				4471	PyErr_SetString(PyExc_TypeError,
				4472	"not all arguments converted");
				4473	goto onError;
				4474	}
				4475
				4476	if (args_owned) {
				4477	Py_DECREF(args);
				4478	}
				4479	Py_DECREF(uformat);
				4480	_PyUnicode_Resize(result, reslen - rescnt);
				4481	return (PyObject *)result;
				4482
				4483	onError:
				4484	Py_XDECREF(result);
				4485	Py_DECREF(uformat);
				4486	if (args_owned) {
				4487	Py_DECREF(args);
				4488	}
				4489	return NULL;
				4490	}
				4491
				4492	static PyBufferProcs unicode_as_buffer = {
				4493	(getreadbufferproc) unicode_buffer_getreadbuf,
				4494	(getwritebufferproc) unicode_buffer_getwritebuf,
				4495	(getsegcountproc) unicode_buffer_getsegcount,
				4496	(getcharbufferproc) unicode_buffer_getcharbuf,
				4497	};
				4498
				4499	PyTypeObject PyUnicode_Type = {
				4500	PyObject_HEAD_INIT(&PyType_Type)
				4501	0, /* ob_size */
				4502	"unicode", /* tp_name */
				4503	sizeof(PyUnicodeObject), /* tp_size */
				4504	0, /* tp_itemsize */
				4505	/* Slots */
				4506	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4507	0, /* tp_print */
				4508	(getattrfunc)unicode_getattr, /* tp_getattr */
				4509	0, /* tp_setattr */
				4510	(cmpfunc) unicode_compare, /* tp_compare */
				4511	(reprfunc) unicode_repr, /* tp_repr */
				4512	0, /* tp_as_number */
				4513	&unicode_as_sequence, /* tp_as_sequence */
				4514	0, /* tp_as_mapping */
				4515	(hashfunc) unicode_hash, /* tp_hash*/
				4516	0, /* tp_call*/
				4517	(reprfunc) unicode_str, /* tp_str */
				4518	(getattrofunc) NULL, /* tp_getattro */
				4519	(setattrofunc) NULL, /* tp_setattro */
				4520	&unicode_as_buffer, /* tp_as_buffer */
				4521	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4522	};
				4523
				4524	/* Initialize the Unicode implementation */
				4525
				4526	void _PyUnicode_Init()
				4527	{
				4528	/* Doublecheck the configuration... */
				4529	if (sizeof(Py_UNICODE) != 2)
				4530	Py_FatalError("Unicode configuration error: "
				4531	"sizeof(Py_UNICODE) != 2 bytes");
				4532
				4533	unicode_empty = _PyUnicode_New(0);
				4534	}
				4535
				4536	/* Finalize the Unicode implementation */
				4537
				4538	void
				4539	_PyUnicode_Fini()
				4540	{
				4541	PyUnicodeObject *u = unicode_freelist;
				4542
				4543	while (u != NULL) {
				4544	PyUnicodeObject *v = u;
				4545	u = (PyUnicodeObject *)u;
				4546	free(v);
				4547	}
				4548	Py_XDECREF(unicode_empty);
				4549	}