Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: e4bbcff20cb0548ab700c167d78fa427663c5eaa [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
				4	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	79	/* Limit for the Unicode object free list */
				80
				81	#define MAX_UNICODE_FREELIST_SIZE 1024
				82
				83	/* Limit for the Unicode object free list stay alive optimization.
				84
				85	The implementation will keep allocated Unicode memory intact for
				86	all objects on the free list having a size less than this
				87	limit. This reduces malloc() overhead for small Unicode objects.
				88
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	89	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	90	(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
				91	malloc()-overhead) bytes of unused garbage.
				92
				93	Setting the limit to 0 effectively turns the feature off.
				94
				95	XXX The feature is currently turned off because there are
				96	apparently some lingering bugs in its implementation which I
				97	haven't yet been able to sort out.
				98
				99	*/
				100
				101	#define STAYALIVE_SIZE_LIMIT 0
				102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
				111	/* --- Globals ------------------------------------------------------------ */
				112
				113	/* The empty Unicode object */
				114	static PyUnicodeObject *unicode_empty = NULL;
				115
				116	/* Free list for Unicode objects */
				117	static PyUnicodeObject *unicode_freelist = NULL;
				118	static int unicode_freelist_size = 0;
				119
				120	/* --- Unicode Object ----------------------------------------------------- */
				121
				122	static
				123	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				124	int length)
				125	{
				126	void *oldstr;
				127
				128	/* Shortcut if there's nothing to do. */
				129	if (unicode->length == length)
				130	return 0;
				131
				132	/* Resizing unicode_empty is not allowed. */
				133	if (unicode == unicode_empty) {
				134	PyErr_SetString(PyExc_SystemError,
				135	"can't resize empty unicode object");
				136	return -1;
				137	}
				138
				139	/* We allocate one more byte to make sure the string is
				140	Ux0000 terminated -- XXX is this needed ? */
				141	oldstr = unicode->str;
				142	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				143	if (!unicode->str) {
				144	unicode->str = oldstr;
				145	PyErr_NoMemory();
				146	return -1;
				147	}
				148	unicode->str[length] = 0;
				149	unicode->length = length;
				150
				151	/* Reset the object caches */
				152	if (unicode->utf8str) {
				153	Py_DECREF(unicode->utf8str);
				154	unicode->utf8str = NULL;
				155	}
				156	unicode->hash = -1;
				157
				158	return 0;
				159	}
				160
				161	/* We allocate one more byte to make sure the string is
				162	Ux0000 terminated -- XXX is this needed ?
				163
				164	XXX This allocator could further be enhanced by assuring that the
				165	free list never reduces its size below 1.
				166
				167	*/
				168
				169	static
				170	PyUnicodeObject *_PyUnicode_New(int length)
				171	{
				172	register PyUnicodeObject *unicode;
				173
				174	/* Optimization for empty strings */
				175	if (length == 0 && unicode_empty != NULL) {
				176	Py_INCREF(unicode_empty);
				177	return unicode_empty;
				178	}
				179
				180	/* Unicode freelist & memory allocation */
				181	if (unicode_freelist) {
				182	unicode = unicode_freelist;
				183	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				184	unicode_freelist_size--;
				185	unicode->ob_type = &PyUnicode_Type;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	186	_Py_NewReference((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	187	if (unicode->str) {
				188	if (unicode->length < length &&
				189	_PyUnicode_Resize(unicode, length)) {
				190	free(unicode->str);
				191	PyMem_DEL(unicode);
				192	return NULL;
				193	}
				194	}
				195	else
				196	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				197	}
				198	else {
				199	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				200	if (unicode == NULL)
				201	return NULL;
				202	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				203	}
				204
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	205	if (!unicode->str)
				206	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	207	unicode->str[length] = 0;
				208	unicode->length = length;
				209	unicode->hash = -1;
				210	unicode->utf8str = NULL;
				211	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212
				213	onError:
				214	_Py_ForgetReference((PyObject *)unicode);
				215	PyMem_DEL(unicode);
				216	PyErr_NoMemory();
				217	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	}
				219
				220	static
				221	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				222	{
				223	Py_XDECREF(unicode->utf8str);
				224	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
				225	if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
				226	free(unicode->str);
				227	unicode->str = NULL;
				228	unicode->length = 0;
				229	}
				230	(PyUnicodeObject *)unicode = unicode_freelist;
				231	unicode_freelist = unicode;
				232	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	}
				234	else {
				235	free(unicode->str);
				236	PyMem_DEL(unicode);
				237	}
				238	}
				239
				240	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				241	int size)
				242	{
				243	PyUnicodeObject *unicode;
				244
				245	unicode = _PyUnicode_New(size);
				246	if (!unicode)
				247	return NULL;
				248
				249	/* Copy the Unicode data into the new object */
				250	if (u != NULL)
				251	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				252
				253	return (PyObject *)unicode;
				254	}
				255
				256	#ifdef HAVE_WCHAR_H
				257
				258	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				259	int size)
				260	{
				261	PyUnicodeObject *unicode;
				262
				263	if (w == NULL) {
				264	PyErr_BadInternalCall();
				265	return NULL;
				266	}
				267
				268	unicode = _PyUnicode_New(size);
				269	if (!unicode)
				270	return NULL;
				271
				272	/* Copy the wchar_t data into the new object */
				273	#ifdef HAVE_USABLE_WCHAR_T
				274	memcpy(unicode->str, w, size * sizeof(wchar_t));
				275	#else
				276	{
				277	register Py_UNICODE *u;
				278	register int i;
				279	u = PyUnicode_AS_UNICODE(unicode);
				280	for (i = size; i >= 0; i--)
				281	u++ = w++;
				282	}
				283	#endif
				284
				285	return (PyObject *)unicode;
				286	}
				287
				288	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				289	register wchar_t *w,
				290	int size)
				291	{
				292	if (unicode == NULL) {
				293	PyErr_BadInternalCall();
				294	return -1;
				295	}
				296	if (size > PyUnicode_GET_SIZE(unicode))
				297	size = PyUnicode_GET_SIZE(unicode);
				298	#ifdef HAVE_USABLE_WCHAR_T
				299	memcpy(w, unicode->str, size * sizeof(wchar_t));
				300	#else
				301	{
				302	register Py_UNICODE *u;
				303	register int i;
				304	u = PyUnicode_AS_UNICODE(unicode);
				305	for (i = size; i >= 0; i--)
				306	w++ = u++;
				307	}
				308	#endif
				309
				310	return size;
				311	}
				312
				313	#endif
				314
				315	PyObject PyUnicode_FromObject(register PyObject obj)
				316	{
				317	const char *s;
				318	int len;
				319
				320	if (obj == NULL) {
				321	PyErr_BadInternalCall();
				322	return NULL;
				323	}
				324	else if (PyUnicode_Check(obj)) {
				325	Py_INCREF(obj);
				326	return obj;
				327	}
				328	else if (PyString_Check(obj)) {
				329	s = PyString_AS_STRING(obj);
				330	len = PyString_GET_SIZE(obj);
				331	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	332	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				333	/* Overwrite the error message with something more useful in
				334	case of a TypeError. */
				335	if (PyErr_ExceptionMatches(PyExc_TypeError))
				336	PyErr_SetString(PyExc_TypeError,
				337	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	338	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	339	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	340	if (len == 0) {
				341	Py_INCREF(unicode_empty);
				342	return (PyObject *)unicode_empty;
				343	}
				344	return PyUnicode_DecodeUTF8(s, len, "strict");
				345	}
				346
				347	PyObject PyUnicode_Decode(const char s,
				348	int size,
				349	const char *encoding,
				350	const char *errors)
				351	{
				352	PyObject buffer = NULL, unicode;
				353
				354	/* Shortcut for the default encoding UTF-8 */
				355	if (encoding == NULL \|\|
				356	(strcmp(encoding, "utf-8") == 0))
				357	return PyUnicode_DecodeUTF8(s, size, errors);
				358
				359	/* Decode via the codec registry */
				360	buffer = PyBuffer_FromMemory((void *)s, size);
				361	if (buffer == NULL)
				362	goto onError;
				363	unicode = PyCodec_Decode(buffer, encoding, errors);
				364	if (unicode == NULL)
				365	goto onError;
				366	if (!PyUnicode_Check(unicode)) {
				367	PyErr_Format(PyExc_TypeError,
				368	"decoder did not return an unicode object (type=%s)",
				369	unicode->ob_type->tp_name);
				370	Py_DECREF(unicode);
				371	goto onError;
				372	}
				373	Py_DECREF(buffer);
				374	return unicode;
				375
				376	onError:
				377	Py_XDECREF(buffer);
				378	return NULL;
				379	}
				380
				381	PyObject PyUnicode_Encode(const Py_UNICODE s,
				382	int size,
				383	const char *encoding,
				384	const char *errors)
				385	{
				386	PyObject v, unicode;
				387
				388	unicode = PyUnicode_FromUnicode(s, size);
				389	if (unicode == NULL)
				390	return NULL;
				391	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				392	Py_DECREF(unicode);
				393	return v;
				394	}
				395
				396	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				397	const char *encoding,
				398	const char *errors)
				399	{
				400	PyObject *v;
				401
				402	if (!PyUnicode_Check(unicode)) {
				403	PyErr_BadArgument();
				404	goto onError;
				405	}
				406	/* Shortcut for the default encoding UTF-8 */
				407	if ((encoding == NULL \|\|
				408	(strcmp(encoding, "utf-8") == 0)) &&
				409	errors == NULL)
				410	return PyUnicode_AsUTF8String(unicode);
				411
				412	/* Encode via the codec registry */
				413	v = PyCodec_Encode(unicode, encoding, errors);
				414	if (v == NULL)
				415	goto onError;
				416	/* XXX Should we really enforce this ? */
				417	if (!PyString_Check(v)) {
				418	PyErr_Format(PyExc_TypeError,
				419	"encoder did not return a string object (type=%s)",
				420	v->ob_type->tp_name);
				421	Py_DECREF(v);
				422	goto onError;
				423	}
				424	return v;
				425
				426	onError:
				427	return NULL;
				428	}
				429
				430	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				431	{
				432	if (!PyUnicode_Check(unicode)) {
				433	PyErr_BadArgument();
				434	goto onError;
				435	}
				436	return PyUnicode_AS_UNICODE(unicode);
				437
				438	onError:
				439	return NULL;
				440	}
				441
				442	int PyUnicode_GetSize(PyObject *unicode)
				443	{
				444	if (!PyUnicode_Check(unicode)) {
				445	PyErr_BadArgument();
				446	goto onError;
				447	}
				448	return PyUnicode_GET_SIZE(unicode);
				449
				450	onError:
				451	return -1;
				452	}
				453
				454	/* --- UTF-8 Codec -------------------------------------------------------- */
				455
				456	static
				457	char utf8_code_length[256] = {
				458	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				459	illegal prefix. see RFC 2279 for details */
				460	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				461	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				462	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				463	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				464	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				465	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				466	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				467	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				468	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				469	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				470	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				471	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				472	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				473	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				474	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				475	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				476	};
				477
				478	static
				479	int utf8_decoding_error(const char **source,
				480	Py_UNICODE **dest,
				481	const char *errors,
				482	const char *details)
				483	{
				484	if ((errors == NULL) \|\|
				485	(strcmp(errors,"strict") == 0)) {
				486	PyErr_Format(PyExc_UnicodeError,
				487	"UTF-8 decoding error: %s",
				488	details);
				489	return -1;
				490	}
				491	else if (strcmp(errors,"ignore") == 0) {
				492	(*source)++;
				493	return 0;
				494	}
				495	else if (strcmp(errors,"replace") == 0) {
				496	(*source)++;
				497	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				498	(*dest)++;
				499	return 0;
				500	}
				501	else {
				502	PyErr_Format(PyExc_ValueError,
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	503	"UTF-8 decoding error; unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	504	errors);
				505	return -1;
				506	}
				507	}
				508
				509	#define UTF8_ERROR(details) do { \
				510	if (utf8_decoding_error(&s, &p, errors, details)) \
				511	goto onError; \
				512	continue; \
				513	} while (0)
				514
				515	PyObject PyUnicode_DecodeUTF8(const char s,
				516	int size,
				517	const char *errors)
				518	{
				519	int n;
				520	const char *e;
				521	PyUnicodeObject *unicode;
				522	Py_UNICODE *p;
				523
				524	/* Note: size will always be longer than the resulting Unicode
				525	character count */
				526	unicode = _PyUnicode_New(size);
				527	if (!unicode)
				528	return NULL;
				529	if (size == 0)
				530	return (PyObject *)unicode;
				531
				532	/* Unpack UTF-8 encoded data */
				533	p = unicode->str;
				534	e = s + size;
				535
				536	while (s < e) {
				537	register Py_UNICODE ch = (unsigned char)*s;
				538
				539	if (ch < 0x80) {
				540	*p++ = ch;
				541	s++;
				542	continue;
				543	}
				544
				545	n = utf8_code_length[ch];
				546
				547	if (s + n > e)
				548	UTF8_ERROR("unexpected end of data");
				549
				550	switch (n) {
				551
				552	case 0:
				553	UTF8_ERROR("unexpected code byte");
				554	break;
				555
				556	case 1:
				557	UTF8_ERROR("internal error");
				558	break;
				559
				560	case 2:
				561	if ((s[1] & 0xc0) != 0x80)
				562	UTF8_ERROR("invalid data");
				563	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				564	if (ch < 0x80)
				565	UTF8_ERROR("illegal encoding");
				566	else
				567	*p++ = ch;
				568	break;
				569
				570	case 3:
				571	if ((s[1] & 0xc0) != 0x80 \|\|
				572	(s[2] & 0xc0) != 0x80)
				573	UTF8_ERROR("invalid data");
				574	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				575	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				576	UTF8_ERROR("illegal encoding");
				577	else
				578	*p++ = ch;
				579	break;
				580
				581	default:
				582	/* Other sizes are only needed for UCS-4 */
				583	UTF8_ERROR("unsupported Unicode code range");
				584	}
				585	s += n;
				586	}
				587
				588	/* Adjust length */
				589	if (_PyUnicode_Resize(unicode, p - unicode->str))
				590	goto onError;
				591
				592	return (PyObject *)unicode;
				593
				594	onError:
				595	Py_DECREF(unicode);
				596	return NULL;
				597	}
				598
				599	#undef UTF8_ERROR
				600
				601	static
				602	int utf8_encoding_error(const Py_UNICODE **source,
				603	char **dest,
				604	const char *errors,
				605	const char *details)
				606	{
				607	if ((errors == NULL) \|\|
				608	(strcmp(errors,"strict") == 0)) {
				609	PyErr_Format(PyExc_UnicodeError,
				610	"UTF-8 encoding error: %s",
				611	details);
				612	return -1;
				613	}
				614	else if (strcmp(errors,"ignore") == 0) {
				615	return 0;
				616	}
				617	else if (strcmp(errors,"replace") == 0) {
				618	**dest = '?';
				619	(*dest)++;
				620	return 0;
				621	}
				622	else {
				623	PyErr_Format(PyExc_ValueError,
				624	"UTF-8 encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	625	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	626	errors);
				627	return -1;
				628	}
				629	}
				630
				631	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				632	int size,
				633	const char *errors)
				634	{
				635	PyObject *v;
				636	char *p;
				637	char *q;
				638
				639	v = PyString_FromStringAndSize(NULL, 3 * size);
				640	if (v == NULL)
				641	return NULL;
				642	if (size == 0)
				643	goto done;
				644
				645	p = q = PyString_AS_STRING(v);
				646	while (size-- > 0) {
				647	Py_UNICODE ch = *s++;
				648	if (ch < 0x80)
				649	*p++ = (char) ch;
				650	else if (ch < 0x0800) {
				651	*p++ = 0xc0 \| (ch >> 6);
				652	*p++ = 0x80 \| (ch & 0x3f);
				653	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				654	/* These byte ranges are reserved for UTF-16 surrogate
				655	bytes which the Python implementation currently does
				656	not support. */
				657	printf("code range problem: U+%04x\n", ch);
				658	if (utf8_encoding_error(&s, &p, errors,
				659	"unsupported code range"))
				660	goto onError;
				661	} else {
				662	*p++ = 0xe0 \| (ch >> 12);
				663	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				664	*p++ = 0x80 \| (ch & 0x3f);
				665	}
				666	}
				667	*p = '\0';
				668	_PyString_Resize(&v, p - q);
				669
				670	done:
				671	return v;
				672
				673	onError:
				674	Py_DECREF(v);
				675	return NULL;
				676	}
				677
				678	/* Return a Python string holding the UTF-8 encoded value of the
				679	Unicode object.
				680
				681	The resulting string is cached in the Unicode object for subsequent
				682	usage by this function. The cached version is needed to implement
				683	the character buffer interface.
				684
				685	The refcount of the string is not incremented.
				686
				687	*/
				688
				689	static
				690	PyObject utf8_string(PyUnicodeObject self,
				691	const char *errors)
				692	{
				693	PyObject *v = self->utf8str;
				694
				695	if (v)
				696	return v;
				697	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
				698	PyUnicode_GET_SIZE(self),
				699	errors);
				700	if (v && errors == NULL)
				701	self->utf8str = v;
				702	return v;
				703	}
				704
				705	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				706	{
				707	PyObject *str;
				708
				709	if (!PyUnicode_Check(unicode)) {
				710	PyErr_BadArgument();
				711	return NULL;
				712	}
				713	str = utf8_string((PyUnicodeObject *)unicode, NULL);
				714	if (str == NULL)
				715	return NULL;
				716	Py_INCREF(str);
				717	return str;
				718	}
				719
				720	/* --- UTF-16 Codec ------------------------------------------------------- */
				721
				722	static
				723	int utf16_decoding_error(const Py_UNICODE **source,
				724	Py_UNICODE **dest,
				725	const char *errors,
				726	const char *details)
				727	{
				728	if ((errors == NULL) \|\|
				729	(strcmp(errors,"strict") == 0)) {
				730	PyErr_Format(PyExc_UnicodeError,
				731	"UTF-16 decoding error: %s",
				732	details);
				733	return -1;
				734	}
				735	else if (strcmp(errors,"ignore") == 0) {
				736	return 0;
				737	}
				738	else if (strcmp(errors,"replace") == 0) {
				739	if (dest) {
				740	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				741	(*dest)++;
				742	}
				743	return 0;
				744	}
				745	else {
				746	PyErr_Format(PyExc_ValueError,
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	747	"UTF-16 decoding error; unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	748	errors);
				749	return -1;
				750	}
				751	}
				752
				753	#define UTF16_ERROR(details) do { \
				754	if (utf16_decoding_error(&q, &p, errors, details)) \
				755	goto onError; \
				756	continue; \
				757	} while(0)
				758
				759	PyObject PyUnicode_DecodeUTF16(const char s,
				760	int size,
				761	const char *errors,
				762	int *byteorder)
				763	{
				764	PyUnicodeObject *unicode;
				765	Py_UNICODE *p;
				766	const Py_UNICODE q, e;
				767	int bo = 0;
				768
				769	/* size should be an even number */
				770	if (size % sizeof(Py_UNICODE) != 0) {
				771	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				772	return NULL;
				773	/* The remaining input chars are ignored if we fall through
				774	here... */
				775	}
				776
				777	/* Note: size will always be longer than the resulting Unicode
				778	character count */
				779	unicode = _PyUnicode_New(size);
				780	if (!unicode)
				781	return NULL;
				782	if (size == 0)
				783	return (PyObject *)unicode;
				784
				785	/* Unpack UTF-16 encoded data */
				786	p = unicode->str;
				787	q = (Py_UNICODE *)s;
				788	e = q + (size / sizeof(Py_UNICODE));
				789
				790	if (byteorder)
				791	bo = *byteorder;
				792
				793	while (q < e) {
				794	register Py_UNICODE ch = *q++;
				795
				796	/* Check for BOM marks (U+FEFF) in the input and adjust
				797	current byte order setting accordingly. Swap input
				798	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				799	!) */
				800	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				801	if (ch == 0xFEFF) {
				802	bo = -1;
				803	continue;
				804	} else if (ch == 0xFFFE) {
				805	bo = 1;
				806	continue;
				807	}
				808	if (bo == 1)
				809	ch = (ch >> 8) \| (ch << 8);
				810	#else
				811	if (ch == 0xFEFF) {
				812	bo = 1;
				813	continue;
				814	} else if (ch == 0xFFFE) {
				815	bo = -1;
				816	continue;
				817	}
				818	if (bo == -1)
				819	ch = (ch >> 8) \| (ch << 8);
				820	#endif
				821	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				822	*p++ = ch;
				823	continue;
				824	}
				825
				826	/* UTF-16 code pair: */
				827	if (q >= e)
				828	UTF16_ERROR("unexpected end of data");
				829	if (0xDC00 <= q && q <= 0xDFFF) {
				830	q++;
				831	if (0xD800 <= q && q <= 0xDBFF)
				832	/* This is valid data (a UTF-16 surrogate pair), but
				833	we are not able to store this information since our
				834	Py_UNICODE type only has 16 bits... this might
				835	change someday, even though it's unlikely. */
				836	UTF16_ERROR("code pairs are not supported");
				837	else
				838	continue;
				839	}
				840	UTF16_ERROR("illegal encoding");
				841	}
				842
				843	if (byteorder)
				844	*byteorder = bo;
				845
				846	/* Adjust length */
				847	if (_PyUnicode_Resize(unicode, p - unicode->str))
				848	goto onError;
				849
				850	return (PyObject *)unicode;
				851
				852	onError:
				853	Py_DECREF(unicode);
				854	return NULL;
				855	}
				856
				857	#undef UTF16_ERROR
				858
				859	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				860	int size,
				861	const char *errors,
				862	int byteorder)
				863	{
				864	PyObject *v;
				865	Py_UNICODE *p;
				866	char *q;
				867
				868	/* We don't create UTF-16 pairs... */
				869	v = PyString_FromStringAndSize(NULL,
				870	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				871	if (v == NULL)
				872	return NULL;
				873	if (size == 0)
				874	goto done;
				875
				876	q = PyString_AS_STRING(v);
				877	p = (Py_UNICODE *)q;
				878
				879	if (byteorder == 0)
				880	*p++ = 0xFEFF;
				881	if (byteorder == 0 \|\|
				882	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				883	byteorder == -1
				884	#else
				885	byteorder == 1
				886	#endif
				887	)
				888	memcpy(p, s, size * sizeof(Py_UNICODE));
				889	else
				890	while (size-- > 0) {
				891	Py_UNICODE ch = *s++;
				892	*p++ = (ch >> 8) \| (ch << 8);
				893	}
				894	done:
				895	return v;
				896	}
				897
				898	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				899	{
				900	if (!PyUnicode_Check(unicode)) {
				901	PyErr_BadArgument();
				902	return NULL;
				903	}
				904	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				905	PyUnicode_GET_SIZE(unicode),
				906	NULL,
				907	0);
				908	}
				909
				910	/* --- Unicode Escape Codec ----------------------------------------------- */
				911
				912	static
				913	int unicodeescape_decoding_error(const char **source,
				914	unsigned int *x,
				915	const char *errors,
				916	const char *details)
				917	{
				918	if ((errors == NULL) \|\|
				919	(strcmp(errors,"strict") == 0)) {
				920	PyErr_Format(PyExc_UnicodeError,
				921	"Unicode-Escape decoding error: %s",
				922	details);
				923	return -1;
				924	}
				925	else if (strcmp(errors,"ignore") == 0) {
				926	return 0;
				927	}
				928	else if (strcmp(errors,"replace") == 0) {
				929	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				930	return 0;
				931	}
				932	else {
				933	PyErr_Format(PyExc_ValueError,
				934	"Unicode-Escape decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	935	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	errors);
				937	return -1;
				938	}
				939	}
				940
				941	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				942	int size,
				943	const char *errors)
				944	{
				945	PyUnicodeObject *v;
				946	Py_UNICODE p = NULL, buf = NULL;
				947	const char *end;
				948
				949	/* Escaped strings will always be longer than the resulting
				950	Unicode string, so we start with size here and then reduce the
				951	length after conversion to the true value. */
				952	v = _PyUnicode_New(size);
				953	if (v == NULL)
				954	goto onError;
				955	if (size == 0)
				956	return (PyObject *)v;
				957	p = buf = PyUnicode_AS_UNICODE(v);
				958	end = s + size;
				959	while (s < end) {
				960	unsigned char c;
				961	unsigned int x;
				962	int i;
				963
				964	/* Non-escape characters are interpreted as Unicode ordinals */
				965	if (*s != '\\') {
				966	p++ = (unsigned char)s++;
				967	continue;
				968	}
				969
				970	/* \ - Escapes */
				971	s++;
				972	switch (*s++) {
				973
				974	/* \x escapes */
				975	case '\n': break;
				976	case '\\': *p++ = '\\'; break;
				977	case '\'': *p++ = '\''; break;
				978	case '\"': *p++ = '\"'; break;
				979	case 'b': *p++ = '\b'; break;
				980	case 'f': p++ = '\014'; break; / FF */
				981	case 't': *p++ = '\t'; break;
				982	case 'n': *p++ = '\n'; break;
				983	case 'r': *p++ = '\r'; break;
				984	case 'v': p++ = '\013'; break; / VT */
				985	case 'a': p++ = '\007'; break; / BEL, not classic C */
				986
				987	/* \OOO (octal) escapes */
				988	case '0': case '1': case '2': case '3':
				989	case '4': case '5': case '6': case '7':
				990	c = s[-1] - '0';
				991	if ('0' <= s && s <= '7') {
				992	c = (c<<3) + *s++ - '0';
				993	if ('0' <= s && s <= '7')
				994	c = (c<<3) + *s++ - '0';
				995	}
				996	*p++ = c;
				997	break;
				998
				999	/* \xXXXX escape with 0-4 hex digits */
				1000	case 'x':
				1001	x = 0;
				1002	c = (unsigned char)*s;
				1003	if (isxdigit(c)) {
				1004	do {
				1005	x = (x<<4) & ~0xF;
				1006	if ('0' <= c && c <= '9')
				1007	x += c - '0';
				1008	else if ('a' <= c && c <= 'f')
				1009	x += 10 + c - 'a';
				1010	else
				1011	x += 10 + c - 'A';
				1012	c = (unsigned char)*++s;
				1013	} while (isxdigit(c));
				1014	*p++ = x;
				1015	} else {
				1016	*p++ = '\\';
				1017	*p++ = (unsigned char)s[-1];
				1018	}
				1019	break;
				1020
				1021	/* \uXXXX with 4 hex digits */
				1022	case 'u':
				1023	for (x = 0, i = 0; i < 4; i++) {
				1024	c = (unsigned char)s[i];
				1025	if (!isxdigit(c)) {
				1026	if (unicodeescape_decoding_error(&s, &x, errors,
				1027	"truncated \\uXXXX"))
				1028	goto onError;
				1029	i++;
				1030	break;
				1031	}
				1032	x = (x<<4) & ~0xF;
				1033	if (c >= '0' && c <= '9')
				1034	x += c - '0';
				1035	else if (c >= 'a' && c <= 'f')
				1036	x += 10 + c - 'a';
				1037	else
				1038	x += 10 + c - 'A';
				1039	}
				1040	s += i;
				1041	*p++ = x;
				1042	break;
				1043
				1044	default:
				1045	*p++ = '\\';
				1046	*p++ = (unsigned char)s[-1];
				1047	break;
				1048	}
				1049	}
				1050	_PyUnicode_Resize(v, (int)(p - buf));
				1051	return (PyObject *)v;
				1052
				1053	onError:
				1054	Py_XDECREF(v);
				1055	return NULL;
				1056	}
				1057
				1058	/* Return a Unicode-Escape string version of the Unicode object.
				1059
				1060	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1061	appropriate.
				1062
				1063	*/
				1064
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1065	static const Py_UNICODE findchar(const Py_UNICODE s,
				1066	int size,
				1067	Py_UNICODE ch);
				1068
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1069	static
				1070	PyObject unicodeescape_string(const Py_UNICODE s,
				1071	int size,
				1072	int quotes)
				1073	{
				1074	PyObject *repr;
				1075	char *p;
				1076	char *q;
				1077
				1078	static const char *hexdigit = "0123456789ABCDEF";
				1079
				1080	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1081	if (repr == NULL)
				1082	return NULL;
				1083
				1084	p = q = PyString_AS_STRING(repr);
				1085
				1086	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	*p++ = 'u';
				1088	*p++ = (findchar(s, size, '\'') &&
				1089	!findchar(s, size, '"')) ? '"' : '\'';
				1090	}
				1091	while (size-- > 0) {
				1092	Py_UNICODE ch = *s++;
				1093	/* Escape quotes */
				1094	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1095	*p++ = '\\';
				1096	*p++ = (char) ch;
				1097	}
				1098	/* Map 16-bit characters to '\uxxxx' */
				1099	else if (ch >= 256) {
				1100	*p++ = '\\';
				1101	*p++ = 'u';
				1102	*p++ = hexdigit[(ch >> 12) & 0xf];
				1103	*p++ = hexdigit[(ch >> 8) & 0xf];
				1104	*p++ = hexdigit[(ch >> 4) & 0xf];
				1105	*p++ = hexdigit[ch & 15];
				1106	}
				1107	/* Map non-printable US ASCII to '\ooo' */
				1108	else if (ch < ' ' \|\| ch >= 128) {
				1109	*p++ = '\\';
				1110	*p++ = hexdigit[(ch >> 6) & 7];
				1111	*p++ = hexdigit[(ch >> 3) & 7];
				1112	*p++ = hexdigit[ch & 7];
				1113	}
				1114	/* Copy everything else as-is */
				1115	else
				1116	*p++ = (char) ch;
				1117	}
				1118	if (quotes)
				1119	*p++ = q[1];
				1120
				1121	*p = '\0';
				1122	_PyString_Resize(&repr, p - q);
				1123
				1124	return repr;
				1125	}
				1126
				1127	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1128	int size)
				1129	{
				1130	return unicodeescape_string(s, size, 0);
				1131	}
				1132
				1133	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1134	{
				1135	if (!PyUnicode_Check(unicode)) {
				1136	PyErr_BadArgument();
				1137	return NULL;
				1138	}
				1139	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1140	PyUnicode_GET_SIZE(unicode));
				1141	}
				1142
				1143	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1144
				1145	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1146	int size,
				1147	const char *errors)
				1148	{
				1149	PyUnicodeObject *v;
				1150	Py_UNICODE p, buf;
				1151	const char *end;
				1152	const char *bs;
				1153
				1154	/* Escaped strings will always be longer than the resulting
				1155	Unicode string, so we start with size here and then reduce the
				1156	length after conversion to the true value. */
				1157	v = _PyUnicode_New(size);
				1158	if (v == NULL)
				1159	goto onError;
				1160	if (size == 0)
				1161	return (PyObject *)v;
				1162	p = buf = PyUnicode_AS_UNICODE(v);
				1163	end = s + size;
				1164	while (s < end) {
				1165	unsigned char c;
				1166	unsigned int x;
				1167	int i;
				1168
				1169	/* Non-escape characters are interpreted as Unicode ordinals */
				1170	if (*s != '\\') {
				1171	p++ = (unsigned char)s++;
				1172	continue;
				1173	}
				1174
				1175	/* \u-escapes are only interpreted iff the number of leading
				1176	backslashes if odd */
				1177	bs = s;
				1178	for (;s < end;) {
				1179	if (*s != '\\')
				1180	break;
				1181	p++ = (unsigned char)s++;
				1182	}
				1183	if (((s - bs) & 1) == 0 \|\|
				1184	s >= end \|\|
				1185	*s != 'u') {
				1186	continue;
				1187	}
				1188	p--;
				1189	s++;
				1190
				1191	/* \uXXXX with 4 hex digits */
				1192	for (x = 0, i = 0; i < 4; i++) {
				1193	c = (unsigned char)s[i];
				1194	if (!isxdigit(c)) {
				1195	if (unicodeescape_decoding_error(&s, &x, errors,
				1196	"truncated \\uXXXX"))
				1197	goto onError;
				1198	i++;
				1199	break;
				1200	}
				1201	x = (x<<4) & ~0xF;
				1202	if (c >= '0' && c <= '9')
				1203	x += c - '0';
				1204	else if (c >= 'a' && c <= 'f')
				1205	x += 10 + c - 'a';
				1206	else
				1207	x += 10 + c - 'A';
				1208	}
				1209	s += i;
				1210	*p++ = x;
				1211	}
				1212	_PyUnicode_Resize(v, (int)(p - buf));
				1213	return (PyObject *)v;
				1214
				1215	onError:
				1216	Py_XDECREF(v);
				1217	return NULL;
				1218	}
				1219
				1220	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1221	int size)
				1222	{
				1223	PyObject *repr;
				1224	char *p;
				1225	char *q;
				1226
				1227	static const char *hexdigit = "0123456789ABCDEF";
				1228
				1229	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1230	if (repr == NULL)
				1231	return NULL;
				1232
				1233	p = q = PyString_AS_STRING(repr);
				1234	while (size-- > 0) {
				1235	Py_UNICODE ch = *s++;
				1236	/* Map 16-bit characters to '\uxxxx' */
				1237	if (ch >= 256) {
				1238	*p++ = '\\';
				1239	*p++ = 'u';
				1240	*p++ = hexdigit[(ch >> 12) & 0xf];
				1241	*p++ = hexdigit[(ch >> 8) & 0xf];
				1242	*p++ = hexdigit[(ch >> 4) & 0xf];
				1243	*p++ = hexdigit[ch & 15];
				1244	}
				1245	/* Copy everything else as-is */
				1246	else
				1247	*p++ = (char) ch;
				1248	}
				1249	*p = '\0';
				1250	_PyString_Resize(&repr, p - q);
				1251
				1252	return repr;
				1253	}
				1254
				1255	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1256	{
				1257	if (!PyUnicode_Check(unicode)) {
				1258	PyErr_BadArgument();
				1259	return NULL;
				1260	}
				1261	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1262	PyUnicode_GET_SIZE(unicode));
				1263	}
				1264
				1265	/* --- Latin-1 Codec ------------------------------------------------------ */
				1266
				1267	PyObject PyUnicode_DecodeLatin1(const char s,
				1268	int size,
				1269	const char *errors)
				1270	{
				1271	PyUnicodeObject *v;
				1272	Py_UNICODE *p;
				1273
				1274	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1275	v = _PyUnicode_New(size);
				1276	if (v == NULL)
				1277	goto onError;
				1278	if (size == 0)
				1279	return (PyObject *)v;
				1280	p = PyUnicode_AS_UNICODE(v);
				1281	while (size-- > 0)
				1282	p++ = (unsigned char)s++;
				1283	return (PyObject *)v;
				1284
				1285	onError:
				1286	Py_XDECREF(v);
				1287	return NULL;
				1288	}
				1289
				1290	static
				1291	int latin1_encoding_error(const Py_UNICODE **source,
				1292	char **dest,
				1293	const char *errors,
				1294	const char *details)
				1295	{
				1296	if ((errors == NULL) \|\|
				1297	(strcmp(errors,"strict") == 0)) {
				1298	PyErr_Format(PyExc_UnicodeError,
				1299	"Latin-1 encoding error: %s",
				1300	details);
				1301	return -1;
				1302	}
				1303	else if (strcmp(errors,"ignore") == 0) {
				1304	return 0;
				1305	}
				1306	else if (strcmp(errors,"replace") == 0) {
				1307	**dest = '?';
				1308	return 0;
				1309	}
				1310	else {
				1311	PyErr_Format(PyExc_ValueError,
				1312	"Latin-1 encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1313	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1314	errors);
				1315	return -1;
				1316	}
				1317	}
				1318
				1319	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1320	int size,
				1321	const char *errors)
				1322	{
				1323	PyObject *repr;
				1324	char *s;
				1325	repr = PyString_FromStringAndSize(NULL, size);
				1326	if (repr == NULL)
				1327	return NULL;
				1328
				1329	s = PyString_AS_STRING(repr);
				1330	while (size-- > 0) {
				1331	Py_UNICODE ch = *p++;
				1332	if (ch >= 256) {
				1333	if (latin1_encoding_error(&p, &s, errors,
				1334	"ordinal not in range(256)"))
				1335	goto onError;
				1336	}
				1337	else
				1338	*s++ = (char)ch;
				1339	}
				1340	return repr;
				1341
				1342	onError:
				1343	Py_DECREF(repr);
				1344	return NULL;
				1345	}
				1346
				1347	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1348	{
				1349	if (!PyUnicode_Check(unicode)) {
				1350	PyErr_BadArgument();
				1351	return NULL;
				1352	}
				1353	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1354	PyUnicode_GET_SIZE(unicode),
				1355	NULL);
				1356	}
				1357
				1358	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1359
				1360	static
				1361	int ascii_decoding_error(const char **source,
				1362	Py_UNICODE **dest,
				1363	const char *errors,
				1364	const char *details)
				1365	{
				1366	if ((errors == NULL) \|\|
				1367	(strcmp(errors,"strict") == 0)) {
				1368	PyErr_Format(PyExc_UnicodeError,
				1369	"ASCII decoding error: %s",
				1370	details);
				1371	return -1;
				1372	}
				1373	else if (strcmp(errors,"ignore") == 0) {
				1374	return 0;
				1375	}
				1376	else if (strcmp(errors,"replace") == 0) {
				1377	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1378	(*dest)++;
				1379	return 0;
				1380	}
				1381	else {
				1382	PyErr_Format(PyExc_ValueError,
				1383	"ASCII decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1384	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1385	errors);
				1386	return -1;
				1387	}
				1388	}
				1389
				1390	PyObject PyUnicode_DecodeASCII(const char s,
				1391	int size,
				1392	const char *errors)
				1393	{
				1394	PyUnicodeObject *v;
				1395	Py_UNICODE *p;
				1396
				1397	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1398	v = _PyUnicode_New(size);
				1399	if (v == NULL)
				1400	goto onError;
				1401	if (size == 0)
				1402	return (PyObject *)v;
				1403	p = PyUnicode_AS_UNICODE(v);
				1404	while (size-- > 0) {
				1405	register unsigned char c;
				1406
				1407	c = (unsigned char)*s++;
				1408	if (c < 128)
				1409	*p++ = c;
				1410	else if (ascii_decoding_error(&s, &p, errors,
				1411	"ordinal not in range(128)"))
				1412	goto onError;
				1413	}
				1414	if (p - PyUnicode_AS_UNICODE(v) < size)
				1415	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1416	return (PyObject *)v;
				1417
				1418	onError:
				1419	Py_XDECREF(v);
				1420	return NULL;
				1421	}
				1422
				1423	static
				1424	int ascii_encoding_error(const Py_UNICODE **source,
				1425	char **dest,
				1426	const char *errors,
				1427	const char *details)
				1428	{
				1429	if ((errors == NULL) \|\|
				1430	(strcmp(errors,"strict") == 0)) {
				1431	PyErr_Format(PyExc_UnicodeError,
				1432	"ASCII encoding error: %s",
				1433	details);
				1434	return -1;
				1435	}
				1436	else if (strcmp(errors,"ignore") == 0) {
				1437	return 0;
				1438	}
				1439	else if (strcmp(errors,"replace") == 0) {
				1440	**dest = '?';
				1441	return 0;
				1442	}
				1443	else {
				1444	PyErr_Format(PyExc_ValueError,
				1445	"ASCII encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1446	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1447	errors);
				1448	return -1;
				1449	}
				1450	}
				1451
				1452	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1453	int size,
				1454	const char *errors)
				1455	{
				1456	PyObject *repr;
				1457	char *s;
				1458	repr = PyString_FromStringAndSize(NULL, size);
				1459	if (repr == NULL)
				1460	return NULL;
				1461
				1462	s = PyString_AS_STRING(repr);
				1463	while (size-- > 0) {
				1464	Py_UNICODE ch = *p++;
				1465	if (ch >= 128) {
				1466	if (ascii_encoding_error(&p, &s, errors,
				1467	"ordinal not in range(128)"))
				1468	goto onError;
				1469	}
				1470	else
				1471	*s++ = (char)ch;
				1472	}
				1473	return repr;
				1474
				1475	onError:
				1476	Py_DECREF(repr);
				1477	return NULL;
				1478	}
				1479
				1480	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1481	{
				1482	if (!PyUnicode_Check(unicode)) {
				1483	PyErr_BadArgument();
				1484	return NULL;
				1485	}
				1486	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1487	PyUnicode_GET_SIZE(unicode),
				1488	NULL);
				1489	}
				1490
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1491	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1492
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1493	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1494
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1495	PyObject PyUnicode_DecodeMBCS(const char s,
				1496	int size,
				1497	const char *errors)
				1498	{
				1499	PyUnicodeObject *v;
				1500	Py_UNICODE *p;
				1501
				1502	/* First get the size of the result */
				1503	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
				1504	if (usize==0)
				1505	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1506
				1507	v = _PyUnicode_New(usize);
				1508	if (v == NULL)
				1509	return NULL;
				1510	if (usize == 0)
				1511	return (PyObject *)v;
				1512	p = PyUnicode_AS_UNICODE(v);
				1513	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1514	Py_DECREF(v);
				1515	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1516	}
				1517
				1518	return (PyObject *)v;
				1519	}
				1520
				1521	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1522	int size,
				1523	const char *errors)
				1524	{
				1525	PyObject *repr;
				1526	char *s;
				1527
				1528	/* First get the size of the result */
				1529	DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
				1530	if (mbcssize==0)
				1531	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1532
				1533	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1534	if (repr == NULL)
				1535	return NULL;
				1536	if (mbcssize==0)
				1537	return repr;
				1538
				1539	/* Do the conversion */
				1540	s = PyString_AS_STRING(repr);
				1541	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1542	Py_DECREF(repr);
				1543	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1544	}
				1545	return repr;
				1546	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1547
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1548	#endif /* MS_WIN32 */
				1549
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1550	/* --- Character Mapping Codec -------------------------------------------- */
				1551
				1552	static
				1553	int charmap_decoding_error(const char **source,
				1554	Py_UNICODE **dest,
				1555	const char *errors,
				1556	const char *details)
				1557	{
				1558	if ((errors == NULL) \|\|
				1559	(strcmp(errors,"strict") == 0)) {
				1560	PyErr_Format(PyExc_UnicodeError,
				1561	"charmap decoding error: %s",
				1562	details);
				1563	return -1;
				1564	}
				1565	else if (strcmp(errors,"ignore") == 0) {
				1566	return 0;
				1567	}
				1568	else if (strcmp(errors,"replace") == 0) {
				1569	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1570	(*dest)++;
				1571	return 0;
				1572	}
				1573	else {
				1574	PyErr_Format(PyExc_ValueError,
				1575	"charmap decoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1576	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1577	errors);
				1578	return -1;
				1579	}
				1580	}
				1581
				1582	PyObject PyUnicode_DecodeCharmap(const char s,
				1583	int size,
				1584	PyObject *mapping,
				1585	const char *errors)
				1586	{
				1587	PyUnicodeObject *v;
				1588	Py_UNICODE *p;
				1589
				1590	/* Default to Latin-1 */
				1591	if (mapping == NULL)
				1592	return PyUnicode_DecodeLatin1(s, size, errors);
				1593
				1594	v = _PyUnicode_New(size);
				1595	if (v == NULL)
				1596	goto onError;
				1597	if (size == 0)
				1598	return (PyObject *)v;
				1599	p = PyUnicode_AS_UNICODE(v);
				1600	while (size-- > 0) {
				1601	unsigned char ch = *s++;
				1602	PyObject w, x;
				1603
				1604	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1605	w = PyInt_FromLong((long)ch);
				1606	if (w == NULL)
				1607	goto onError;
				1608	x = PyObject_GetItem(mapping, w);
				1609	Py_DECREF(w);
				1610	if (x == NULL) {
				1611	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1612	/* No mapping found: default to Latin-1 mapping */
				1613	PyErr_Clear();
				1614	*p++ = (Py_UNICODE)ch;
				1615	continue;
				1616	}
				1617	goto onError;
				1618	}
				1619
				1620	/* Apply mapping */
				1621	if (PyInt_Check(x)) {
				1622	int value = PyInt_AS_LONG(x);
				1623	if (value < 0 \|\| value > 65535) {
				1624	PyErr_SetString(PyExc_TypeError,
				1625	"character mapping must be in range(65336)");
				1626	Py_DECREF(x);
				1627	goto onError;
				1628	}
				1629	*p++ = (Py_UNICODE)value;
				1630	}
				1631	else if (x == Py_None) {
				1632	/* undefined mapping */
				1633	if (charmap_decoding_error(&s, &p, errors,
				1634	"character maps to <undefined>")) {
				1635	Py_DECREF(x);
				1636	goto onError;
				1637	}
				1638	}
				1639	else if (PyUnicode_Check(x)) {
				1640	if (PyUnicode_GET_SIZE(x) != 1) {
				1641	/* 1-n mapping */
				1642	PyErr_SetString(PyExc_NotImplementedError,
				1643	"1-n mappings are currently not implemented");
				1644	Py_DECREF(x);
				1645	goto onError;
				1646	}
				1647	p++ = PyUnicode_AS_UNICODE(x);
				1648	}
				1649	else {
				1650	/* wrong return value */
				1651	PyErr_SetString(PyExc_TypeError,
				1652	"character mapping must return integer, None or unicode");
				1653	Py_DECREF(x);
				1654	goto onError;
				1655	}
				1656	Py_DECREF(x);
				1657	}
				1658	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1659	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1660	goto onError;
				1661	return (PyObject *)v;
				1662
				1663	onError:
				1664	Py_XDECREF(v);
				1665	return NULL;
				1666	}
				1667
				1668	static
				1669	int charmap_encoding_error(const Py_UNICODE **source,
				1670	char **dest,
				1671	const char *errors,
				1672	const char *details)
				1673	{
				1674	if ((errors == NULL) \|\|
				1675	(strcmp(errors,"strict") == 0)) {
				1676	PyErr_Format(PyExc_UnicodeError,
				1677	"charmap encoding error: %s",
				1678	details);
				1679	return -1;
				1680	}
				1681	else if (strcmp(errors,"ignore") == 0) {
				1682	return 0;
				1683	}
				1684	else if (strcmp(errors,"replace") == 0) {
				1685	**dest = '?';
				1686	(*dest)++;
				1687	return 0;
				1688	}
				1689	else {
				1690	PyErr_Format(PyExc_ValueError,
				1691	"charmap encoding error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1692	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1693	errors);
				1694	return -1;
				1695	}
				1696	}
				1697
				1698	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1699	int size,
				1700	PyObject *mapping,
				1701	const char *errors)
				1702	{
				1703	PyObject *v;
				1704	char *s;
				1705
				1706	/* Default to Latin-1 */
				1707	if (mapping == NULL)
				1708	return PyUnicode_EncodeLatin1(p, size, errors);
				1709
				1710	v = PyString_FromStringAndSize(NULL, size);
				1711	if (v == NULL)
				1712	return NULL;
				1713	s = PyString_AS_STRING(v);
				1714	while (size-- > 0) {
				1715	Py_UNICODE ch = *p++;
				1716	PyObject w, x;
				1717
				1718	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1719	w = PyInt_FromLong((long)ch);
				1720	if (w == NULL)
				1721	goto onError;
				1722	x = PyObject_GetItem(mapping, w);
				1723	Py_DECREF(w);
				1724	if (x == NULL) {
				1725	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1726	/* No mapping found: default to Latin-1 mapping if possible */
				1727	PyErr_Clear();
				1728	if (ch < 256) {
				1729	*s++ = (char)ch;
				1730	continue;
				1731	}
				1732	else if (!charmap_encoding_error(&p, &s, errors,
				1733	"missing character mapping"))
				1734	continue;
				1735	}
				1736	goto onError;
				1737	}
				1738
				1739	/* Apply mapping */
				1740	if (PyInt_Check(x)) {
				1741	int value = PyInt_AS_LONG(x);
				1742	if (value < 0 \|\| value > 255) {
				1743	PyErr_SetString(PyExc_TypeError,
				1744	"character mapping must be in range(256)");
				1745	Py_DECREF(x);
				1746	goto onError;
				1747	}
				1748	*s++ = (char)value;
				1749	}
				1750	else if (x == Py_None) {
				1751	/* undefined mapping */
				1752	if (charmap_encoding_error(&p, &s, errors,
				1753	"character maps to <undefined>")) {
				1754	Py_DECREF(x);
				1755	goto onError;
				1756	}
				1757	}
				1758	else if (PyString_Check(x)) {
				1759	if (PyString_GET_SIZE(x) != 1) {
				1760	/* 1-n mapping */
				1761	PyErr_SetString(PyExc_NotImplementedError,
				1762	"1-n mappings are currently not implemented");
				1763	Py_DECREF(x);
				1764	goto onError;
				1765	}
				1766	s++ = PyString_AS_STRING(x);
				1767	}
				1768	else {
				1769	/* wrong return value */
				1770	PyErr_SetString(PyExc_TypeError,
				1771	"character mapping must return integer, None or unicode");
				1772	Py_DECREF(x);
				1773	goto onError;
				1774	}
				1775	Py_DECREF(x);
				1776	}
				1777	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1778	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1779	goto onError;
				1780	return v;
				1781
				1782	onError:
				1783	Py_DECREF(v);
				1784	return NULL;
				1785	}
				1786
				1787	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1788	PyObject *mapping)
				1789	{
				1790	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1791	PyErr_BadArgument();
				1792	return NULL;
				1793	}
				1794	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1795	PyUnicode_GET_SIZE(unicode),
				1796	mapping,
				1797	NULL);
				1798	}
				1799
				1800	static
				1801	int translate_error(const Py_UNICODE **source,
				1802	Py_UNICODE **dest,
				1803	const char *errors,
				1804	const char *details)
				1805	{
				1806	if ((errors == NULL) \|\|
				1807	(strcmp(errors,"strict") == 0)) {
				1808	PyErr_Format(PyExc_UnicodeError,
				1809	"translate error: %s",
				1810	details);
				1811	return -1;
				1812	}
				1813	else if (strcmp(errors,"ignore") == 0) {
				1814	return 0;
				1815	}
				1816	else if (strcmp(errors,"replace") == 0) {
				1817	**dest = '?';
				1818	(*dest)++;
				1819	return 0;
				1820	}
				1821	else {
				1822	PyErr_Format(PyExc_ValueError,
				1823	"translate error; "
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1824	"unknown error handling code: %s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1825	errors);
				1826	return -1;
				1827	}
				1828	}
				1829
				1830	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1831	int size,
				1832	PyObject *mapping,
				1833	const char *errors)
				1834	{
				1835	PyUnicodeObject *v;
				1836	Py_UNICODE *p;
				1837
				1838	if (mapping == NULL) {
				1839	PyErr_BadArgument();
				1840	return NULL;
				1841	}
				1842
				1843	/* Output will never be longer than input */
				1844	v = _PyUnicode_New(size);
				1845	if (v == NULL)
				1846	goto onError;
				1847	if (size == 0)
				1848	goto done;
				1849	p = PyUnicode_AS_UNICODE(v);
				1850	while (size-- > 0) {
				1851	Py_UNICODE ch = *s++;
				1852	PyObject w, x;
				1853
				1854	/* Get mapping */
				1855	w = PyInt_FromLong(ch);
				1856	if (w == NULL)
				1857	goto onError;
				1858	x = PyObject_GetItem(mapping, w);
				1859	Py_DECREF(w);
				1860	if (x == NULL) {
				1861	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1862	/* No mapping found: default to 1-1 mapping */
				1863	PyErr_Clear();
				1864	*p++ = ch;
				1865	continue;
				1866	}
				1867	goto onError;
				1868	}
				1869
				1870	/* Apply mapping */
				1871	if (PyInt_Check(x))
				1872	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1873	else if (x == Py_None) {
				1874	/* undefined mapping */
				1875	if (translate_error(&s, &p, errors,
				1876	"character maps to <undefined>")) {
				1877	Py_DECREF(x);
				1878	goto onError;
				1879	}
				1880	}
				1881	else if (PyUnicode_Check(x)) {
				1882	if (PyUnicode_GET_SIZE(x) != 1) {
				1883	/* 1-n mapping */
				1884	PyErr_SetString(PyExc_NotImplementedError,
				1885	"1-n mappings are currently not implemented");
				1886	Py_DECREF(x);
				1887	goto onError;
				1888	}
				1889	p++ = PyUnicode_AS_UNICODE(x);
				1890	}
				1891	else {
				1892	/* wrong return value */
				1893	PyErr_SetString(PyExc_TypeError,
				1894	"translate mapping must return integer, None or unicode");
				1895	Py_DECREF(x);
				1896	goto onError;
				1897	}
				1898	Py_DECREF(x);
				1899	}
				1900	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1901	_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
				1902
				1903	done:
				1904	return (PyObject *)v;
				1905
				1906	onError:
				1907	Py_XDECREF(v);
				1908	return NULL;
				1909	}
				1910
				1911	PyObject PyUnicode_Translate(PyObject str,
				1912	PyObject *mapping,
				1913	const char *errors)
				1914	{
				1915	PyObject *result;
				1916
				1917	str = PyUnicode_FromObject(str);
				1918	if (str == NULL)
				1919	goto onError;
				1920	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				1921	PyUnicode_GET_SIZE(str),
				1922	mapping,
				1923	errors);
				1924	Py_DECREF(str);
				1925	return result;
				1926
				1927	onError:
				1928	Py_XDECREF(str);
				1929	return NULL;
				1930	}
				1931
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	1932	/* --- Decimal Encoder ---------------------------------------------------- */
				1933
				1934	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				1935	int length,
				1936	char *output,
				1937	const char *errors)
				1938	{
				1939	Py_UNICODE p, end;
				1940
				1941	if (output == NULL) {
				1942	PyErr_BadArgument();
				1943	return -1;
				1944	}
				1945
				1946	p = s;
				1947	end = s + length;
				1948	while (p < end) {
				1949	register Py_UNICODE ch = *p++;
				1950	int decimal;
				1951
				1952	if (Py_UNICODE_ISSPACE(ch)) {
				1953	*output++ = ' ';
				1954	continue;
				1955	}
				1956	decimal = Py_UNICODE_TODECIMAL(ch);
				1957	if (decimal >= 0) {
				1958	*output++ = '0' + decimal;
				1959	continue;
				1960	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	1961	if (0 < ch && ch < 256) {
Guido van Rossum	34888ed	2000-04-05 21:29:50 +0000	[diff] [blame]	1962	*output++ = (char) ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	1963	continue;
				1964	}
				1965	/* All other characters are considered invalid */
				1966	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				1967	PyErr_SetString(PyExc_ValueError,
				1968	"invalid decimal Unicode string");
				1969	goto onError;
				1970	}
				1971	else if (strcmp(errors, "ignore") == 0)
				1972	continue;
				1973	else if (strcmp(errors, "replace") == 0) {
				1974	*output++ = '?';
				1975	continue;
				1976	}
				1977	}
				1978	/* 0-terminate the output string */
				1979	*output++ = '\0';
				1980	return 0;
				1981
				1982	onError:
				1983	return -1;
				1984	}
				1985
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1986	/* --- Helpers ------------------------------------------------------------ */
				1987
				1988	static
				1989	int count(PyUnicodeObject *self,
				1990	int start,
				1991	int end,
				1992	PyUnicodeObject *substring)
				1993	{
				1994	int count = 0;
				1995
				1996	end -= substring->length;
				1997
				1998	while (start <= end)
				1999	if (Py_UNICODE_MATCH(self, start, substring)) {
				2000	count++;
				2001	start += substring->length;
				2002	} else
				2003	start++;
				2004
				2005	return count;
				2006	}
				2007
				2008	int PyUnicode_Count(PyObject *str,
				2009	PyObject *substr,
				2010	int start,
				2011	int end)
				2012	{
				2013	int result;
				2014
				2015	str = PyUnicode_FromObject(str);
				2016	if (str == NULL)
				2017	return -1;
				2018	substr = PyUnicode_FromObject(substr);
				2019	if (substr == NULL) {
				2020	Py_DECREF(substr);
				2021	return -1;
				2022	}
				2023
				2024	result = count((PyUnicodeObject *)str,
				2025	start, end,
				2026	(PyUnicodeObject *)substr);
				2027
				2028	Py_DECREF(str);
				2029	Py_DECREF(substr);
				2030	return result;
				2031	}
				2032
				2033	static
				2034	int findstring(PyUnicodeObject *self,
				2035	PyUnicodeObject *substring,
				2036	int start,
				2037	int end,
				2038	int direction)
				2039	{
				2040	if (start < 0)
				2041	start += self->length;
				2042	if (start < 0)
				2043	start = 0;
				2044
				2045	if (substring->length == 0)
				2046	return start;
				2047
				2048	if (end > self->length)
				2049	end = self->length;
				2050	if (end < 0)
				2051	end += self->length;
				2052	if (end < 0)
				2053	end = 0;
				2054
				2055	end -= substring->length;
				2056
				2057	if (direction < 0) {
				2058	for (; end >= start; end--)
				2059	if (Py_UNICODE_MATCH(self, end, substring))
				2060	return end;
				2061	} else {
				2062	for (; start <= end; start++)
				2063	if (Py_UNICODE_MATCH(self, start, substring))
				2064	return start;
				2065	}
				2066
				2067	return -1;
				2068	}
				2069
				2070	int PyUnicode_Find(PyObject *str,
				2071	PyObject *substr,
				2072	int start,
				2073	int end,
				2074	int direction)
				2075	{
				2076	int result;
				2077
				2078	str = PyUnicode_FromObject(str);
				2079	if (str == NULL)
				2080	return -1;
				2081	substr = PyUnicode_FromObject(substr);
				2082	if (substr == NULL) {
				2083	Py_DECREF(substr);
				2084	return -1;
				2085	}
				2086
				2087	result = findstring((PyUnicodeObject *)str,
				2088	(PyUnicodeObject *)substr,
				2089	start, end, direction);
				2090	Py_DECREF(str);
				2091	Py_DECREF(substr);
				2092	return result;
				2093	}
				2094
				2095	static
				2096	int tailmatch(PyUnicodeObject *self,
				2097	PyUnicodeObject *substring,
				2098	int start,
				2099	int end,
				2100	int direction)
				2101	{
				2102	if (start < 0)
				2103	start += self->length;
				2104	if (start < 0)
				2105	start = 0;
				2106
				2107	if (substring->length == 0)
				2108	return 1;
				2109
				2110	if (end > self->length)
				2111	end = self->length;
				2112	if (end < 0)
				2113	end += self->length;
				2114	if (end < 0)
				2115	end = 0;
				2116
				2117	end -= substring->length;
				2118	if (end < start)
				2119	return 0;
				2120
				2121	if (direction > 0) {
				2122	if (Py_UNICODE_MATCH(self, end, substring))
				2123	return 1;
				2124	} else {
				2125	if (Py_UNICODE_MATCH(self, start, substring))
				2126	return 1;
				2127	}
				2128
				2129	return 0;
				2130	}
				2131
				2132	int PyUnicode_Tailmatch(PyObject *str,
				2133	PyObject *substr,
				2134	int start,
				2135	int end,
				2136	int direction)
				2137	{
				2138	int result;
				2139
				2140	str = PyUnicode_FromObject(str);
				2141	if (str == NULL)
				2142	return -1;
				2143	substr = PyUnicode_FromObject(substr);
				2144	if (substr == NULL) {
				2145	Py_DECREF(substr);
				2146	return -1;
				2147	}
				2148
				2149	result = tailmatch((PyUnicodeObject *)str,
				2150	(PyUnicodeObject *)substr,
				2151	start, end, direction);
				2152	Py_DECREF(str);
				2153	Py_DECREF(substr);
				2154	return result;
				2155	}
				2156
				2157	static
				2158	const Py_UNICODE findchar(const Py_UNICODE s,
				2159	int size,
				2160	Py_UNICODE ch)
				2161	{
				2162	/* like wcschr, but doesn't stop at NULL characters */
				2163
				2164	while (size-- > 0) {
				2165	if (*s == ch)
				2166	return s;
				2167	s++;
				2168	}
				2169
				2170	return NULL;
				2171	}
				2172
				2173	/* Apply fixfct filter to the Unicode object self and return a
				2174	reference to the modified object */
				2175
				2176	static
				2177	PyObject fixup(PyUnicodeObject self,
				2178	int (fixfct)(PyUnicodeObject s))
				2179	{
				2180
				2181	PyUnicodeObject *u;
				2182
				2183	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2184	self->length);
				2185	if (u == NULL)
				2186	return NULL;
				2187	if (!fixfct(u)) {
				2188	/* fixfct should return TRUE if it modified the buffer. If
				2189	FALSE, return a reference to the original buffer instead
				2190	(to save space, not time) */
				2191	Py_INCREF(self);
				2192	Py_DECREF(u);
				2193	return (PyObject*) self;
				2194	}
				2195	return (PyObject*) u;
				2196	}
				2197
				2198	static
				2199	int fixupper(PyUnicodeObject *self)
				2200	{
				2201	int len = self->length;
				2202	Py_UNICODE *s = self->str;
				2203	int status = 0;
				2204
				2205	while (len-- > 0) {
				2206	register Py_UNICODE ch;
				2207
				2208	ch = Py_UNICODE_TOUPPER(*s);
				2209	if (ch != *s) {
				2210	status = 1;
				2211	*s = ch;
				2212	}
				2213	s++;
				2214	}
				2215
				2216	return status;
				2217	}
				2218
				2219	static
				2220	int fixlower(PyUnicodeObject *self)
				2221	{
				2222	int len = self->length;
				2223	Py_UNICODE *s = self->str;
				2224	int status = 0;
				2225
				2226	while (len-- > 0) {
				2227	register Py_UNICODE ch;
				2228
				2229	ch = Py_UNICODE_TOLOWER(*s);
				2230	if (ch != *s) {
				2231	status = 1;
				2232	*s = ch;
				2233	}
				2234	s++;
				2235	}
				2236
				2237	return status;
				2238	}
				2239
				2240	static
				2241	int fixswapcase(PyUnicodeObject *self)
				2242	{
				2243	int len = self->length;
				2244	Py_UNICODE *s = self->str;
				2245	int status = 0;
				2246
				2247	while (len-- > 0) {
				2248	if (Py_UNICODE_ISUPPER(*s)) {
				2249	s = Py_UNICODE_TOLOWER(s);
				2250	status = 1;
				2251	} else if (Py_UNICODE_ISLOWER(*s)) {
				2252	s = Py_UNICODE_TOUPPER(s);
				2253	status = 1;
				2254	}
				2255	s++;
				2256	}
				2257
				2258	return status;
				2259	}
				2260
				2261	static
				2262	int fixcapitalize(PyUnicodeObject *self)
				2263	{
				2264	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2265	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2266	return 1;
				2267	}
				2268	return 0;
				2269	}
				2270
				2271	static
				2272	int fixtitle(PyUnicodeObject *self)
				2273	{
				2274	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2275	register Py_UNICODE *e;
				2276	int previous_is_cased;
				2277
				2278	/* Shortcut for single character strings */
				2279	if (PyUnicode_GET_SIZE(self) == 1) {
				2280	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2281	if (*p != ch) {
				2282	*p = ch;
				2283	return 1;
				2284	}
				2285	else
				2286	return 0;
				2287	}
				2288
				2289	e = p + PyUnicode_GET_SIZE(self);
				2290	previous_is_cased = 0;
				2291	for (; p < e; p++) {
				2292	register const Py_UNICODE ch = *p;
				2293
				2294	if (previous_is_cased)
				2295	*p = Py_UNICODE_TOLOWER(ch);
				2296	else
				2297	*p = Py_UNICODE_TOTITLE(ch);
				2298
				2299	if (Py_UNICODE_ISLOWER(ch) \|\|
				2300	Py_UNICODE_ISUPPER(ch) \|\|
				2301	Py_UNICODE_ISTITLE(ch))
				2302	previous_is_cased = 1;
				2303	else
				2304	previous_is_cased = 0;
				2305	}
				2306	return 1;
				2307	}
				2308
				2309	PyObject PyUnicode_Join(PyObject separator,
				2310	PyObject *seq)
				2311	{
				2312	Py_UNICODE *sep;
				2313	int seplen;
				2314	PyUnicodeObject *res = NULL;
				2315	int reslen = 0;
				2316	Py_UNICODE *p;
				2317	int seqlen = 0;
				2318	int sz = 100;
				2319	int i;
				2320
				2321	seqlen = PySequence_Length(seq);
				2322	if (seqlen < 0 && PyErr_Occurred())
				2323	return NULL;
				2324
				2325	if (separator == NULL) {
				2326	Py_UNICODE blank = ' ';
				2327	sep = &blank;
				2328	seplen = 1;
				2329	}
				2330	else {
				2331	separator = PyUnicode_FromObject(separator);
				2332	if (separator == NULL)
				2333	return NULL;
				2334	sep = PyUnicode_AS_UNICODE(separator);
				2335	seplen = PyUnicode_GET_SIZE(separator);
				2336	}
				2337
				2338	res = _PyUnicode_New(sz);
				2339	if (res == NULL)
				2340	goto onError;
				2341	p = PyUnicode_AS_UNICODE(res);
				2342	reslen = 0;
				2343
				2344	for (i = 0; i < seqlen; i++) {
				2345	int itemlen;
				2346	PyObject *item;
				2347
				2348	item = PySequence_GetItem(seq, i);
				2349	if (item == NULL)
				2350	goto onError;
				2351	if (!PyUnicode_Check(item)) {
				2352	PyObject *v;
				2353	v = PyUnicode_FromObject(item);
				2354	Py_DECREF(item);
				2355	item = v;
				2356	if (item == NULL)
				2357	goto onError;
				2358	}
				2359	itemlen = PyUnicode_GET_SIZE(item);
				2360	while (reslen + itemlen + seplen >= sz) {
				2361	if (_PyUnicode_Resize(res, sz*2))
				2362	goto onError;
				2363	sz *= 2;
				2364	p = PyUnicode_AS_UNICODE(res) + reslen;
				2365	}
				2366	if (i > 0) {
				2367	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2368	p += seplen;
				2369	reslen += seplen;
				2370	}
				2371	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2372	p += itemlen;
				2373	reslen += itemlen;
				2374	Py_DECREF(item);
				2375	}
				2376	if (_PyUnicode_Resize(res, reslen))
				2377	goto onError;
				2378
				2379	Py_XDECREF(separator);
				2380	return (PyObject *)res;
				2381
				2382	onError:
				2383	Py_XDECREF(separator);
				2384	Py_DECREF(res);
				2385	return NULL;
				2386	}
				2387
				2388	static
				2389	PyUnicodeObject pad(PyUnicodeObject self,
				2390	int left,
				2391	int right,
				2392	Py_UNICODE fill)
				2393	{
				2394	PyUnicodeObject *u;
				2395
				2396	if (left < 0)
				2397	left = 0;
				2398	if (right < 0)
				2399	right = 0;
				2400
				2401	if (left == 0 && right == 0) {
				2402	Py_INCREF(self);
				2403	return self;
				2404	}
				2405
				2406	u = _PyUnicode_New(left + self->length + right);
				2407	if (u) {
				2408	if (left)
				2409	Py_UNICODE_FILL(u->str, fill, left);
				2410	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2411	if (right)
				2412	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2413	}
				2414
				2415	return u;
				2416	}
				2417
				2418	#define SPLIT_APPEND(data, left, right) \
				2419	str = PyUnicode_FromUnicode(data + left, right - left); \
				2420	if (!str) \
				2421	goto onError; \
				2422	if (PyList_Append(list, str)) { \
				2423	Py_DECREF(str); \
				2424	goto onError; \
				2425	} \
				2426	else \
				2427	Py_DECREF(str);
				2428
				2429	static
				2430	PyObject split_whitespace(PyUnicodeObject self,
				2431	PyObject *list,
				2432	int maxcount)
				2433	{
				2434	register int i;
				2435	register int j;
				2436	int len = self->length;
				2437	PyObject *str;
				2438
				2439	for (i = j = 0; i < len; ) {
				2440	/* find a token */
				2441	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2442	i++;
				2443	j = i;
				2444	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2445	i++;
				2446	if (j < i) {
				2447	if (maxcount-- <= 0)
				2448	break;
				2449	SPLIT_APPEND(self->str, j, i);
				2450	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2451	i++;
				2452	j = i;
				2453	}
				2454	}
				2455	if (j < len) {
				2456	SPLIT_APPEND(self->str, j, len);
				2457	}
				2458	return list;
				2459
				2460	onError:
				2461	Py_DECREF(list);
				2462	return NULL;
				2463	}
				2464
				2465	PyObject PyUnicode_Splitlines(PyObject string,
				2466	int maxcount)
				2467	{
				2468	register int i;
				2469	register int j;
				2470	int len;
				2471	PyObject *list;
				2472	PyObject *str;
				2473	Py_UNICODE *data;
				2474
				2475	string = PyUnicode_FromObject(string);
				2476	if (string == NULL)
				2477	return NULL;
				2478	data = PyUnicode_AS_UNICODE(string);
				2479	len = PyUnicode_GET_SIZE(string);
				2480
				2481	if (maxcount < 0)
				2482	maxcount = INT_MAX;
				2483
				2484	list = PyList_New(0);
				2485	if (!list)
				2486	goto onError;
				2487
				2488	for (i = j = 0; i < len; ) {
				2489	/* Find a line and append it */
				2490	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2491	i++;
				2492	if (maxcount-- <= 0)
				2493	break;
				2494	SPLIT_APPEND(data, j, i);
				2495
				2496	/* Skip the line break reading CRLF as one line break */
				2497	if (i < len) {
				2498	if (data[i] == '\r' && i + 1 < len &&
				2499	data[i+1] == '\n')
				2500	i += 2;
				2501	else
				2502	i++;
				2503	}
				2504	j = i;
				2505	}
				2506	if (j < len) {
				2507	SPLIT_APPEND(data, j, len);
				2508	}
				2509
				2510	Py_DECREF(string);
				2511	return list;
				2512
				2513	onError:
				2514	Py_DECREF(list);
				2515	Py_DECREF(string);
				2516	return NULL;
				2517	}
				2518
				2519	static
				2520	PyObject split_char(PyUnicodeObject self,
				2521	PyObject *list,
				2522	Py_UNICODE ch,
				2523	int maxcount)
				2524	{
				2525	register int i;
				2526	register int j;
				2527	int len = self->length;
				2528	PyObject *str;
				2529
				2530	for (i = j = 0; i < len; ) {
				2531	if (self->str[i] == ch) {
				2532	if (maxcount-- <= 0)
				2533	break;
				2534	SPLIT_APPEND(self->str, j, i);
				2535	i = j = i + 1;
				2536	} else
				2537	i++;
				2538	}
				2539	if (j <= len) {
				2540	SPLIT_APPEND(self->str, j, len);
				2541	}
				2542	return list;
				2543
				2544	onError:
				2545	Py_DECREF(list);
				2546	return NULL;
				2547	}
				2548
				2549	static
				2550	PyObject split_substring(PyUnicodeObject self,
				2551	PyObject *list,
				2552	PyUnicodeObject *substring,
				2553	int maxcount)
				2554	{
				2555	register int i;
				2556	register int j;
				2557	int len = self->length;
				2558	int sublen = substring->length;
				2559	PyObject *str;
				2560
				2561	for (i = j = 0; i < len - sublen; ) {
				2562	if (Py_UNICODE_MATCH(self, i, substring)) {
				2563	if (maxcount-- <= 0)
				2564	break;
				2565	SPLIT_APPEND(self->str, j, i);
				2566	i = j = i + sublen;
				2567	} else
				2568	i++;
				2569	}
				2570	if (j <= len) {
				2571	SPLIT_APPEND(self->str, j, len);
				2572	}
				2573	return list;
				2574
				2575	onError:
				2576	Py_DECREF(list);
				2577	return NULL;
				2578	}
				2579
				2580	#undef SPLIT_APPEND
				2581
				2582	static
				2583	PyObject split(PyUnicodeObject self,
				2584	PyUnicodeObject *substring,
				2585	int maxcount)
				2586	{
				2587	PyObject *list;
				2588
				2589	if (maxcount < 0)
				2590	maxcount = INT_MAX;
				2591
				2592	list = PyList_New(0);
				2593	if (!list)
				2594	return NULL;
				2595
				2596	if (substring == NULL)
				2597	return split_whitespace(self,list,maxcount);
				2598
				2599	else if (substring->length == 1)
				2600	return split_char(self,list,substring->str[0],maxcount);
				2601
				2602	else if (substring->length == 0) {
				2603	Py_DECREF(list);
				2604	PyErr_SetString(PyExc_ValueError, "empty separator");
				2605	return NULL;
				2606	}
				2607	else
				2608	return split_substring(self,list,substring,maxcount);
				2609	}
				2610
				2611	static
				2612	PyObject strip(PyUnicodeObject self,
				2613	int left,
				2614	int right)
				2615	{
				2616	Py_UNICODE *p = self->str;
				2617	int start = 0;
				2618	int end = self->length;
				2619
				2620	if (left)
				2621	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2622	start++;
				2623
				2624	if (right)
				2625	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2626	end--;
				2627
				2628	if (start == 0 && end == self->length) {
				2629	/* couldn't strip anything off, return original string */
				2630	Py_INCREF(self);
				2631	return (PyObject*) self;
				2632	}
				2633
				2634	return (PyObject*) PyUnicode_FromUnicode(
				2635	self->str + start,
				2636	end - start
				2637	);
				2638	}
				2639
				2640	static
				2641	PyObject replace(PyUnicodeObject self,
				2642	PyUnicodeObject *str1,
				2643	PyUnicodeObject *str2,
				2644	int maxcount)
				2645	{
				2646	PyUnicodeObject *u;
				2647
				2648	if (maxcount < 0)
				2649	maxcount = INT_MAX;
				2650
				2651	if (str1->length == 1 && str2->length == 1) {
				2652	int i;
				2653
				2654	/* replace characters */
				2655	if (!findchar(self->str, self->length, str1->str[0])) {
				2656	/* nothing to replace, return original string */
				2657	Py_INCREF(self);
				2658	u = self;
				2659	} else {
				2660	Py_UNICODE u1 = str1->str[0];
				2661	Py_UNICODE u2 = str2->str[0];
				2662
				2663	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2664	self->str,
				2665	self->length
				2666	);
				2667	if (u)
				2668	for (i = 0; i < u->length; i++)
				2669	if (u->str[i] == u1) {
				2670	if (--maxcount < 0)
				2671	break;
				2672	u->str[i] = u2;
				2673	}
				2674	}
				2675
				2676	} else {
				2677	int n, i;
				2678	Py_UNICODE *p;
				2679
				2680	/* replace strings */
				2681	n = count(self, 0, self->length, str1);
				2682	if (n > maxcount)
				2683	n = maxcount;
				2684	if (n == 0) {
				2685	/* nothing to replace, return original string */
				2686	Py_INCREF(self);
				2687	u = self;
				2688	} else {
				2689	u = _PyUnicode_New(
				2690	self->length + n * (str2->length - str1->length));
				2691	if (u) {
				2692	i = 0;
				2693	p = u->str;
				2694	while (i <= self->length - str1->length)
				2695	if (Py_UNICODE_MATCH(self, i, str1)) {
				2696	/* replace string segment */
				2697	Py_UNICODE_COPY(p, str2->str, str2->length);
				2698	p += str2->length;
				2699	i += str1->length;
				2700	if (--n <= 0) {
				2701	/* copy remaining part */
				2702	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2703	break;
				2704	}
				2705	} else
				2706	*p++ = self->str[i++];
				2707	}
				2708	}
				2709	}
				2710
				2711	return (PyObject *) u;
				2712	}
				2713
				2714	/* --- Unicode Object Methods --------------------------------------------- */
				2715
				2716	static char title__doc__[] =
				2717	"S.title() -> unicode\n\
				2718	\n\
				2719	Return a titlecased version of S, i.e. words start with title case\n\
				2720	characters, all remaining cased characters have lower case.";
				2721
				2722	static PyObject*
				2723	unicode_title(PyUnicodeObject self, PyObject args)
				2724	{
				2725	if (!PyArg_NoArgs(args))
				2726	return NULL;
				2727	return fixup(self, fixtitle);
				2728	}
				2729
				2730	static char capitalize__doc__[] =
				2731	"S.capitalize() -> unicode\n\
				2732	\n\
				2733	Return a capitalized version of S, i.e. make the first character\n\
				2734	have upper case.";
				2735
				2736	static PyObject*
				2737	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2738	{
				2739	if (!PyArg_NoArgs(args))
				2740	return NULL;
				2741	return fixup(self, fixcapitalize);
				2742	}
				2743
				2744	#if 0
				2745	static char capwords__doc__[] =
				2746	"S.capwords() -> unicode\n\
				2747	\n\
				2748	Apply .capitalize() to all words in S and return the result with\n\
				2749	normalized whitespace (all whitespace strings are replaced by ' ').";
				2750
				2751	static PyObject*
				2752	unicode_capwords(PyUnicodeObject self, PyObject args)
				2753	{
				2754	PyObject *list;
				2755	PyObject *item;
				2756	int i;
				2757
				2758	if (!PyArg_NoArgs(args))
				2759	return NULL;
				2760
				2761	/* Split into words */
				2762	list = split(self, NULL, -1);
				2763	if (!list)
				2764	return NULL;
				2765
				2766	/* Capitalize each word */
				2767	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2768	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2769	fixcapitalize);
				2770	if (item == NULL)
				2771	goto onError;
				2772	Py_DECREF(PyList_GET_ITEM(list, i));
				2773	PyList_SET_ITEM(list, i, item);
				2774	}
				2775
				2776	/* Join the words to form a new string */
				2777	item = PyUnicode_Join(NULL, list);
				2778
				2779	onError:
				2780	Py_DECREF(list);
				2781	return (PyObject *)item;
				2782	}
				2783	#endif
				2784
				2785	static char center__doc__[] =
				2786	"S.center(width) -> unicode\n\
				2787	\n\
				2788	Return S centered in a Unicode string of length width. Padding is done\n\
				2789	using spaces.";
				2790
				2791	static PyObject *
				2792	unicode_center(PyUnicodeObject self, PyObject args)
				2793	{
				2794	int marg, left;
				2795	int width;
				2796
				2797	if (!PyArg_ParseTuple(args, "i:center", &width))
				2798	return NULL;
				2799
				2800	if (self->length >= width) {
				2801	Py_INCREF(self);
				2802	return (PyObject*) self;
				2803	}
				2804
				2805	marg = width - self->length;
				2806	left = marg / 2 + (marg & width & 1);
				2807
				2808	return (PyObject*) pad(self, left, marg - left, ' ');
				2809	}
				2810
				2811	static int
				2812	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2813	{
				2814	int len1, len2;
				2815	Py_UNICODE *s1 = str1->str;
				2816	Py_UNICODE *s2 = str2->str;
				2817
				2818	len1 = str1->length;
				2819	len2 = str2->length;
				2820
				2821	while (len1 > 0 && len2 > 0) {
				2822	int cmp = (s1++) - (s2++);
				2823	if (cmp)
				2824	/* This should make Christian happy! */
				2825	return (cmp < 0) ? -1 : (cmp != 0);
				2826	len1--, len2--;
				2827	}
				2828
				2829	return (len1 < len2) ? -1 : (len1 != len2);
				2830	}
				2831
				2832	int PyUnicode_Compare(PyObject *left,
				2833	PyObject *right)
				2834	{
				2835	PyUnicodeObject u = NULL, v = NULL;
				2836	int result;
				2837
				2838	/* Coerce the two arguments */
				2839	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2840	if (u == NULL)
				2841	goto onError;
				2842	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2843	if (v == NULL)
				2844	goto onError;
				2845
				2846	/* Shortcut for emtpy or interned objects */
				2847	if (v == u) {
				2848	Py_DECREF(u);
				2849	Py_DECREF(v);
				2850	return 0;
				2851	}
				2852
				2853	result = unicode_compare(u, v);
				2854
				2855	Py_DECREF(u);
				2856	Py_DECREF(v);
				2857	return result;
				2858
				2859	onError:
				2860	Py_XDECREF(u);
				2861	Py_XDECREF(v);
				2862	return -1;
				2863	}
				2864
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2865	int PyUnicode_Contains(PyObject *container,
				2866	PyObject *element)
				2867	{
				2868	PyUnicodeObject u = NULL, v = NULL;
				2869	int result;
				2870	register const Py_UNICODE p, e;
				2871	register Py_UNICODE ch;
				2872
				2873	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2874	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2875	if (v == NULL)
				2876	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2877	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2878	if (u == NULL) {
				2879	Py_DECREF(v);
				2880	goto onError;
				2881	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2882
				2883	/* Check v in u */
				2884	if (PyUnicode_GET_SIZE(v) != 1) {
				2885	PyErr_SetString(PyExc_TypeError,
				2886	"string member test needs char left operand");
				2887	goto onError;
				2888	}
				2889	ch = *PyUnicode_AS_UNICODE(v);
				2890	p = PyUnicode_AS_UNICODE(u);
				2891	e = p + PyUnicode_GET_SIZE(u);
				2892	result = 0;
				2893	while (p < e) {
				2894	if (*p++ == ch) {
				2895	result = 1;
				2896	break;
				2897	}
				2898	}
				2899
				2900	Py_DECREF(u);
				2901	Py_DECREF(v);
				2902	return result;
				2903
				2904	onError:
				2905	Py_XDECREF(u);
				2906	Py_XDECREF(v);
				2907	return -1;
				2908	}
				2909
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2910	/* Concat to string or Unicode object giving a new Unicode object. */
				2911
				2912	PyObject PyUnicode_Concat(PyObject left,
				2913	PyObject *right)
				2914	{
				2915	PyUnicodeObject u = NULL, v = NULL, *w;
				2916
				2917	/* Coerce the two arguments */
				2918	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2919	if (u == NULL)
				2920	goto onError;
				2921	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2922	if (v == NULL)
				2923	goto onError;
				2924
				2925	/* Shortcuts */
				2926	if (v == unicode_empty) {
				2927	Py_DECREF(v);
				2928	return (PyObject *)u;
				2929	}
				2930	if (u == unicode_empty) {
				2931	Py_DECREF(u);
				2932	return (PyObject *)v;
				2933	}
				2934
				2935	/* Concat the two Unicode strings */
				2936	w = _PyUnicode_New(u->length + v->length);
				2937	if (w == NULL)
				2938	goto onError;
				2939	Py_UNICODE_COPY(w->str, u->str, u->length);
				2940	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				2941
				2942	Py_DECREF(u);
				2943	Py_DECREF(v);
				2944	return (PyObject *)w;
				2945
				2946	onError:
				2947	Py_XDECREF(u);
				2948	Py_XDECREF(v);
				2949	return NULL;
				2950	}
				2951
				2952	static char count__doc__[] =
				2953	"S.count(sub[, start[, end]]) -> int\n\
				2954	\n\
				2955	Return the number of occurrences of substring sub in Unicode string\n\
				2956	S[start:end]. Optional arguments start and end are\n\
				2957	interpreted as in slice notation.";
				2958
				2959	static PyObject *
				2960	unicode_count(PyUnicodeObject self, PyObject args)
				2961	{
				2962	PyUnicodeObject *substring;
				2963	int start = 0;
				2964	int end = INT_MAX;
				2965	PyObject *result;
				2966
				2967	if (!PyArg_ParseTuple(args, "O\|ii:count", &substring, &start, &end))
				2968	return NULL;
				2969
				2970	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				2971	(PyObject *)substring);
				2972	if (substring == NULL)
				2973	return NULL;
				2974
				2975	if (substring->length == 0) {
				2976	Py_DECREF(substring);
				2977	return PyInt_FromLong((long) 0);
				2978	}
				2979
				2980	if (start < 0)
				2981	start += self->length;
				2982	if (start < 0)
				2983	start = 0;
				2984	if (end > self->length)
				2985	end = self->length;
				2986	if (end < 0)
				2987	end += self->length;
				2988	if (end < 0)
				2989	end = 0;
				2990
				2991	result = PyInt_FromLong((long) count(self, start, end, substring));
				2992
				2993	Py_DECREF(substring);
				2994	return result;
				2995	}
				2996
				2997	static char encode__doc__[] =
				2998	"S.encode([encoding[,errors]]) -> string\n\
				2999	\n\
				3000	Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
				3001	errors may be given to set a different error handling scheme. Default\n\
				3002	is 'strict' meaning that encoding errors raise a ValueError. Other\n\
				3003	possible values are 'ignore' and 'replace'.";
				3004
				3005	static PyObject *
				3006	unicode_encode(PyUnicodeObject self, PyObject args)
				3007	{
				3008	char *encoding = NULL;
				3009	char *errors = NULL;
				3010	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3011	return NULL;
				3012	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3013	}
				3014
				3015	static char expandtabs__doc__[] =
				3016	"S.expandtabs([tabsize]) -> unicode\n\
				3017	\n\
				3018	Return a copy of S where all tab characters are expanded using spaces.\n\
				3019	If tabsize is not given, a tab size of 8 characters is assumed.";
				3020
				3021	static PyObject*
				3022	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3023	{
				3024	Py_UNICODE *e;
				3025	Py_UNICODE *p;
				3026	Py_UNICODE *q;
				3027	int i, j;
				3028	PyUnicodeObject *u;
				3029	int tabsize = 8;
				3030
				3031	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3032	return NULL;
				3033
				3034	/* First pass: determine size of ouput string */
				3035	i = j = 0;
				3036	e = self->str + self->length;
				3037	for (p = self->str; p < e; p++)
				3038	if (*p == '\t') {
				3039	if (tabsize > 0)
				3040	j += tabsize - (j % tabsize);
				3041	}
				3042	else {
				3043	j++;
				3044	if (p == '\n' \|\| p == '\r') {
				3045	i += j;
				3046	j = 0;
				3047	}
				3048	}
				3049
				3050	/* Second pass: create output string and fill it */
				3051	u = _PyUnicode_New(i + j);
				3052	if (!u)
				3053	return NULL;
				3054
				3055	j = 0;
				3056	q = u->str;
				3057
				3058	for (p = self->str; p < e; p++)
				3059	if (*p == '\t') {
				3060	if (tabsize > 0) {
				3061	i = tabsize - (j % tabsize);
				3062	j += i;
				3063	while (i--)
				3064	*q++ = ' ';
				3065	}
				3066	}
				3067	else {
				3068	j++;
				3069	q++ = p;
				3070	if (p == '\n' \|\| p == '\r')
				3071	j = 0;
				3072	}
				3073
				3074	return (PyObject*) u;
				3075	}
				3076
				3077	static char find__doc__[] =
				3078	"S.find(sub [,start [,end]]) -> int\n\
				3079	\n\
				3080	Return the lowest index in S where substring sub is found,\n\
				3081	such that sub is contained within s[start,end]. Optional\n\
				3082	arguments start and end are interpreted as in slice notation.\n\
				3083	\n\
				3084	Return -1 on failure.";
				3085
				3086	static PyObject *
				3087	unicode_find(PyUnicodeObject self, PyObject args)
				3088	{
				3089	PyUnicodeObject *substring;
				3090	int start = 0;
				3091	int end = INT_MAX;
				3092	PyObject *result;
				3093
				3094	if (!PyArg_ParseTuple(args, "O\|ii:find", &substring, &start, &end))
				3095	return NULL;
				3096	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3097	(PyObject *)substring);
				3098	if (substring == NULL)
				3099	return NULL;
				3100
				3101	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3102
				3103	Py_DECREF(substring);
				3104	return result;
				3105	}
				3106
				3107	static PyObject *
				3108	unicode_getitem(PyUnicodeObject *self, int index)
				3109	{
				3110	if (index < 0 \|\| index >= self->length) {
				3111	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3112	return NULL;
				3113	}
				3114
				3115	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3116	}
				3117
				3118	static long
				3119	unicode_hash(PyUnicodeObject *self)
				3120	{
				3121	long hash;
				3122	PyObject *utf8;
				3123
				3124	/* Since Unicode objects compare equal to their UTF-8 string
				3125	counterparts, they should also use the UTF-8 strings as basis
				3126	for their hash value. This is needed to assure that strings and
				3127	Unicode objects behave in the same way as dictionary
				3128	keys. Unfortunately, this costs some performance and also some
				3129	memory if the cached UTF-8 representation is not used later
				3130	on. */
				3131	if (self->hash != -1)
				3132	return self->hash;
				3133	utf8 = utf8_string(self, NULL);
				3134	if (utf8 == NULL)
				3135	return -1;
				3136	hash = PyObject_Hash(utf8);
				3137	if (hash == -1)
				3138	return -1;
				3139	self->hash = hash;
				3140	return hash;
				3141	}
				3142
				3143	static char index__doc__[] =
				3144	"S.index(sub [,start [,end]]) -> int\n\
				3145	\n\
				3146	Like S.find() but raise ValueError when the substring is not found.";
				3147
				3148	static PyObject *
				3149	unicode_index(PyUnicodeObject self, PyObject args)
				3150	{
				3151	int result;
				3152	PyUnicodeObject *substring;
				3153	int start = 0;
				3154	int end = INT_MAX;
				3155
				3156	if (!PyArg_ParseTuple(args, "O\|ii:index", &substring, &start, &end))
				3157	return NULL;
				3158
				3159	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3160	(PyObject *)substring);
				3161	if (substring == NULL)
				3162	return NULL;
				3163
				3164	result = findstring(self, substring, start, end, 1);
				3165
				3166	Py_DECREF(substring);
				3167	if (result < 0) {
				3168	PyErr_SetString(PyExc_ValueError, "substring not found");
				3169	return NULL;
				3170	}
				3171	return PyInt_FromLong(result);
				3172	}
				3173
				3174	static char islower__doc__[] =
				3175	"S.islower() -> int\n\
				3176	\n\
				3177	Return 1 if all cased characters in S are lowercase and there is\n\
				3178	at least one cased character in S, 0 otherwise.";
				3179
				3180	static PyObject*
				3181	unicode_islower(PyUnicodeObject self, PyObject args)
				3182	{
				3183	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3184	register const Py_UNICODE *e;
				3185	int cased;
				3186
				3187	if (!PyArg_NoArgs(args))
				3188	return NULL;
				3189
				3190	/* Shortcut for single character strings */
				3191	if (PyUnicode_GET_SIZE(self) == 1)
				3192	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3193
				3194	e = p + PyUnicode_GET_SIZE(self);
				3195	cased = 0;
				3196	for (; p < e; p++) {
				3197	register const Py_UNICODE ch = *p;
				3198
				3199	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3200	return PyInt_FromLong(0);
				3201	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3202	cased = 1;
				3203	}
				3204	return PyInt_FromLong(cased);
				3205	}
				3206
				3207	static char isupper__doc__[] =
				3208	"S.isupper() -> int\n\
				3209	\n\
				3210	Return 1 if all cased characters in S are uppercase and there is\n\
				3211	at least one cased character in S, 0 otherwise.";
				3212
				3213	static PyObject*
				3214	unicode_isupper(PyUnicodeObject self, PyObject args)
				3215	{
				3216	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3217	register const Py_UNICODE *e;
				3218	int cased;
				3219
				3220	if (!PyArg_NoArgs(args))
				3221	return NULL;
				3222
				3223	/* Shortcut for single character strings */
				3224	if (PyUnicode_GET_SIZE(self) == 1)
				3225	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3226
				3227	e = p + PyUnicode_GET_SIZE(self);
				3228	cased = 0;
				3229	for (; p < e; p++) {
				3230	register const Py_UNICODE ch = *p;
				3231
				3232	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3233	return PyInt_FromLong(0);
				3234	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3235	cased = 1;
				3236	}
				3237	return PyInt_FromLong(cased);
				3238	}
				3239
				3240	static char istitle__doc__[] =
				3241	"S.istitle() -> int\n\
				3242	\n\
				3243	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3244	may only follow uncased characters and lowercase characters only cased\n\
				3245	ones. Return 0 otherwise.";
				3246
				3247	static PyObject*
				3248	unicode_istitle(PyUnicodeObject self, PyObject args)
				3249	{
				3250	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3251	register const Py_UNICODE *e;
				3252	int cased, previous_is_cased;
				3253
				3254	if (!PyArg_NoArgs(args))
				3255	return NULL;
				3256
				3257	/* Shortcut for single character strings */
				3258	if (PyUnicode_GET_SIZE(self) == 1)
				3259	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3260	(Py_UNICODE_ISUPPER(*p) != 0));
				3261
				3262	e = p + PyUnicode_GET_SIZE(self);
				3263	cased = 0;
				3264	previous_is_cased = 0;
				3265	for (; p < e; p++) {
				3266	register const Py_UNICODE ch = *p;
				3267
				3268	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3269	if (previous_is_cased)
				3270	return PyInt_FromLong(0);
				3271	previous_is_cased = 1;
				3272	cased = 1;
				3273	}
				3274	else if (Py_UNICODE_ISLOWER(ch)) {
				3275	if (!previous_is_cased)
				3276	return PyInt_FromLong(0);
				3277	previous_is_cased = 1;
				3278	cased = 1;
				3279	}
				3280	else
				3281	previous_is_cased = 0;
				3282	}
				3283	return PyInt_FromLong(cased);
				3284	}
				3285
				3286	static char isspace__doc__[] =
				3287	"S.isspace() -> int\n\
				3288	\n\
				3289	Return 1 if there are only whitespace characters in S,\n\
				3290	0 otherwise.";
				3291
				3292	static PyObject*
				3293	unicode_isspace(PyUnicodeObject self, PyObject args)
				3294	{
				3295	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3296	register const Py_UNICODE *e;
				3297
				3298	if (!PyArg_NoArgs(args))
				3299	return NULL;
				3300
				3301	/* Shortcut for single character strings */
				3302	if (PyUnicode_GET_SIZE(self) == 1 &&
				3303	Py_UNICODE_ISSPACE(*p))
				3304	return PyInt_FromLong(1);
				3305
				3306	e = p + PyUnicode_GET_SIZE(self);
				3307	for (; p < e; p++) {
				3308	if (!Py_UNICODE_ISSPACE(*p))
				3309	return PyInt_FromLong(0);
				3310	}
				3311	return PyInt_FromLong(1);
				3312	}
				3313
				3314	static char isdecimal__doc__[] =
				3315	"S.isdecimal() -> int\n\
				3316	\n\
				3317	Return 1 if there are only decimal characters in S,\n\
				3318	0 otherwise.";
				3319
				3320	static PyObject*
				3321	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3322	{
				3323	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3324	register const Py_UNICODE *e;
				3325
				3326	if (!PyArg_NoArgs(args))
				3327	return NULL;
				3328
				3329	/* Shortcut for single character strings */
				3330	if (PyUnicode_GET_SIZE(self) == 1 &&
				3331	Py_UNICODE_ISDECIMAL(*p))
				3332	return PyInt_FromLong(1);
				3333
				3334	e = p + PyUnicode_GET_SIZE(self);
				3335	for (; p < e; p++) {
				3336	if (!Py_UNICODE_ISDECIMAL(*p))
				3337	return PyInt_FromLong(0);
				3338	}
				3339	return PyInt_FromLong(1);
				3340	}
				3341
				3342	static char isdigit__doc__[] =
				3343	"S.isdigit() -> int\n\
				3344	\n\
				3345	Return 1 if there are only digit characters in S,\n\
				3346	0 otherwise.";
				3347
				3348	static PyObject*
				3349	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3350	{
				3351	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3352	register const Py_UNICODE *e;
				3353
				3354	if (!PyArg_NoArgs(args))
				3355	return NULL;
				3356
				3357	/* Shortcut for single character strings */
				3358	if (PyUnicode_GET_SIZE(self) == 1 &&
				3359	Py_UNICODE_ISDIGIT(*p))
				3360	return PyInt_FromLong(1);
				3361
				3362	e = p + PyUnicode_GET_SIZE(self);
				3363	for (; p < e; p++) {
				3364	if (!Py_UNICODE_ISDIGIT(*p))
				3365	return PyInt_FromLong(0);
				3366	}
				3367	return PyInt_FromLong(1);
				3368	}
				3369
				3370	static char isnumeric__doc__[] =
				3371	"S.isnumeric() -> int\n\
				3372	\n\
				3373	Return 1 if there are only numeric characters in S,\n\
				3374	0 otherwise.";
				3375
				3376	static PyObject*
				3377	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3378	{
				3379	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3380	register const Py_UNICODE *e;
				3381
				3382	if (!PyArg_NoArgs(args))
				3383	return NULL;
				3384
				3385	/* Shortcut for single character strings */
				3386	if (PyUnicode_GET_SIZE(self) == 1 &&
				3387	Py_UNICODE_ISNUMERIC(*p))
				3388	return PyInt_FromLong(1);
				3389
				3390	e = p + PyUnicode_GET_SIZE(self);
				3391	for (; p < e; p++) {
				3392	if (!Py_UNICODE_ISNUMERIC(*p))
				3393	return PyInt_FromLong(0);
				3394	}
				3395	return PyInt_FromLong(1);
				3396	}
				3397
				3398	static char join__doc__[] =
				3399	"S.join(sequence) -> unicode\n\
				3400	\n\
				3401	Return a string which is the concatenation of the strings in the\n\
				3402	sequence. The separator between elements is S.";
				3403
				3404	static PyObject*
				3405	unicode_join(PyUnicodeObject self, PyObject args)
				3406	{
				3407	PyObject *data;
				3408	if (!PyArg_ParseTuple(args, "O:join", &data))
				3409	return NULL;
				3410
				3411	return PyUnicode_Join((PyObject *)self, data);
				3412	}
				3413
				3414	static int
				3415	unicode_length(PyUnicodeObject *self)
				3416	{
				3417	return self->length;
				3418	}
				3419
				3420	static char ljust__doc__[] =
				3421	"S.ljust(width) -> unicode\n\
				3422	\n\
				3423	Return S left justified in a Unicode string of length width. Padding is\n\
				3424	done using spaces.";
				3425
				3426	static PyObject *
				3427	unicode_ljust(PyUnicodeObject self, PyObject args)
				3428	{
				3429	int width;
				3430	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3431	return NULL;
				3432
				3433	if (self->length >= width) {
				3434	Py_INCREF(self);
				3435	return (PyObject*) self;
				3436	}
				3437
				3438	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3439	}
				3440
				3441	static char lower__doc__[] =
				3442	"S.lower() -> unicode\n\
				3443	\n\
				3444	Return a copy of the string S converted to lowercase.";
				3445
				3446	static PyObject*
				3447	unicode_lower(PyUnicodeObject self, PyObject args)
				3448	{
				3449	if (!PyArg_NoArgs(args))
				3450	return NULL;
				3451	return fixup(self, fixlower);
				3452	}
				3453
				3454	static char lstrip__doc__[] =
				3455	"S.lstrip() -> unicode\n\
				3456	\n\
				3457	Return a copy of the string S with leading whitespace removed.";
				3458
				3459	static PyObject *
				3460	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3461	{
				3462	if (!PyArg_NoArgs(args))
				3463	return NULL;
				3464	return strip(self, 1, 0);
				3465	}
				3466
				3467	static PyObject*
				3468	unicode_repeat(PyUnicodeObject *str, int len)
				3469	{
				3470	PyUnicodeObject *u;
				3471	Py_UNICODE *p;
				3472
				3473	if (len < 0)
				3474	len = 0;
				3475
				3476	if (len == 1) {
				3477	/* no repeat, return original string */
				3478	Py_INCREF(str);
				3479	return (PyObject*) str;
				3480	}
				3481
				3482	u = _PyUnicode_New(len * str->length);
				3483	if (!u)
				3484	return NULL;
				3485
				3486	p = u->str;
				3487
				3488	while (len-- > 0) {
				3489	Py_UNICODE_COPY(p, str->str, str->length);
				3490	p += str->length;
				3491	}
				3492
				3493	return (PyObject*) u;
				3494	}
				3495
				3496	PyObject PyUnicode_Replace(PyObject obj,
				3497	PyObject *subobj,
				3498	PyObject *replobj,
				3499	int maxcount)
				3500	{
				3501	PyObject *self;
				3502	PyObject *str1;
				3503	PyObject *str2;
				3504	PyObject *result;
				3505
				3506	self = PyUnicode_FromObject(obj);
				3507	if (self == NULL)
				3508	return NULL;
				3509	str1 = PyUnicode_FromObject(subobj);
				3510	if (str1 == NULL) {
				3511	Py_DECREF(self);
				3512	return NULL;
				3513	}
				3514	str2 = PyUnicode_FromObject(replobj);
				3515	if (str2 == NULL) {
				3516	Py_DECREF(self);
				3517	Py_DECREF(str1);
				3518	return NULL;
				3519	}
				3520	result = replace((PyUnicodeObject *)self,
				3521	(PyUnicodeObject *)str1,
				3522	(PyUnicodeObject *)str2,
				3523	maxcount);
				3524	Py_DECREF(self);
				3525	Py_DECREF(str1);
				3526	Py_DECREF(str2);
				3527	return result;
				3528	}
				3529
				3530	static char replace__doc__[] =
				3531	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3532	\n\
				3533	Return a copy of S with all occurrences of substring\n\
				3534	old replaced by new. If the optional argument maxsplit is\n\
				3535	given, only the first maxsplit occurrences are replaced.";
				3536
				3537	static PyObject*
				3538	unicode_replace(PyUnicodeObject self, PyObject args)
				3539	{
				3540	PyUnicodeObject *str1;
				3541	PyUnicodeObject *str2;
				3542	int maxcount = -1;
				3543	PyObject *result;
				3544
				3545	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3546	return NULL;
				3547	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3548	if (str1 == NULL)
				3549	return NULL;
				3550	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3551	if (str2 == NULL)
				3552	return NULL;
				3553
				3554	result = replace(self, str1, str2, maxcount);
				3555
				3556	Py_DECREF(str1);
				3557	Py_DECREF(str2);
				3558	return result;
				3559	}
				3560
				3561	static
				3562	PyObject unicode_repr(PyObject unicode)
				3563	{
				3564	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3565	PyUnicode_GET_SIZE(unicode),
				3566	1);
				3567	}
				3568
				3569	static char rfind__doc__[] =
				3570	"S.rfind(sub [,start [,end]]) -> int\n\
				3571	\n\
				3572	Return the highest index in S where substring sub is found,\n\
				3573	such that sub is contained within s[start,end]. Optional\n\
				3574	arguments start and end are interpreted as in slice notation.\n\
				3575	\n\
				3576	Return -1 on failure.";
				3577
				3578	static PyObject *
				3579	unicode_rfind(PyUnicodeObject self, PyObject args)
				3580	{
				3581	PyUnicodeObject *substring;
				3582	int start = 0;
				3583	int end = INT_MAX;
				3584	PyObject *result;
				3585
				3586	if (!PyArg_ParseTuple(args, "O\|ii:rfind", &substring, &start, &end))
				3587	return NULL;
				3588	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3589	(PyObject *)substring);
				3590	if (substring == NULL)
				3591	return NULL;
				3592
				3593	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3594
				3595	Py_DECREF(substring);
				3596	return result;
				3597	}
				3598
				3599	static char rindex__doc__[] =
				3600	"S.rindex(sub [,start [,end]]) -> int\n\
				3601	\n\
				3602	Like S.rfind() but raise ValueError when the substring is not found.";
				3603
				3604	static PyObject *
				3605	unicode_rindex(PyUnicodeObject self, PyObject args)
				3606	{
				3607	int result;
				3608	PyUnicodeObject *substring;
				3609	int start = 0;
				3610	int end = INT_MAX;
				3611
				3612	if (!PyArg_ParseTuple(args, "O\|ii:rindex", &substring, &start, &end))
				3613	return NULL;
				3614	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3615	(PyObject *)substring);
				3616	if (substring == NULL)
				3617	return NULL;
				3618
				3619	result = findstring(self, substring, start, end, -1);
				3620
				3621	Py_DECREF(substring);
				3622	if (result < 0) {
				3623	PyErr_SetString(PyExc_ValueError, "substring not found");
				3624	return NULL;
				3625	}
				3626	return PyInt_FromLong(result);
				3627	}
				3628
				3629	static char rjust__doc__[] =
				3630	"S.rjust(width) -> unicode\n\
				3631	\n\
				3632	Return S right justified in a Unicode string of length width. Padding is\n\
				3633	done using spaces.";
				3634
				3635	static PyObject *
				3636	unicode_rjust(PyUnicodeObject self, PyObject args)
				3637	{
				3638	int width;
				3639	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3640	return NULL;
				3641
				3642	if (self->length >= width) {
				3643	Py_INCREF(self);
				3644	return (PyObject*) self;
				3645	}
				3646
				3647	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3648	}
				3649
				3650	static char rstrip__doc__[] =
				3651	"S.rstrip() -> unicode\n\
				3652	\n\
				3653	Return a copy of the string S with trailing whitespace removed.";
				3654
				3655	static PyObject *
				3656	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3657	{
				3658	if (!PyArg_NoArgs(args))
				3659	return NULL;
				3660	return strip(self, 0, 1);
				3661	}
				3662
				3663	static PyObject*
				3664	unicode_slice(PyUnicodeObject *self, int start, int end)
				3665	{
				3666	/* standard clamping */
				3667	if (start < 0)
				3668	start = 0;
				3669	if (end < 0)
				3670	end = 0;
				3671	if (end > self->length)
				3672	end = self->length;
				3673	if (start == 0 && end == self->length) {
				3674	/* full slice, return original string */
				3675	Py_INCREF(self);
				3676	return (PyObject*) self;
				3677	}
				3678	if (start > end)
				3679	start = end;
				3680	/* copy slice */
				3681	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3682	end - start);
				3683	}
				3684
				3685	PyObject PyUnicode_Split(PyObject s,
				3686	PyObject *sep,
				3687	int maxsplit)
				3688	{
				3689	PyObject *result;
				3690
				3691	s = PyUnicode_FromObject(s);
				3692	if (s == NULL)
				3693	return NULL;
				3694	if (sep != NULL) {
				3695	sep = PyUnicode_FromObject(sep);
				3696	if (sep == NULL) {
				3697	Py_DECREF(s);
				3698	return NULL;
				3699	}
				3700	}
				3701
				3702	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3703
				3704	Py_DECREF(s);
				3705	Py_XDECREF(sep);
				3706	return result;
				3707	}
				3708
				3709	static char split__doc__[] =
				3710	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3711	\n\
				3712	Return a list of the words in S, using sep as the\n\
				3713	delimiter string. If maxsplit is given, at most maxsplit\n\
				3714	splits are done. If sep is not specified, any whitespace string\n\
				3715	is a separator.";
				3716
				3717	static PyObject*
				3718	unicode_split(PyUnicodeObject self, PyObject args)
				3719	{
				3720	PyObject *substring = Py_None;
				3721	int maxcount = -1;
				3722
				3723	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3724	return NULL;
				3725
				3726	if (substring == Py_None)
				3727	return split(self, NULL, maxcount);
				3728	else if (PyUnicode_Check(substring))
				3729	return split(self, (PyUnicodeObject *)substring, maxcount);
				3730	else
				3731	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3732	}
				3733
				3734	static char splitlines__doc__[] =
				3735	"S.splitlines([maxsplit]]) -> list of strings\n\
				3736	\n\
				3737	Return a list of the lines in S, breaking at line boundaries.\n\
				3738	If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
				3739	included in the resulting list.";
				3740
				3741	static PyObject*
				3742	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3743	{
				3744	int maxcount = -1;
				3745
				3746	if (!PyArg_ParseTuple(args, "\|i:splitlines", &maxcount))
				3747	return NULL;
				3748
				3749	return PyUnicode_Splitlines((PyObject *)self, maxcount);
				3750	}
				3751
				3752	static
				3753	PyObject unicode_str(PyUnicodeObject self)
				3754	{
				3755	return PyUnicode_AsUTF8String((PyObject *)self);
				3756	}
				3757
				3758	static char strip__doc__[] =
				3759	"S.strip() -> unicode\n\
				3760	\n\
				3761	Return a copy of S with leading and trailing whitespace removed.";
				3762
				3763	static PyObject *
				3764	unicode_strip(PyUnicodeObject self, PyObject args)
				3765	{
				3766	if (!PyArg_NoArgs(args))
				3767	return NULL;
				3768	return strip(self, 1, 1);
				3769	}
				3770
				3771	static char swapcase__doc__[] =
				3772	"S.swapcase() -> unicode\n\
				3773	\n\
				3774	Return a copy of S with uppercase characters converted to lowercase\n\
				3775	and vice versa.";
				3776
				3777	static PyObject*
				3778	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3779	{
				3780	if (!PyArg_NoArgs(args))
				3781	return NULL;
				3782	return fixup(self, fixswapcase);
				3783	}
				3784
				3785	static char translate__doc__[] =
				3786	"S.translate(table) -> unicode\n\
				3787	\n\
				3788	Return a copy of the string S, where all characters have been mapped\n\
				3789	through the given translation table, which must be a mapping of\n\
				3790	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3791	are left untouched. Characters mapped to None are deleted.";
				3792
				3793	static PyObject*
				3794	unicode_translate(PyUnicodeObject self, PyObject args)
				3795	{
				3796	PyObject *table;
				3797
				3798	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3799	return NULL;
				3800	return PyUnicode_TranslateCharmap(self->str,
				3801	self->length,
				3802	table,
				3803	"ignore");
				3804	}
				3805
				3806	static char upper__doc__[] =
				3807	"S.upper() -> unicode\n\
				3808	\n\
				3809	Return a copy of S converted to uppercase.";
				3810
				3811	static PyObject*
				3812	unicode_upper(PyUnicodeObject self, PyObject args)
				3813	{
				3814	if (!PyArg_NoArgs(args))
				3815	return NULL;
				3816	return fixup(self, fixupper);
				3817	}
				3818
				#if 0
				static char zfill__doc__[] =
				"S.zfill(width) -> unicode\n"
				"\n"
				"Pad a numeric string x with zeros on the left, to fill a field\n"
				"of the specified width. The string x is never truncated.";

				/* S.zfill(width) -- currently compiled out.
				 *
				 * self: unicode object to pad.
				 * args: argument tuple holding one int, the target width.
				 * Returns self itself (with a new reference) when it is already at least
				 * width long; otherwise the result of pad() with '0' on the left, with a
				 * leading '+' or '-' of the original moved to the front.  Returns NULL if
				 * argument parsing fails or pad() returns NULL.
				 */
				static PyObject *
				unicode_zfill(PyUnicodeObject *self, PyObject *args)
				{
				    int fill;
				    PyUnicodeObject *u;

				    int width;
				    if (!PyArg_ParseTuple(args, "i:zfill", &width))
				        return NULL;

				    if (self->length >= width) {
				        Py_INCREF(self);
				        return (PyObject*) self;
				    }

				    fill = width - self->length;

				    u = pad(self, fill, 0, '0');
				    /* pad()'s result is dereferenced below; bail out if it is NULL */
				    if (u == NULL)
				        return NULL;

				    if (u->str[fill] == '+' || u->str[fill] == '-') {
				        /* move sign to beginning of string */
				        u->str[0] = u->str[fill];
				        u->str[fill] = '0';
				    }

				    return (PyObject*) u;
				}
				#endif
				3854
				#if 0
				/* Debugging helper -- currently compiled out.  Returns the value of
				   unicode_freelist_size as a Python int, or NULL if any argument was
				   passed. */
				static PyObject*
				unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
				{
				    if (!PyArg_NoArgs(args))
				        return NULL;
				    return PyInt_FromLong(unicode_freelist_size);
				}
				#endif
				3864
				3865	static char startswith__doc__[] =
				3866	"S.startswith(prefix[, start[, end]]) -> int\n\
				3867	\n\
				3868	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3869	optional start, test S beginning at that position. With optional end, stop\n\
				3870	comparing S at that position.";
				3871
				3872	static PyObject *
				3873	unicode_startswith(PyUnicodeObject *self,
				3874	PyObject *args)
				3875	{
				3876	PyUnicodeObject *substring;
				3877	int start = 0;
				3878	int end = INT_MAX;
				3879	PyObject *result;
				3880
				3881	if (!PyArg_ParseTuple(args, "O\|ii:startswith", &substring, &start, &end))
				3882	return NULL;
				3883	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3884	(PyObject *)substring);
				3885	if (substring == NULL)
				3886	return NULL;
				3887
				3888	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				3889
				3890	Py_DECREF(substring);
				3891	return result;
				3892	}
				3893
				3894
				3895	static char endswith__doc__[] =
				3896	"S.endswith(suffix[, start[, end]]) -> int\n\
				3897	\n\
				3898	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				3899	optional start, test S beginning at that position. With optional end, stop\n\
				3900	comparing S at that position.";
				3901
				3902	static PyObject *
				3903	unicode_endswith(PyUnicodeObject *self,
				3904	PyObject *args)
				3905	{
				3906	PyUnicodeObject *substring;
				3907	int start = 0;
				3908	int end = INT_MAX;
				3909	PyObject *result;
				3910
				3911	if (!PyArg_ParseTuple(args, "O\|ii:endswith", &substring, &start, &end))
				3912	return NULL;
				3913	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3914	(PyObject *)substring);
				3915	if (substring == NULL)
				3916	return NULL;
				3917
				3918	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				3919
				3920	Py_DECREF(substring);
				3921	return result;
				3922	}
				3923
				3924
				3925	static PyMethodDef unicode_methods[] = {
				3926
				3927	/* Order is according to common usage: often used methods should
				3928	appear first, since lookup is done sequentially. */
				3929
				3930	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				3931	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				3932	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				3933	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				3934	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				3935	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				3936	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				3937	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				3938	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				3939	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				3940	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				3941	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				3942	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				3943	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				3944	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				3945	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				3946	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				3947	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				3948	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				3949	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				3950	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				3951	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				3952	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				3953	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				3954	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				3955	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				3956	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				3957	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				3958	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				3959	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				3960	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				3961	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				3962	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				3963	#if 0
				3964	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				3965	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				3966	#endif
				3967
				3968	#if 0
				3969	/* This one is just used for debugging the implementation. */
				3970	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				3971	#endif
				3972
				3973	{NULL, NULL}
				3974	};
				3975
				3976	static PyObject *
				3977	unicode_getattr(PyUnicodeObject self, char name)
				3978	{
				3979	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				3980	}
				3981
				3982	static PySequenceMethods unicode_as_sequence = {
				3983	(inquiry) unicode_length, /* sq_length */
				3984	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				3985	(intargfunc) unicode_repeat, /* sq_repeat */
				3986	(intargfunc) unicode_getitem, /* sq_item */
				3987	(intintargfunc) unicode_slice, /* sq_slice */
				3988	0, /* sq_ass_item */
				3989	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3990	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3991	};
				3992
				3993	static int
				3994	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				3995	int index,
				3996	const void **ptr)
				3997	{
				3998	if (index != 0) {
				3999	PyErr_SetString(PyExc_SystemError,
				4000	"accessing non-existent unicode segment");
				4001	return -1;
				4002	}
				4003	ptr = (void ) self->str;
				4004	return PyUnicode_GET_DATA_SIZE(self);
				4005	}
				4006
				4007	static int
				4008	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4009	const void **ptr)
				4010	{
				4011	PyErr_SetString(PyExc_TypeError,
				4012	"cannot use unicode as modifyable buffer");
				4013	return -1;
				4014	}
				4015
				4016	static int
				4017	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4018	int *lenp)
				4019	{
				4020	if (lenp)
				4021	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4022	return 1;
				4023	}
				4024
				4025	static int
				4026	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4027	int index,
				4028	const void **ptr)
				4029	{
				4030	PyObject *str;
				4031
				4032	if (index != 0) {
				4033	PyErr_SetString(PyExc_SystemError,
				4034	"accessing non-existent unicode segment");
				4035	return -1;
				4036	}
				4037	str = utf8_string(self, NULL);
				4038	if (str == NULL)
				4039	return -1;
				4040	ptr = (void ) PyString_AS_STRING(str);
				4041	return PyString_GET_SIZE(str);
				4042	}
				4043
				4044	/* Helpers for PyUnicode_Format() */
				4045
				4046	static PyObject *
				4047	getnextarg(args, arglen, p_argidx)
				4048	PyObject *args;
				4049	int arglen;
				4050	int *p_argidx;
				4051	{
				4052	int argidx = *p_argidx;
				4053	if (argidx < arglen) {
				4054	(*p_argidx)++;
				4055	if (arglen < 0)
				4056	return args;
				4057	else
				4058	return PyTuple_GetItem(args, argidx);
				4059	}
				4060	PyErr_SetString(PyExc_TypeError,
				4061	"not enough arguments for format string");
				4062	return NULL;
				4063	}
				4064
				/* Bit flags collected from a format specifier; each is set by the flag
				   character noted beside it. */
				#define F_LJUST 0x01    /* '-' */
				#define F_SIGN  0x02    /* '+' */
				#define F_BLANK 0x04    /* ' ' */
				#define F_ALT   0x08    /* '#' */
				#define F_ZERO  0x10    /* '0' */
				4070
				4071	static
				4072	#ifdef HAVE_STDARG_PROTOTYPES
				4073	int usprintf(register Py_UNICODE buffer, char format, ...)
				4074	#else
				4075	int usprintf(va_alist) va_dcl
				4076	#endif
				4077	{
				4078	register int i;
				4079	int len;
				4080	va_list va;
				4081	char *charbuffer;
				4082	#ifdef HAVE_STDARG_PROTOTYPES
				4083	va_start(va, format);
				4084	#else
				4085	Py_UNICODE *args;
				4086	char *format;
				4087
				4088	va_start(va);
				4089	buffer = va_arg(va, Py_UNICODE *);
				4090	format = va_arg(va, char *);
				4091	#endif
				4092
				4093	/* First, format the string as char array, then expand to Py_UNICODE
				4094	array. */
				4095	charbuffer = (char *)buffer;
				4096	len = vsprintf(charbuffer, format, va);
				4097	for (i = len - 1; i >= 0; i--)
				4098	buffer[i] = (Py_UNICODE) charbuffer[i];
				4099
				4100	va_end(va);
				4101	return len;
				4102	}
				4103
				4104	static int
				4105	formatfloat(Py_UNICODE *buf,
				4106	int flags,
				4107	int prec,
				4108	int type,
				4109	PyObject *v)
				4110	{
				4111	char fmt[20];
				4112	double x;
				4113
				4114	x = PyFloat_AsDouble(v);
				4115	if (x == -1.0 && PyErr_Occurred())
				4116	return -1;
				4117	if (prec < 0)
				4118	prec = 6;
				4119	if (prec > 50)
				4120	prec = 50; /* Arbitrary limitation */
				4121	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4122	type = 'g';
				4123	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4124	return usprintf(buf, fmt, x);
				4125	}
				4126
				4127	static int
				4128	formatint(Py_UNICODE *buf,
				4129	int flags,
				4130	int prec,
				4131	int type,
				4132	PyObject *v)
				4133	{
				4134	char fmt[20];
				4135	long x;
				4136
				4137	x = PyInt_AsLong(v);
				4138	if (x == -1 && PyErr_Occurred())
				4139	return -1;
				4140	if (prec < 0)
				4141	prec = 1;
				4142	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4143	return usprintf(buf, fmt, x);
				4144	}
				4145
				4146	static int
				4147	formatchar(Py_UNICODE *buf,
				4148	PyObject *v)
				4149	{
				4150	if (PyUnicode_Check(v))
				4151	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4152
				4153	else if (PyString_Check(v))
				4154	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4155
				4156	else {
				4157	/* Integer input truncated to a character */
				4158	long x;
				4159	x = PyInt_AsLong(v);
				4160	if (x == -1 && PyErr_Occurred())
				4161	return -1;
				4162	buf[0] = (char) x;
				4163	}
				4164	buf[1] = '\0';
				4165	return 1;
				4166	}
				4167
				4168	PyObject PyUnicode_Format(PyObject format,
				4169	PyObject *args)
				4170	{
				4171	Py_UNICODE fmt, res;
				4172	int fmtcnt, rescnt, reslen, arglen, argidx;
				4173	int args_owned = 0;
				4174	PyUnicodeObject *result = NULL;
				4175	PyObject *dict = NULL;
				4176	PyObject *uformat;
				4177
				4178	if (format == NULL \|\| args == NULL) {
				4179	PyErr_BadInternalCall();
				4180	return NULL;
				4181	}
				4182	uformat = PyUnicode_FromObject(format);
				4183	fmt = PyUnicode_AS_UNICODE(uformat);
				4184	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4185
				4186	reslen = rescnt = fmtcnt + 100;
				4187	result = _PyUnicode_New(reslen);
				4188	if (result == NULL)
				4189	goto onError;
				4190	res = PyUnicode_AS_UNICODE(result);
				4191
				4192	if (PyTuple_Check(args)) {
				4193	arglen = PyTuple_Size(args);
				4194	argidx = 0;
				4195	}
				4196	else {
				4197	arglen = -1;
				4198	argidx = -2;
				4199	}
				4200	if (args->ob_type->tp_as_mapping)
				4201	dict = args;
				4202
				4203	while (--fmtcnt >= 0) {
				4204	if (*fmt != '%') {
				4205	if (--rescnt < 0) {
				4206	rescnt = fmtcnt + 100;
				4207	reslen += rescnt;
				4208	if (_PyUnicode_Resize(result, reslen) < 0)
				4209	return NULL;
				4210	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4211	--rescnt;
				4212	}
				4213	res++ = fmt++;
				4214	}
				4215	else {
				4216	/* Got a format specifier */
				4217	int flags = 0;
				4218	int width = -1;
				4219	int prec = -1;
				4220	int size = 0;
				4221	Py_UNICODE c = '\0';
				4222	Py_UNICODE fill;
				4223	PyObject *v = NULL;
				4224	PyObject *temp = NULL;
				4225	Py_UNICODE *buf;
				4226	Py_UNICODE sign;
				4227	int len;
				4228	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4229
				4230	fmt++;
				4231	if (*fmt == '(') {
				4232	Py_UNICODE *keystart;
				4233	int keylen;
				4234	PyObject *key;
				4235	int pcount = 1;
				4236
				4237	if (dict == NULL) {
				4238	PyErr_SetString(PyExc_TypeError,
				4239	"format requires a mapping");
				4240	goto onError;
				4241	}
				4242	++fmt;
				4243	--fmtcnt;
				4244	keystart = fmt;
				4245	/* Skip over balanced parentheses */
				4246	while (pcount > 0 && --fmtcnt >= 0) {
				4247	if (*fmt == ')')
				4248	--pcount;
				4249	else if (*fmt == '(')
				4250	++pcount;
				4251	fmt++;
				4252	}
				4253	keylen = fmt - keystart - 1;
				4254	if (fmtcnt < 0 \|\| pcount > 0) {
				4255	PyErr_SetString(PyExc_ValueError,
				4256	"incomplete format key");
				4257	goto onError;
				4258	}
				4259	/* keys are converted to strings (using UTF-8) and
				4260	then looked up since Python uses strings to hold
				4261	variables names etc. in its namespaces and we
				4262	wouldn't want to break common idioms. The
				4263	alternative would be using Unicode objects for the
				4264	lookup but u"abc" and "abc" have different hash
				4265	values (on purpose). */
				4266	key = PyUnicode_EncodeUTF8(keystart,
				4267	keylen,
				4268	NULL);
				4269	if (key == NULL)
				4270	goto onError;
				4271	if (args_owned) {
				4272	Py_DECREF(args);
				4273	args_owned = 0;
				4274	}
				4275	args = PyObject_GetItem(dict, key);
				4276	Py_DECREF(key);
				4277	if (args == NULL) {
				4278	goto onError;
				4279	}
				4280	args_owned = 1;
				4281	arglen = -1;
				4282	argidx = -2;
				4283	}
				4284	while (--fmtcnt >= 0) {
				4285	switch (c = *fmt++) {
				4286	case '-': flags \|= F_LJUST; continue;
				4287	case '+': flags \|= F_SIGN; continue;
				4288	case ' ': flags \|= F_BLANK; continue;
				4289	case '#': flags \|= F_ALT; continue;
				4290	case '0': flags \|= F_ZERO; continue;
				4291	}
				4292	break;
				4293	}
				4294	if (c == '*') {
				4295	v = getnextarg(args, arglen, &argidx);
				4296	if (v == NULL)
				4297	goto onError;
				4298	if (!PyInt_Check(v)) {
				4299	PyErr_SetString(PyExc_TypeError,
				4300	"* wants int");
				4301	goto onError;
				4302	}
				4303	width = PyInt_AsLong(v);
				4304	if (width < 0) {
				4305	flags \|= F_LJUST;
				4306	width = -width;
				4307	}
				4308	if (--fmtcnt >= 0)
				4309	c = *fmt++;
				4310	}
				4311	else if (c >= '0' && c <= '9') {
				4312	width = c - '0';
				4313	while (--fmtcnt >= 0) {
				4314	c = *fmt++;
				4315	if (c < '0' \|\| c > '9')
				4316	break;
				4317	if ((width*10) / 10 != width) {
				4318	PyErr_SetString(PyExc_ValueError,
				4319	"width too big");
				4320	goto onError;
				4321	}
				4322	width = width*10 + (c - '0');
				4323	}
				4324	}
				4325	if (c == '.') {
				4326	prec = 0;
				4327	if (--fmtcnt >= 0)
				4328	c = *fmt++;
				4329	if (c == '*') {
				4330	v = getnextarg(args, arglen, &argidx);
				4331	if (v == NULL)
				4332	goto onError;
				4333	if (!PyInt_Check(v)) {
				4334	PyErr_SetString(PyExc_TypeError,
				4335	"* wants int");
				4336	goto onError;
				4337	}
				4338	prec = PyInt_AsLong(v);
				4339	if (prec < 0)
				4340	prec = 0;
				4341	if (--fmtcnt >= 0)
				4342	c = *fmt++;
				4343	}
				4344	else if (c >= '0' && c <= '9') {
				4345	prec = c - '0';
				4346	while (--fmtcnt >= 0) {
				4347	c = Py_CHARMASK(*fmt++);
				4348	if (c < '0' \|\| c > '9')
				4349	break;
				4350	if ((prec*10) / 10 != prec) {
				4351	PyErr_SetString(PyExc_ValueError,
				4352	"prec too big");
				4353	goto onError;
				4354	}
				4355	prec = prec*10 + (c - '0');
				4356	}
				4357	}
				4358	} /* prec */
				4359	if (fmtcnt >= 0) {
				4360	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4361	size = c;
				4362	if (--fmtcnt >= 0)
				4363	c = *fmt++;
				4364	}
				4365	}
				4366	if (fmtcnt < 0) {
				4367	PyErr_SetString(PyExc_ValueError,
				4368	"incomplete format");
				4369	goto onError;
				4370	}
				4371	if (c != '%') {
				4372	v = getnextarg(args, arglen, &argidx);
				4373	if (v == NULL)
				4374	goto onError;
				4375	}
				4376	sign = 0;
				4377	fill = ' ';
				4378	switch (c) {
				4379
				4380	case '%':
				4381	buf = tmpbuf;
				4382	buf[0] = '%';
				4383	len = 1;
				4384	break;
				4385
				4386	case 's':
				4387	case 'r':
				4388	if (PyUnicode_Check(v) && c == 's') {
				4389	temp = v;
				4390	Py_INCREF(temp);
				4391	}
				4392	else {
				4393	PyObject *unicode;
				4394	if (c == 's')
				4395	temp = PyObject_Str(v);
				4396	else
				4397	temp = PyObject_Repr(v);
				4398	if (temp == NULL)
				4399	goto onError;
				4400	if (!PyString_Check(temp)) {
				4401	/* XXX Note: this should never happen, since
				4402	PyObject_Repr() and PyObject_Str() assure
				4403	this */
				4404	Py_DECREF(temp);
				4405	PyErr_SetString(PyExc_TypeError,
				4406	"%s argument has non-string str()");
				4407	goto onError;
				4408	}
				4409	unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
				4410	PyString_GET_SIZE(temp),
				4411	"strict");
				4412	Py_DECREF(temp);
				4413	temp = unicode;
				4414	if (temp == NULL)
				4415	goto onError;
				4416	}
				4417	buf = PyUnicode_AS_UNICODE(temp);
				4418	len = PyUnicode_GET_SIZE(temp);
				4419	if (prec >= 0 && len > prec)
				4420	len = prec;
				4421	break;
				4422
				4423	case 'i':
				4424	case 'd':
				4425	case 'u':
				4426	case 'o':
				4427	case 'x':
				4428	case 'X':
				4429	if (c == 'i')
				4430	c = 'd';
				4431	buf = tmpbuf;
				4432	len = formatint(buf, flags, prec, c, v);
				4433	if (len < 0)
				4434	goto onError;
				4435	sign = (c == 'd');
				4436	if (flags & F_ZERO) {
				4437	fill = '0';
				4438	if ((flags&F_ALT) &&
				4439	(c == 'x' \|\| c == 'X') &&
				4440	buf[0] == '0' && buf[1] == c) {
				4441	res++ = buf++;
				4442	res++ = buf++;
				4443	rescnt -= 2;
				4444	len -= 2;
				4445	width -= 2;
				4446	if (width < 0)
				4447	width = 0;
				4448	}
				4449	}
				4450	break;
				4451
				4452	case 'e':
				4453	case 'E':
				4454	case 'f':
				4455	case 'g':
				4456	case 'G':
				4457	buf = tmpbuf;
				4458	len = formatfloat(buf, flags, prec, c, v);
				4459	if (len < 0)
				4460	goto onError;
				4461	sign = 1;
				4462	if (flags&F_ZERO)
				4463	fill = '0';
				4464	break;
				4465
				4466	case 'c':
				4467	buf = tmpbuf;
				4468	len = formatchar(buf, v);
				4469	if (len < 0)
				4470	goto onError;
				4471	break;
				4472
				4473	default:
				4474	PyErr_Format(PyExc_ValueError,
				4475	"unsupported format character '%c' (0x%x)",
				4476	c, c);
				4477	goto onError;
				4478	}
				4479	if (sign) {
				4480	if (buf == '-' \|\| buf == '+') {
				4481	sign = *buf++;
				4482	len--;
				4483	}
				4484	else if (flags & F_SIGN)
				4485	sign = '+';
				4486	else if (flags & F_BLANK)
				4487	sign = ' ';
				4488	else
				4489	sign = 0;
				4490	}
				4491	if (width < len)
				4492	width = len;
				4493	if (rescnt < width + (sign != 0)) {
				4494	reslen -= rescnt;
				4495	rescnt = width + fmtcnt + 100;
				4496	reslen += rescnt;
				4497	if (_PyUnicode_Resize(result, reslen) < 0)
				4498	return NULL;
				4499	res = PyUnicode_AS_UNICODE(result)
				4500	+ reslen - rescnt;
				4501	}
				4502	if (sign) {
				4503	if (fill != ' ')
				4504	*res++ = sign;
				4505	rescnt--;
				4506	if (width > len)
				4507	width--;
				4508	}
				4509	if (width > len && !(flags & F_LJUST)) {
				4510	do {
				4511	--rescnt;
				4512	*res++ = fill;
				4513	} while (--width > len);
				4514	}
				4515	if (sign && fill == ' ')
				4516	*res++ = sign;
				4517	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4518	res += len;
				4519	rescnt -= len;
				4520	while (--width >= len) {
				4521	--rescnt;
				4522	*res++ = ' ';
				4523	}
				4524	if (dict && (argidx < arglen) && c != '%') {
				4525	PyErr_SetString(PyExc_TypeError,
				4526	"not all arguments converted");
				4527	goto onError;
				4528	}
				4529	Py_XDECREF(temp);
				4530	} /* '%' */
				4531	} /* until end */
				4532	if (argidx < arglen && !dict) {
				4533	PyErr_SetString(PyExc_TypeError,
				4534	"not all arguments converted");
				4535	goto onError;
				4536	}
				4537
				4538	if (args_owned) {
				4539	Py_DECREF(args);
				4540	}
				4541	Py_DECREF(uformat);
				4542	_PyUnicode_Resize(result, reslen - rescnt);
				4543	return (PyObject *)result;
				4544
				4545	onError:
				4546	Py_XDECREF(result);
				4547	Py_DECREF(uformat);
				4548	if (args_owned) {
				4549	Py_DECREF(args);
				4550	}
				4551	return NULL;
				4552	}
				4553
				4554	static PyBufferProcs unicode_as_buffer = {
				4555	(getreadbufferproc) unicode_buffer_getreadbuf,
				4556	(getwritebufferproc) unicode_buffer_getwritebuf,
				4557	(getsegcountproc) unicode_buffer_getsegcount,
				4558	(getcharbufferproc) unicode_buffer_getcharbuf,
				4559	};
				4560
				4561	PyTypeObject PyUnicode_Type = {
				4562	PyObject_HEAD_INIT(&PyType_Type)
				4563	0, /* ob_size */
				4564	"unicode", /* tp_name */
				4565	sizeof(PyUnicodeObject), /* tp_size */
				4566	0, /* tp_itemsize */
				4567	/* Slots */
				4568	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4569	0, /* tp_print */
				4570	(getattrfunc)unicode_getattr, /* tp_getattr */
				4571	0, /* tp_setattr */
				4572	(cmpfunc) unicode_compare, /* tp_compare */
				4573	(reprfunc) unicode_repr, /* tp_repr */
				4574	0, /* tp_as_number */
				4575	&unicode_as_sequence, /* tp_as_sequence */
				4576	0, /* tp_as_mapping */
				4577	(hashfunc) unicode_hash, /* tp_hash*/
				4578	0, /* tp_call*/
				4579	(reprfunc) unicode_str, /* tp_str */
				4580	(getattrofunc) NULL, /* tp_getattro */
				4581	(setattrofunc) NULL, /* tp_setattro */
				4582	&unicode_as_buffer, /* tp_as_buffer */
				4583	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4584	};
				4585
				4586	/* Initialize the Unicode implementation */
				4587
				4588	void _PyUnicode_Init()
				4589	{
				4590	/* Doublecheck the configuration... */
				4591	if (sizeof(Py_UNICODE) != 2)
				4592	Py_FatalError("Unicode configuration error: "
				4593	"sizeof(Py_UNICODE) != 2 bytes");
				4594
				4595	unicode_empty = _PyUnicode_New(0);
				4596	}
				4597
				4598	/* Finalize the Unicode implementation */
				4599
				4600	void
				4601	_PyUnicode_Fini()
				4602	{
				4603	PyUnicodeObject *u = unicode_freelist;
				4604
				4605	while (u != NULL) {
				4606	PyUnicodeObject *v = u;
				4607	u = (PyUnicodeObject *)u;
				4608	free(v);
				4609	}
				4610	Py_XDECREF(unicode_empty);
				4611	}