Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: f10f9ab75697b9d6544763989b2c5de7851762bb [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
				4	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
				111	/* --- Globals ------------------------------------------------------------ */
				112
				113	/* The empty Unicode object */
				114	static PyUnicodeObject *unicode_empty = NULL;
				115
				116	/* Free list for Unicode objects */
				117	static PyUnicodeObject *unicode_freelist = NULL;
				118	static int unicode_freelist_size = 0;
				119
				120	/* --- Unicode Object ----------------------------------------------------- */
				121
				122	static
				123	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				124	int length)
				125	{
				126	void *oldstr;
				127
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	130	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	131
				132	/* Resizing unicode_empty is not allowed. */
				133	if (unicode == unicode_empty) {
				134	PyErr_SetString(PyExc_SystemError,
				135	"can't resize empty unicode object");
				136	return -1;
				137	}
				138
				139	/* We allocate one more byte to make sure the string is
				140	Ux0000 terminated -- XXX is this needed ? */
				141	oldstr = unicode->str;
				142	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				143	if (!unicode->str) {
				144	unicode->str = oldstr;
				145	PyErr_NoMemory();
				146	return -1;
				147	}
				148	unicode->str[length] = 0;
				149	unicode->length = length;
				150
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	151	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	152	/* Reset the object caches */
				153	if (unicode->utf8str) {
				154	Py_DECREF(unicode->utf8str);
				155	unicode->utf8str = NULL;
				156	}
				157	unicode->hash = -1;
				158
				159	return 0;
				160	}
				161
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	162	int PyUnicode_Resize(PyObject **unicode,
				163	int length)
				164	{
				165	PyUnicodeObject *v;
				166
				167	if (unicode == NULL) {
				168	PyErr_BadInternalCall();
				169	return -1;
				170	}
				171	v = (PyUnicodeObject )unicode;
				172	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				173	PyErr_BadInternalCall();
				174	return -1;
				175	}
				176	return _PyUnicode_Resize(v, length);
				177	}
				178
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	179	/* We allocate one more byte to make sure the string is
				180	Ux0000 terminated -- XXX is this needed ?
				181
				182	XXX This allocator could further be enhanced by assuring that the
				183	free list never reduces its size below 1.
				184
				185	*/
				186
				187	static
				188	PyUnicodeObject *_PyUnicode_New(int length)
				189	{
				190	register PyUnicodeObject *unicode;
				191
				192	/* Optimization for empty strings */
				193	if (length == 0 && unicode_empty != NULL) {
				194	Py_INCREF(unicode_empty);
				195	return unicode_empty;
				196	}
				197
				198	/* Unicode freelist & memory allocation */
				199	if (unicode_freelist) {
				200	unicode = unicode_freelist;
				201	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				202	unicode_freelist_size--;
				203	unicode->ob_type = &PyUnicode_Type;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	204	_Py_NewReference((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	205	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	206	/* Keep-Alive optimization: we only upsize the buffer,
				207	never downsize it. */
				208	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	209	_PyUnicode_Resize(unicode, length)) {
				210	free(unicode->str);
				211	PyMem_DEL(unicode);
				212	return NULL;
				213	}
				214	}
				215	else
				216	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				217	}
				218	else {
				219	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				220	if (unicode == NULL)
				221	return NULL;
				222	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				223	}
				224
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	225	if (!unicode->str)
				226	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	227	unicode->str[length] = 0;
				228	unicode->length = length;
				229	unicode->hash = -1;
				230	unicode->utf8str = NULL;
				231	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	232
				233	onError:
				234	_Py_ForgetReference((PyObject *)unicode);
				235	PyMem_DEL(unicode);
				236	PyErr_NoMemory();
				237	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	}
				239
				240	static
				241	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				242	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	244	/* Keep-Alive optimization */
				245	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	free(unicode->str);
				247	unicode->str = NULL;
				248	unicode->length = 0;
				249	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	250	if (unicode->utf8str) {
				251	Py_DECREF(unicode->utf8str);
				252	unicode->utf8str = NULL;
				253	}
				254	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	255	(PyUnicodeObject *)unicode = unicode_freelist;
				256	unicode_freelist = unicode;
				257	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	}
				259	else {
				260	free(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	261	Py_XDECREF(unicode->utf8str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	PyMem_DEL(unicode);
				263	}
				264	}
				265
				266	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				267	int size)
				268	{
				269	PyUnicodeObject *unicode;
				270
				271	unicode = _PyUnicode_New(size);
				272	if (!unicode)
				273	return NULL;
				274
				275	/* Copy the Unicode data into the new object */
				276	if (u != NULL)
				277	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				278
				279	return (PyObject *)unicode;
				280	}
				281
				282	#ifdef HAVE_WCHAR_H
				283
				284	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				285	int size)
				286	{
				287	PyUnicodeObject *unicode;
				288
				289	if (w == NULL) {
				290	PyErr_BadInternalCall();
				291	return NULL;
				292	}
				293
				294	unicode = _PyUnicode_New(size);
				295	if (!unicode)
				296	return NULL;
				297
				298	/* Copy the wchar_t data into the new object */
				299	#ifdef HAVE_USABLE_WCHAR_T
				300	memcpy(unicode->str, w, size * sizeof(wchar_t));
				301	#else
				302	{
				303	register Py_UNICODE *u;
				304	register int i;
				305	u = PyUnicode_AS_UNICODE(unicode);
				306	for (i = size; i >= 0; i--)
				307	u++ = w++;
				308	}
				309	#endif
				310
				311	return (PyObject *)unicode;
				312	}
				313
				314	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				315	register wchar_t *w,
				316	int size)
				317	{
				318	if (unicode == NULL) {
				319	PyErr_BadInternalCall();
				320	return -1;
				321	}
				322	if (size > PyUnicode_GET_SIZE(unicode))
				323	size = PyUnicode_GET_SIZE(unicode);
				324	#ifdef HAVE_USABLE_WCHAR_T
				325	memcpy(w, unicode->str, size * sizeof(wchar_t));
				326	#else
				327	{
				328	register Py_UNICODE *u;
				329	register int i;
				330	u = PyUnicode_AS_UNICODE(unicode);
				331	for (i = size; i >= 0; i--)
				332	w++ = u++;
				333	}
				334	#endif
				335
				336	return size;
				337	}
				338
				339	#endif
				340
				341	PyObject PyUnicode_FromObject(register PyObject obj)
				342	{
				343	const char *s;
				344	int len;
				345
				346	if (obj == NULL) {
				347	PyErr_BadInternalCall();
				348	return NULL;
				349	}
				350	else if (PyUnicode_Check(obj)) {
				351	Py_INCREF(obj);
				352	return obj;
				353	}
				354	else if (PyString_Check(obj)) {
				355	s = PyString_AS_STRING(obj);
				356	len = PyString_GET_SIZE(obj);
				357	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	358	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				359	/* Overwrite the error message with something more useful in
				360	case of a TypeError. */
				361	if (PyErr_ExceptionMatches(PyExc_TypeError))
				362	PyErr_SetString(PyExc_TypeError,
				363	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	364	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	365	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	366	if (len == 0) {
				367	Py_INCREF(unicode_empty);
				368	return (PyObject *)unicode_empty;
				369	}
				370	return PyUnicode_DecodeUTF8(s, len, "strict");
				371	}
				372
				373	PyObject PyUnicode_Decode(const char s,
				374	int size,
				375	const char *encoding,
				376	const char *errors)
				377	{
				378	PyObject buffer = NULL, unicode;
				379
				380	/* Shortcut for the default encoding UTF-8 */
				381	if (encoding == NULL \|\|
				382	(strcmp(encoding, "utf-8") == 0))
				383	return PyUnicode_DecodeUTF8(s, size, errors);
				384
				385	/* Decode via the codec registry */
				386	buffer = PyBuffer_FromMemory((void *)s, size);
				387	if (buffer == NULL)
				388	goto onError;
				389	unicode = PyCodec_Decode(buffer, encoding, errors);
				390	if (unicode == NULL)
				391	goto onError;
				392	if (!PyUnicode_Check(unicode)) {
				393	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	394	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	395	unicode->ob_type->tp_name);
				396	Py_DECREF(unicode);
				397	goto onError;
				398	}
				399	Py_DECREF(buffer);
				400	return unicode;
				401
				402	onError:
				403	Py_XDECREF(buffer);
				404	return NULL;
				405	}
				406
				407	PyObject PyUnicode_Encode(const Py_UNICODE s,
				408	int size,
				409	const char *encoding,
				410	const char *errors)
				411	{
				412	PyObject v, unicode;
				413
				414	unicode = PyUnicode_FromUnicode(s, size);
				415	if (unicode == NULL)
				416	return NULL;
				417	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				418	Py_DECREF(unicode);
				419	return v;
				420	}
				421
				422	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				423	const char *encoding,
				424	const char *errors)
				425	{
				426	PyObject *v;
				427
				428	if (!PyUnicode_Check(unicode)) {
				429	PyErr_BadArgument();
				430	goto onError;
				431	}
				432	/* Shortcut for the default encoding UTF-8 */
				433	if ((encoding == NULL \|\|
				434	(strcmp(encoding, "utf-8") == 0)) &&
				435	errors == NULL)
				436	return PyUnicode_AsUTF8String(unicode);
				437
				438	/* Encode via the codec registry */
				439	v = PyCodec_Encode(unicode, encoding, errors);
				440	if (v == NULL)
				441	goto onError;
				442	/* XXX Should we really enforce this ? */
				443	if (!PyString_Check(v)) {
				444	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	445	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	446	v->ob_type->tp_name);
				447	Py_DECREF(v);
				448	goto onError;
				449	}
				450	return v;
				451
				452	onError:
				453	return NULL;
				454	}
				455
				456	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				457	{
				458	if (!PyUnicode_Check(unicode)) {
				459	PyErr_BadArgument();
				460	goto onError;
				461	}
				462	return PyUnicode_AS_UNICODE(unicode);
				463
				464	onError:
				465	return NULL;
				466	}
				467
				468	int PyUnicode_GetSize(PyObject *unicode)
				469	{
				470	if (!PyUnicode_Check(unicode)) {
				471	PyErr_BadArgument();
				472	goto onError;
				473	}
				474	return PyUnicode_GET_SIZE(unicode);
				475
				476	onError:
				477	return -1;
				478	}
				479
				480	/* --- UTF-8 Codec -------------------------------------------------------- */
				481
				482	static
				483	char utf8_code_length[256] = {
				484	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				485	illegal prefix. see RFC 2279 for details */
				486	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				487	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				488	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				489	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				490	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				491	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				492	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				493	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				494	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				495	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				496	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				497	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				498	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				499	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				500	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				501	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				502	};
				503
				504	static
				505	int utf8_decoding_error(const char **source,
				506	Py_UNICODE **dest,
				507	const char *errors,
				508	const char *details)
				509	{
				510	if ((errors == NULL) \|\|
				511	(strcmp(errors,"strict") == 0)) {
				512	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	513	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	514	details);
				515	return -1;
				516	}
				517	else if (strcmp(errors,"ignore") == 0) {
				518	(*source)++;
				519	return 0;
				520	}
				521	else if (strcmp(errors,"replace") == 0) {
				522	(*source)++;
				523	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				524	(*dest)++;
				525	return 0;
				526	}
				527	else {
				528	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	529	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	530	errors);
				531	return -1;
				532	}
				533	}
				534
				535	#define UTF8_ERROR(details) do { \
				536	if (utf8_decoding_error(&s, &p, errors, details)) \
				537	goto onError; \
				538	continue; \
				539	} while (0)
				540
				541	PyObject PyUnicode_DecodeUTF8(const char s,
				542	int size,
				543	const char *errors)
				544	{
				545	int n;
				546	const char *e;
				547	PyUnicodeObject *unicode;
				548	Py_UNICODE *p;
				549
				550	/* Note: size will always be longer than the resulting Unicode
				551	character count */
				552	unicode = _PyUnicode_New(size);
				553	if (!unicode)
				554	return NULL;
				555	if (size == 0)
				556	return (PyObject *)unicode;
				557
				558	/* Unpack UTF-8 encoded data */
				559	p = unicode->str;
				560	e = s + size;
				561
				562	while (s < e) {
				563	register Py_UNICODE ch = (unsigned char)*s;
				564
				565	if (ch < 0x80) {
				566	*p++ = ch;
				567	s++;
				568	continue;
				569	}
				570
				571	n = utf8_code_length[ch];
				572
				573	if (s + n > e)
				574	UTF8_ERROR("unexpected end of data");
				575
				576	switch (n) {
				577
				578	case 0:
				579	UTF8_ERROR("unexpected code byte");
				580	break;
				581
				582	case 1:
				583	UTF8_ERROR("internal error");
				584	break;
				585
				586	case 2:
				587	if ((s[1] & 0xc0) != 0x80)
				588	UTF8_ERROR("invalid data");
				589	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				590	if (ch < 0x80)
				591	UTF8_ERROR("illegal encoding");
				592	else
				593	*p++ = ch;
				594	break;
				595
				596	case 3:
				597	if ((s[1] & 0xc0) != 0x80 \|\|
				598	(s[2] & 0xc0) != 0x80)
				599	UTF8_ERROR("invalid data");
				600	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				601	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				602	UTF8_ERROR("illegal encoding");
				603	else
				604	*p++ = ch;
				605	break;
				606
				607	default:
				608	/* Other sizes are only needed for UCS-4 */
				609	UTF8_ERROR("unsupported Unicode code range");
				610	}
				611	s += n;
				612	}
				613
				614	/* Adjust length */
				615	if (_PyUnicode_Resize(unicode, p - unicode->str))
				616	goto onError;
				617
				618	return (PyObject *)unicode;
				619
				620	onError:
				621	Py_DECREF(unicode);
				622	return NULL;
				623	}
				624
				625	#undef UTF8_ERROR
				626
				627	static
				628	int utf8_encoding_error(const Py_UNICODE **source,
				629	char **dest,
				630	const char *errors,
				631	const char *details)
				632	{
				633	if ((errors == NULL) \|\|
				634	(strcmp(errors,"strict") == 0)) {
				635	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	636	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	637	details);
				638	return -1;
				639	}
				640	else if (strcmp(errors,"ignore") == 0) {
				641	return 0;
				642	}
				643	else if (strcmp(errors,"replace") == 0) {
				644	**dest = '?';
				645	(*dest)++;
				646	return 0;
				647	}
				648	else {
				649	PyErr_Format(PyExc_ValueError,
				650	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	651	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	652	errors);
				653	return -1;
				654	}
				655	}
				656
				657	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				658	int size,
				659	const char *errors)
				660	{
				661	PyObject *v;
				662	char *p;
				663	char *q;
				664
				665	v = PyString_FromStringAndSize(NULL, 3 * size);
				666	if (v == NULL)
				667	return NULL;
				668	if (size == 0)
				669	goto done;
				670
				671	p = q = PyString_AS_STRING(v);
				672	while (size-- > 0) {
				673	Py_UNICODE ch = *s++;
				674	if (ch < 0x80)
				675	*p++ = (char) ch;
				676	else if (ch < 0x0800) {
				677	*p++ = 0xc0 \| (ch >> 6);
				678	*p++ = 0x80 \| (ch & 0x3f);
				679	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				680	/* These byte ranges are reserved for UTF-16 surrogate
				681	bytes which the Python implementation currently does
				682	not support. */
				683	printf("code range problem: U+%04x\n", ch);
				684	if (utf8_encoding_error(&s, &p, errors,
				685	"unsupported code range"))
				686	goto onError;
				687	} else {
				688	*p++ = 0xe0 \| (ch >> 12);
				689	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				690	*p++ = 0x80 \| (ch & 0x3f);
				691	}
				692	}
				693	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	694	if (_PyString_Resize(&v, p - q))
				695	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	696
				697	done:
				698	return v;
				699
				700	onError:
				701	Py_DECREF(v);
				702	return NULL;
				703	}
				704
				705	/* Return a Python string holding the UTF-8 encoded value of the
				706	Unicode object.
				707
				708	The resulting string is cached in the Unicode object for subsequent
				709	usage by this function. The cached version is needed to implement
				710	the character buffer interface.
				711
				712	The refcount of the string is not incremented.
				713
				714	*/
				715
				716	static
				717	PyObject utf8_string(PyUnicodeObject self,
				718	const char *errors)
				719	{
				720	PyObject *v = self->utf8str;
				721
				722	if (v)
				723	return v;
				724	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
				725	PyUnicode_GET_SIZE(self),
				726	errors);
				727	if (v && errors == NULL)
				728	self->utf8str = v;
				729	return v;
				730	}
				731
				732	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				733	{
				734	PyObject *str;
				735
				736	if (!PyUnicode_Check(unicode)) {
				737	PyErr_BadArgument();
				738	return NULL;
				739	}
				740	str = utf8_string((PyUnicodeObject *)unicode, NULL);
				741	if (str == NULL)
				742	return NULL;
				743	Py_INCREF(str);
				744	return str;
				745	}
				746
				747	/* --- UTF-16 Codec ------------------------------------------------------- */
				748
				749	static
				750	int utf16_decoding_error(const Py_UNICODE **source,
				751	Py_UNICODE **dest,
				752	const char *errors,
				753	const char *details)
				754	{
				755	if ((errors == NULL) \|\|
				756	(strcmp(errors,"strict") == 0)) {
				757	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	758	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	759	details);
				760	return -1;
				761	}
				762	else if (strcmp(errors,"ignore") == 0) {
				763	return 0;
				764	}
				765	else if (strcmp(errors,"replace") == 0) {
				766	if (dest) {
				767	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				768	(*dest)++;
				769	}
				770	return 0;
				771	}
				772	else {
				773	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	774	"UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775	errors);
				776	return -1;
				777	}
				778	}
				779
				780	#define UTF16_ERROR(details) do { \
				781	if (utf16_decoding_error(&q, &p, errors, details)) \
				782	goto onError; \
				783	continue; \
				784	} while(0)
				785
				786	PyObject PyUnicode_DecodeUTF16(const char s,
				787	int size,
				788	const char *errors,
				789	int *byteorder)
				790	{
				791	PyUnicodeObject *unicode;
				792	Py_UNICODE *p;
				793	const Py_UNICODE q, e;
				794	int bo = 0;
				795
				796	/* size should be an even number */
				797	if (size % sizeof(Py_UNICODE) != 0) {
				798	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				799	return NULL;
				800	/* The remaining input chars are ignored if we fall through
				801	here... */
				802	}
				803
				804	/* Note: size will always be longer than the resulting Unicode
				805	character count */
				806	unicode = _PyUnicode_New(size);
				807	if (!unicode)
				808	return NULL;
				809	if (size == 0)
				810	return (PyObject *)unicode;
				811
				812	/* Unpack UTF-16 encoded data */
				813	p = unicode->str;
				814	q = (Py_UNICODE *)s;
				815	e = q + (size / sizeof(Py_UNICODE));
				816
				817	if (byteorder)
				818	bo = *byteorder;
				819
				820	while (q < e) {
				821	register Py_UNICODE ch = *q++;
				822
				823	/* Check for BOM marks (U+FEFF) in the input and adjust
				824	current byte order setting accordingly. Swap input
				825	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				826	!) */
				827	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				828	if (ch == 0xFEFF) {
				829	bo = -1;
				830	continue;
				831	} else if (ch == 0xFFFE) {
				832	bo = 1;
				833	continue;
				834	}
				835	if (bo == 1)
				836	ch = (ch >> 8) \| (ch << 8);
				837	#else
				838	if (ch == 0xFEFF) {
				839	bo = 1;
				840	continue;
				841	} else if (ch == 0xFFFE) {
				842	bo = -1;
				843	continue;
				844	}
				845	if (bo == -1)
				846	ch = (ch >> 8) \| (ch << 8);
				847	#endif
				848	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				849	*p++ = ch;
				850	continue;
				851	}
				852
				853	/* UTF-16 code pair: */
				854	if (q >= e)
				855	UTF16_ERROR("unexpected end of data");
				856	if (0xDC00 <= q && q <= 0xDFFF) {
				857	q++;
				858	if (0xD800 <= q && q <= 0xDBFF)
				859	/* This is valid data (a UTF-16 surrogate pair), but
				860	we are not able to store this information since our
				861	Py_UNICODE type only has 16 bits... this might
				862	change someday, even though it's unlikely. */
				863	UTF16_ERROR("code pairs are not supported");
				864	else
				865	continue;
				866	}
				867	UTF16_ERROR("illegal encoding");
				868	}
				869
				870	if (byteorder)
				871	*byteorder = bo;
				872
				873	/* Adjust length */
				874	if (_PyUnicode_Resize(unicode, p - unicode->str))
				875	goto onError;
				876
				877	return (PyObject *)unicode;
				878
				879	onError:
				880	Py_DECREF(unicode);
				881	return NULL;
				882	}
				883
				884	#undef UTF16_ERROR
				885
				886	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				887	int size,
				888	const char *errors,
				889	int byteorder)
				890	{
				891	PyObject *v;
				892	Py_UNICODE *p;
				893	char *q;
				894
				895	/* We don't create UTF-16 pairs... */
				896	v = PyString_FromStringAndSize(NULL,
				897	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				898	if (v == NULL)
				899	return NULL;
				900	if (size == 0)
				901	goto done;
				902
				903	q = PyString_AS_STRING(v);
				904	p = (Py_UNICODE *)q;
				905
				906	if (byteorder == 0)
				907	*p++ = 0xFEFF;
				908	if (byteorder == 0 \|\|
				909	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				910	byteorder == -1
				911	#else
				912	byteorder == 1
				913	#endif
				914	)
				915	memcpy(p, s, size * sizeof(Py_UNICODE));
				916	else
				917	while (size-- > 0) {
				918	Py_UNICODE ch = *s++;
				919	*p++ = (ch >> 8) \| (ch << 8);
				920	}
				921	done:
				922	return v;
				923	}
				924
				925	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				926	{
				927	if (!PyUnicode_Check(unicode)) {
				928	PyErr_BadArgument();
				929	return NULL;
				930	}
				931	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				932	PyUnicode_GET_SIZE(unicode),
				933	NULL,
				934	0);
				935	}
				936
				937	/* --- Unicode Escape Codec ----------------------------------------------- */
				938
				939	static
				940	int unicodeescape_decoding_error(const char **source,
				941	unsigned int *x,
				942	const char *errors,
				943	const char *details)
				944	{
				945	if ((errors == NULL) \|\|
				946	(strcmp(errors,"strict") == 0)) {
				947	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	948	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	949	details);
				950	return -1;
				951	}
				952	else if (strcmp(errors,"ignore") == 0) {
				953	return 0;
				954	}
				955	else if (strcmp(errors,"replace") == 0) {
				956	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				957	return 0;
				958	}
				959	else {
				960	PyErr_Format(PyExc_ValueError,
				961	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	962	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	963	errors);
				964	return -1;
				965	}
				966	}
				967
				968	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				969	int size,
				970	const char *errors)
				971	{
				972	PyUnicodeObject *v;
				973	Py_UNICODE p = NULL, buf = NULL;
				974	const char *end;
				975
				976	/* Escaped strings will always be longer than the resulting
				977	Unicode string, so we start with size here and then reduce the
				978	length after conversion to the true value. */
				979	v = _PyUnicode_New(size);
				980	if (v == NULL)
				981	goto onError;
				982	if (size == 0)
				983	return (PyObject *)v;
				984	p = buf = PyUnicode_AS_UNICODE(v);
				985	end = s + size;
				986	while (s < end) {
				987	unsigned char c;
				988	unsigned int x;
				989	int i;
				990
				991	/* Non-escape characters are interpreted as Unicode ordinals */
				992	if (*s != '\\') {
				993	p++ = (unsigned char)s++;
				994	continue;
				995	}
				996
				997	/* \ - Escapes */
				998	s++;
				999	switch (*s++) {
				1000
				1001	/* \x escapes */
				1002	case '\n': break;
				1003	case '\\': *p++ = '\\'; break;
				1004	case '\'': *p++ = '\''; break;
				1005	case '\"': *p++ = '\"'; break;
				1006	case 'b': *p++ = '\b'; break;
				1007	case 'f': p++ = '\014'; break; / FF */
				1008	case 't': *p++ = '\t'; break;
				1009	case 'n': *p++ = '\n'; break;
				1010	case 'r': *p++ = '\r'; break;
				1011	case 'v': p++ = '\013'; break; / VT */
				1012	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1013
				1014	/* \OOO (octal) escapes */
				1015	case '0': case '1': case '2': case '3':
				1016	case '4': case '5': case '6': case '7':
				1017	c = s[-1] - '0';
				1018	if ('0' <= s && s <= '7') {
				1019	c = (c<<3) + *s++ - '0';
				1020	if ('0' <= s && s <= '7')
				1021	c = (c<<3) + *s++ - '0';
				1022	}
				1023	*p++ = c;
				1024	break;
				1025
				1026	/* \xXXXX escape with 0-4 hex digits */
				1027	case 'x':
				1028	x = 0;
				1029	c = (unsigned char)*s;
				1030	if (isxdigit(c)) {
				1031	do {
				1032	x = (x<<4) & ~0xF;
				1033	if ('0' <= c && c <= '9')
				1034	x += c - '0';
				1035	else if ('a' <= c && c <= 'f')
				1036	x += 10 + c - 'a';
				1037	else
				1038	x += 10 + c - 'A';
				1039	c = (unsigned char)*++s;
				1040	} while (isxdigit(c));
				1041	*p++ = x;
				1042	} else {
				1043	*p++ = '\\';
				1044	*p++ = (unsigned char)s[-1];
				1045	}
				1046	break;
				1047
				1048	/* \uXXXX with 4 hex digits */
				1049	case 'u':
				1050	for (x = 0, i = 0; i < 4; i++) {
				1051	c = (unsigned char)s[i];
				1052	if (!isxdigit(c)) {
				1053	if (unicodeescape_decoding_error(&s, &x, errors,
				1054	"truncated \\uXXXX"))
				1055	goto onError;
				1056	i++;
				1057	break;
				1058	}
				1059	x = (x<<4) & ~0xF;
				1060	if (c >= '0' && c <= '9')
				1061	x += c - '0';
				1062	else if (c >= 'a' && c <= 'f')
				1063	x += 10 + c - 'a';
				1064	else
				1065	x += 10 + c - 'A';
				1066	}
				1067	s += i;
				1068	*p++ = x;
				1069	break;
				1070
				1071	default:
				1072	*p++ = '\\';
				1073	*p++ = (unsigned char)s[-1];
				1074	break;
				1075	}
				1076	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1077	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1078	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	return (PyObject *)v;
				1080
				1081	onError:
				1082	Py_XDECREF(v);
				1083	return NULL;
				1084	}
				1085
				1086	/* Return a Unicode-Escape string version of the Unicode object.
				1087
				1088	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1089	appropriate.
				1090
				1091	*/
				1092
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1093	static const Py_UNICODE findchar(const Py_UNICODE s,
				1094	int size,
				1095	Py_UNICODE ch);
				1096
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1097	static
				1098	PyObject unicodeescape_string(const Py_UNICODE s,
				1099	int size,
				1100	int quotes)
				1101	{
				1102	PyObject *repr;
				1103	char *p;
				1104	char *q;
				1105
				1106	static const char *hexdigit = "0123456789ABCDEF";
				1107
				1108	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1109	if (repr == NULL)
				1110	return NULL;
				1111
				1112	p = q = PyString_AS_STRING(repr);
				1113
				1114	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	*p++ = 'u';
				1116	*p++ = (findchar(s, size, '\'') &&
				1117	!findchar(s, size, '"')) ? '"' : '\'';
				1118	}
				1119	while (size-- > 0) {
				1120	Py_UNICODE ch = *s++;
				1121	/* Escape quotes */
				1122	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1123	*p++ = '\\';
				1124	*p++ = (char) ch;
				1125	}
				1126	/* Map 16-bit characters to '\uxxxx' */
				1127	else if (ch >= 256) {
				1128	*p++ = '\\';
				1129	*p++ = 'u';
				1130	*p++ = hexdigit[(ch >> 12) & 0xf];
				1131	*p++ = hexdigit[(ch >> 8) & 0xf];
				1132	*p++ = hexdigit[(ch >> 4) & 0xf];
				1133	*p++ = hexdigit[ch & 15];
				1134	}
				1135	/* Map non-printable US ASCII to '\ooo' */
				1136	else if (ch < ' ' \|\| ch >= 128) {
				1137	*p++ = '\\';
				1138	*p++ = hexdigit[(ch >> 6) & 7];
				1139	*p++ = hexdigit[(ch >> 3) & 7];
				1140	*p++ = hexdigit[ch & 7];
				1141	}
				1142	/* Copy everything else as-is */
				1143	else
				1144	*p++ = (char) ch;
				1145	}
				1146	if (quotes)
				1147	*p++ = q[1];
				1148
				1149	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1150	if (_PyString_Resize(&repr, p - q))
				1151	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1152
				1153	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1154
				1155	onError:
				1156	Py_DECREF(repr);
				1157	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1158	}
				1159
				1160	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1161	int size)
				1162	{
				1163	return unicodeescape_string(s, size, 0);
				1164	}
				1165
				1166	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1167	{
				1168	if (!PyUnicode_Check(unicode)) {
				1169	PyErr_BadArgument();
				1170	return NULL;
				1171	}
				1172	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1173	PyUnicode_GET_SIZE(unicode));
				1174	}
				1175
				1176	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1177
				1178	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1179	int size,
				1180	const char *errors)
				1181	{
				1182	PyUnicodeObject *v;
				1183	Py_UNICODE p, buf;
				1184	const char *end;
				1185	const char *bs;
				1186
				1187	/* Escaped strings will always be longer than the resulting
				1188	Unicode string, so we start with size here and then reduce the
				1189	length after conversion to the true value. */
				1190	v = _PyUnicode_New(size);
				1191	if (v == NULL)
				1192	goto onError;
				1193	if (size == 0)
				1194	return (PyObject *)v;
				1195	p = buf = PyUnicode_AS_UNICODE(v);
				1196	end = s + size;
				1197	while (s < end) {
				1198	unsigned char c;
				1199	unsigned int x;
				1200	int i;
				1201
				1202	/* Non-escape characters are interpreted as Unicode ordinals */
				1203	if (*s != '\\') {
				1204	p++ = (unsigned char)s++;
				1205	continue;
				1206	}
				1207
				1208	/* \u-escapes are only interpreted iff the number of leading
				1209	backslashes if odd */
				1210	bs = s;
				1211	for (;s < end;) {
				1212	if (*s != '\\')
				1213	break;
				1214	p++ = (unsigned char)s++;
				1215	}
				1216	if (((s - bs) & 1) == 0 \|\|
				1217	s >= end \|\|
				1218	*s != 'u') {
				1219	continue;
				1220	}
				1221	p--;
				1222	s++;
				1223
				1224	/* \uXXXX with 4 hex digits */
				1225	for (x = 0, i = 0; i < 4; i++) {
				1226	c = (unsigned char)s[i];
				1227	if (!isxdigit(c)) {
				1228	if (unicodeescape_decoding_error(&s, &x, errors,
				1229	"truncated \\uXXXX"))
				1230	goto onError;
				1231	i++;
				1232	break;
				1233	}
				1234	x = (x<<4) & ~0xF;
				1235	if (c >= '0' && c <= '9')
				1236	x += c - '0';
				1237	else if (c >= 'a' && c <= 'f')
				1238	x += 10 + c - 'a';
				1239	else
				1240	x += 10 + c - 'A';
				1241	}
				1242	s += i;
				1243	*p++ = x;
				1244	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1245	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1246	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1247	return (PyObject *)v;
				1248
				1249	onError:
				1250	Py_XDECREF(v);
				1251	return NULL;
				1252	}
				1253
				1254	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1255	int size)
				1256	{
				1257	PyObject *repr;
				1258	char *p;
				1259	char *q;
				1260
				1261	static const char *hexdigit = "0123456789ABCDEF";
				1262
				1263	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1264	if (repr == NULL)
				1265	return NULL;
				1266
				1267	p = q = PyString_AS_STRING(repr);
				1268	while (size-- > 0) {
				1269	Py_UNICODE ch = *s++;
				1270	/* Map 16-bit characters to '\uxxxx' */
				1271	if (ch >= 256) {
				1272	*p++ = '\\';
				1273	*p++ = 'u';
				1274	*p++ = hexdigit[(ch >> 12) & 0xf];
				1275	*p++ = hexdigit[(ch >> 8) & 0xf];
				1276	*p++ = hexdigit[(ch >> 4) & 0xf];
				1277	*p++ = hexdigit[ch & 15];
				1278	}
				1279	/* Copy everything else as-is */
				1280	else
				1281	*p++ = (char) ch;
				1282	}
				1283	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1284	if (_PyString_Resize(&repr, p - q))
				1285	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1286
				1287	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1288
				1289	onError:
				1290	Py_DECREF(repr);
				1291	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1292	}
				1293
				1294	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1295	{
				1296	if (!PyUnicode_Check(unicode)) {
				1297	PyErr_BadArgument();
				1298	return NULL;
				1299	}
				1300	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1301	PyUnicode_GET_SIZE(unicode));
				1302	}
				1303
				1304	/* --- Latin-1 Codec ------------------------------------------------------ */
				1305
				1306	PyObject PyUnicode_DecodeLatin1(const char s,
				1307	int size,
				1308	const char *errors)
				1309	{
				1310	PyUnicodeObject *v;
				1311	Py_UNICODE *p;
				1312
				1313	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1314	v = _PyUnicode_New(size);
				1315	if (v == NULL)
				1316	goto onError;
				1317	if (size == 0)
				1318	return (PyObject *)v;
				1319	p = PyUnicode_AS_UNICODE(v);
				1320	while (size-- > 0)
				1321	p++ = (unsigned char)s++;
				1322	return (PyObject *)v;
				1323
				1324	onError:
				1325	Py_XDECREF(v);
				1326	return NULL;
				1327	}
				1328
				1329	static
				1330	int latin1_encoding_error(const Py_UNICODE **source,
				1331	char **dest,
				1332	const char *errors,
				1333	const char *details)
				1334	{
				1335	if ((errors == NULL) \|\|
				1336	(strcmp(errors,"strict") == 0)) {
				1337	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1338	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1339	details);
				1340	return -1;
				1341	}
				1342	else if (strcmp(errors,"ignore") == 0) {
				1343	return 0;
				1344	}
				1345	else if (strcmp(errors,"replace") == 0) {
				1346	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1347	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1348	return 0;
				1349	}
				1350	else {
				1351	PyErr_Format(PyExc_ValueError,
				1352	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1353	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1354	errors);
				1355	return -1;
				1356	}
				1357	}
				1358
				1359	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1360	int size,
				1361	const char *errors)
				1362	{
				1363	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1364	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1365	repr = PyString_FromStringAndSize(NULL, size);
				1366	if (repr == NULL)
				1367	return NULL;
				1368
				1369	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1370	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1371	while (size-- > 0) {
				1372	Py_UNICODE ch = *p++;
				1373	if (ch >= 256) {
				1374	if (latin1_encoding_error(&p, &s, errors,
				1375	"ordinal not in range(256)"))
				1376	goto onError;
				1377	}
				1378	else
				1379	*s++ = (char)ch;
				1380	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1381	/* Resize if error handling skipped some characters */
				1382	if (s - start < PyString_GET_SIZE(repr))
				1383	if (_PyString_Resize(&repr, s - start))
				1384	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1385	return repr;
				1386
				1387	onError:
				1388	Py_DECREF(repr);
				1389	return NULL;
				1390	}
				1391
				1392	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1393	{
				1394	if (!PyUnicode_Check(unicode)) {
				1395	PyErr_BadArgument();
				1396	return NULL;
				1397	}
				1398	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1399	PyUnicode_GET_SIZE(unicode),
				1400	NULL);
				1401	}
				1402
				1403	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1404
				1405	static
				1406	int ascii_decoding_error(const char **source,
				1407	Py_UNICODE **dest,
				1408	const char *errors,
				1409	const char *details)
				1410	{
				1411	if ((errors == NULL) \|\|
				1412	(strcmp(errors,"strict") == 0)) {
				1413	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1414	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	details);
				1416	return -1;
				1417	}
				1418	else if (strcmp(errors,"ignore") == 0) {
				1419	return 0;
				1420	}
				1421	else if (strcmp(errors,"replace") == 0) {
				1422	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1423	(*dest)++;
				1424	return 0;
				1425	}
				1426	else {
				1427	PyErr_Format(PyExc_ValueError,
				1428	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1429	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1430	errors);
				1431	return -1;
				1432	}
				1433	}
				1434
				1435	PyObject PyUnicode_DecodeASCII(const char s,
				1436	int size,
				1437	const char *errors)
				1438	{
				1439	PyUnicodeObject *v;
				1440	Py_UNICODE *p;
				1441
				1442	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1443	v = _PyUnicode_New(size);
				1444	if (v == NULL)
				1445	goto onError;
				1446	if (size == 0)
				1447	return (PyObject *)v;
				1448	p = PyUnicode_AS_UNICODE(v);
				1449	while (size-- > 0) {
				1450	register unsigned char c;
				1451
				1452	c = (unsigned char)*s++;
				1453	if (c < 128)
				1454	*p++ = c;
				1455	else if (ascii_decoding_error(&s, &p, errors,
				1456	"ordinal not in range(128)"))
				1457	goto onError;
				1458	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1459	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1460	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1461	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1462	return (PyObject *)v;
				1463
				1464	onError:
				1465	Py_XDECREF(v);
				1466	return NULL;
				1467	}
				1468
				1469	static
				1470	int ascii_encoding_error(const Py_UNICODE **source,
				1471	char **dest,
				1472	const char *errors,
				1473	const char *details)
				1474	{
				1475	if ((errors == NULL) \|\|
				1476	(strcmp(errors,"strict") == 0)) {
				1477	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1478	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1479	details);
				1480	return -1;
				1481	}
				1482	else if (strcmp(errors,"ignore") == 0) {
				1483	return 0;
				1484	}
				1485	else if (strcmp(errors,"replace") == 0) {
				1486	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1487	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1488	return 0;
				1489	}
				1490	else {
				1491	PyErr_Format(PyExc_ValueError,
				1492	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1493	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1494	errors);
				1495	return -1;
				1496	}
				1497	}
				1498
				1499	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1500	int size,
				1501	const char *errors)
				1502	{
				1503	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1504	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1505	repr = PyString_FromStringAndSize(NULL, size);
				1506	if (repr == NULL)
				1507	return NULL;
				1508
				1509	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1510	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1511	while (size-- > 0) {
				1512	Py_UNICODE ch = *p++;
				1513	if (ch >= 128) {
				1514	if (ascii_encoding_error(&p, &s, errors,
				1515	"ordinal not in range(128)"))
				1516	goto onError;
				1517	}
				1518	else
				1519	*s++ = (char)ch;
				1520	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1521	/* Resize if error handling skipped some characters */
				1522	if (s - start < PyString_GET_SIZE(repr))
				1523	if (_PyString_Resize(&repr, s - start))
				1524	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1525	return repr;
				1526
				1527	onError:
				1528	Py_DECREF(repr);
				1529	return NULL;
				1530	}
				1531
				1532	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1533	{
				1534	if (!PyUnicode_Check(unicode)) {
				1535	PyErr_BadArgument();
				1536	return NULL;
				1537	}
				1538	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1539	PyUnicode_GET_SIZE(unicode),
				1540	NULL);
				1541	}
				1542
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1543	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1544
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1545	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1546
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1547	PyObject PyUnicode_DecodeMBCS(const char s,
				1548	int size,
				1549	const char *errors)
				1550	{
				1551	PyUnicodeObject *v;
				1552	Py_UNICODE *p;
				1553
				1554	/* First get the size of the result */
				1555	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
				1556	if (usize==0)
				1557	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1558
				1559	v = _PyUnicode_New(usize);
				1560	if (v == NULL)
				1561	return NULL;
				1562	if (usize == 0)
				1563	return (PyObject *)v;
				1564	p = PyUnicode_AS_UNICODE(v);
				1565	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1566	Py_DECREF(v);
				1567	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1568	}
				1569
				1570	return (PyObject *)v;
				1571	}
				1572
				1573	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1574	int size,
				1575	const char *errors)
				1576	{
				1577	PyObject *repr;
				1578	char *s;
				1579
				1580	/* First get the size of the result */
				1581	DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
				1582	if (mbcssize==0)
				1583	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1584
				1585	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1586	if (repr == NULL)
				1587	return NULL;
				1588	if (mbcssize==0)
				1589	return repr;
				1590
				1591	/* Do the conversion */
				1592	s = PyString_AS_STRING(repr);
				1593	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1594	Py_DECREF(repr);
				1595	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1596	}
				1597	return repr;
				1598	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1599
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1600	#endif /* MS_WIN32 */
				1601
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1602	/* --- Character Mapping Codec -------------------------------------------- */
				1603
				1604	static
				1605	int charmap_decoding_error(const char **source,
				1606	Py_UNICODE **dest,
				1607	const char *errors,
				1608	const char *details)
				1609	{
				1610	if ((errors == NULL) \|\|
				1611	(strcmp(errors,"strict") == 0)) {
				1612	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1613	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1614	details);
				1615	return -1;
				1616	}
				1617	else if (strcmp(errors,"ignore") == 0) {
				1618	return 0;
				1619	}
				1620	else if (strcmp(errors,"replace") == 0) {
				1621	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1622	(*dest)++;
				1623	return 0;
				1624	}
				1625	else {
				1626	PyErr_Format(PyExc_ValueError,
				1627	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1628	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1629	errors);
				1630	return -1;
				1631	}
				1632	}
				1633
				1634	PyObject PyUnicode_DecodeCharmap(const char s,
				1635	int size,
				1636	PyObject *mapping,
				1637	const char *errors)
				1638	{
				1639	PyUnicodeObject *v;
				1640	Py_UNICODE *p;
				1641
				1642	/* Default to Latin-1 */
				1643	if (mapping == NULL)
				1644	return PyUnicode_DecodeLatin1(s, size, errors);
				1645
				1646	v = _PyUnicode_New(size);
				1647	if (v == NULL)
				1648	goto onError;
				1649	if (size == 0)
				1650	return (PyObject *)v;
				1651	p = PyUnicode_AS_UNICODE(v);
				1652	while (size-- > 0) {
				1653	unsigned char ch = *s++;
				1654	PyObject w, x;
				1655
				1656	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1657	w = PyInt_FromLong((long)ch);
				1658	if (w == NULL)
				1659	goto onError;
				1660	x = PyObject_GetItem(mapping, w);
				1661	Py_DECREF(w);
				1662	if (x == NULL) {
				1663	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1664	/* No mapping found: default to Latin-1 mapping */
				1665	PyErr_Clear();
				1666	*p++ = (Py_UNICODE)ch;
				1667	continue;
				1668	}
				1669	goto onError;
				1670	}
				1671
				1672	/* Apply mapping */
				1673	if (PyInt_Check(x)) {
				1674	int value = PyInt_AS_LONG(x);
				1675	if (value < 0 \|\| value > 65535) {
				1676	PyErr_SetString(PyExc_TypeError,
				1677	"character mapping must be in range(65336)");
				1678	Py_DECREF(x);
				1679	goto onError;
				1680	}
				1681	*p++ = (Py_UNICODE)value;
				1682	}
				1683	else if (x == Py_None) {
				1684	/* undefined mapping */
				1685	if (charmap_decoding_error(&s, &p, errors,
				1686	"character maps to <undefined>")) {
				1687	Py_DECREF(x);
				1688	goto onError;
				1689	}
				1690	}
				1691	else if (PyUnicode_Check(x)) {
				1692	if (PyUnicode_GET_SIZE(x) != 1) {
				1693	/* 1-n mapping */
				1694	PyErr_SetString(PyExc_NotImplementedError,
				1695	"1-n mappings are currently not implemented");
				1696	Py_DECREF(x);
				1697	goto onError;
				1698	}
				1699	p++ = PyUnicode_AS_UNICODE(x);
				1700	}
				1701	else {
				1702	/* wrong return value */
				1703	PyErr_SetString(PyExc_TypeError,
				1704	"character mapping must return integer, None or unicode");
				1705	Py_DECREF(x);
				1706	goto onError;
				1707	}
				1708	Py_DECREF(x);
				1709	}
				1710	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1711	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1712	goto onError;
				1713	return (PyObject *)v;
				1714
				1715	onError:
				1716	Py_XDECREF(v);
				1717	return NULL;
				1718	}
				1719
				1720	static
				1721	int charmap_encoding_error(const Py_UNICODE **source,
				1722	char **dest,
				1723	const char *errors,
				1724	const char *details)
				1725	{
				1726	if ((errors == NULL) \|\|
				1727	(strcmp(errors,"strict") == 0)) {
				1728	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1729	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1730	details);
				1731	return -1;
				1732	}
				1733	else if (strcmp(errors,"ignore") == 0) {
				1734	return 0;
				1735	}
				1736	else if (strcmp(errors,"replace") == 0) {
				1737	**dest = '?';
				1738	(*dest)++;
				1739	return 0;
				1740	}
				1741	else {
				1742	PyErr_Format(PyExc_ValueError,
				1743	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1744	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1745	errors);
				1746	return -1;
				1747	}
				1748	}
				1749
				1750	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1751	int size,
				1752	PyObject *mapping,
				1753	const char *errors)
				1754	{
				1755	PyObject *v;
				1756	char *s;
				1757
				1758	/* Default to Latin-1 */
				1759	if (mapping == NULL)
				1760	return PyUnicode_EncodeLatin1(p, size, errors);
				1761
				1762	v = PyString_FromStringAndSize(NULL, size);
				1763	if (v == NULL)
				1764	return NULL;
				1765	s = PyString_AS_STRING(v);
				1766	while (size-- > 0) {
				1767	Py_UNICODE ch = *p++;
				1768	PyObject w, x;
				1769
				1770	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1771	w = PyInt_FromLong((long)ch);
				1772	if (w == NULL)
				1773	goto onError;
				1774	x = PyObject_GetItem(mapping, w);
				1775	Py_DECREF(w);
				1776	if (x == NULL) {
				1777	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1778	/* No mapping found: default to Latin-1 mapping if possible */
				1779	PyErr_Clear();
				1780	if (ch < 256) {
				1781	*s++ = (char)ch;
				1782	continue;
				1783	}
				1784	else if (!charmap_encoding_error(&p, &s, errors,
				1785	"missing character mapping"))
				1786	continue;
				1787	}
				1788	goto onError;
				1789	}
				1790
				1791	/* Apply mapping */
				1792	if (PyInt_Check(x)) {
				1793	int value = PyInt_AS_LONG(x);
				1794	if (value < 0 \|\| value > 255) {
				1795	PyErr_SetString(PyExc_TypeError,
				1796	"character mapping must be in range(256)");
				1797	Py_DECREF(x);
				1798	goto onError;
				1799	}
				1800	*s++ = (char)value;
				1801	}
				1802	else if (x == Py_None) {
				1803	/* undefined mapping */
				1804	if (charmap_encoding_error(&p, &s, errors,
				1805	"character maps to <undefined>")) {
				1806	Py_DECREF(x);
				1807	goto onError;
				1808	}
				1809	}
				1810	else if (PyString_Check(x)) {
				1811	if (PyString_GET_SIZE(x) != 1) {
				1812	/* 1-n mapping */
				1813	PyErr_SetString(PyExc_NotImplementedError,
				1814	"1-n mappings are currently not implemented");
				1815	Py_DECREF(x);
				1816	goto onError;
				1817	}
				1818	s++ = PyString_AS_STRING(x);
				1819	}
				1820	else {
				1821	/* wrong return value */
				1822	PyErr_SetString(PyExc_TypeError,
				1823	"character mapping must return integer, None or unicode");
				1824	Py_DECREF(x);
				1825	goto onError;
				1826	}
				1827	Py_DECREF(x);
				1828	}
				1829	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1830	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1831	goto onError;
				1832	return v;
				1833
				1834	onError:
				1835	Py_DECREF(v);
				1836	return NULL;
				1837	}
				1838
				1839	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1840	PyObject *mapping)
				1841	{
				1842	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1843	PyErr_BadArgument();
				1844	return NULL;
				1845	}
				1846	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1847	PyUnicode_GET_SIZE(unicode),
				1848	mapping,
				1849	NULL);
				1850	}
				1851
				1852	static
				1853	int translate_error(const Py_UNICODE **source,
				1854	Py_UNICODE **dest,
				1855	const char *errors,
				1856	const char *details)
				1857	{
				1858	if ((errors == NULL) \|\|
				1859	(strcmp(errors,"strict") == 0)) {
				1860	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1861	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1862	details);
				1863	return -1;
				1864	}
				1865	else if (strcmp(errors,"ignore") == 0) {
				1866	return 0;
				1867	}
				1868	else if (strcmp(errors,"replace") == 0) {
				1869	**dest = '?';
				1870	(*dest)++;
				1871	return 0;
				1872	}
				1873	else {
				1874	PyErr_Format(PyExc_ValueError,
				1875	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1876	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1877	errors);
				1878	return -1;
				1879	}
				1880	}
				1881
				1882	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1883	int size,
				1884	PyObject *mapping,
				1885	const char *errors)
				1886	{
				1887	PyUnicodeObject *v;
				1888	Py_UNICODE *p;
				1889
				1890	if (mapping == NULL) {
				1891	PyErr_BadArgument();
				1892	return NULL;
				1893	}
				1894
				1895	/* Output will never be longer than input */
				1896	v = _PyUnicode_New(size);
				1897	if (v == NULL)
				1898	goto onError;
				1899	if (size == 0)
				1900	goto done;
				1901	p = PyUnicode_AS_UNICODE(v);
				1902	while (size-- > 0) {
				1903	Py_UNICODE ch = *s++;
				1904	PyObject w, x;
				1905
				1906	/* Get mapping */
				1907	w = PyInt_FromLong(ch);
				1908	if (w == NULL)
				1909	goto onError;
				1910	x = PyObject_GetItem(mapping, w);
				1911	Py_DECREF(w);
				1912	if (x == NULL) {
				1913	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1914	/* No mapping found: default to 1-1 mapping */
				1915	PyErr_Clear();
				1916	*p++ = ch;
				1917	continue;
				1918	}
				1919	goto onError;
				1920	}
				1921
				1922	/* Apply mapping */
				1923	if (PyInt_Check(x))
				1924	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1925	else if (x == Py_None) {
				1926	/* undefined mapping */
				1927	if (translate_error(&s, &p, errors,
				1928	"character maps to <undefined>")) {
				1929	Py_DECREF(x);
				1930	goto onError;
				1931	}
				1932	}
				1933	else if (PyUnicode_Check(x)) {
				1934	if (PyUnicode_GET_SIZE(x) != 1) {
				1935	/* 1-n mapping */
				1936	PyErr_SetString(PyExc_NotImplementedError,
				1937	"1-n mappings are currently not implemented");
				1938	Py_DECREF(x);
				1939	goto onError;
				1940	}
				1941	p++ = PyUnicode_AS_UNICODE(x);
				1942	}
				1943	else {
				1944	/* wrong return value */
				1945	PyErr_SetString(PyExc_TypeError,
				1946	"translate mapping must return integer, None or unicode");
				1947	Py_DECREF(x);
				1948	goto onError;
				1949	}
				1950	Py_DECREF(x);
				1951	}
				1952	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1953	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1954	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1955
				1956	done:
				1957	return (PyObject *)v;
				1958
				1959	onError:
				1960	Py_XDECREF(v);
				1961	return NULL;
				1962	}
				1963
				1964	PyObject PyUnicode_Translate(PyObject str,
				1965	PyObject *mapping,
				1966	const char *errors)
				1967	{
				1968	PyObject *result;
				1969
				1970	str = PyUnicode_FromObject(str);
				1971	if (str == NULL)
				1972	goto onError;
				1973	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				1974	PyUnicode_GET_SIZE(str),
				1975	mapping,
				1976	errors);
				1977	Py_DECREF(str);
				1978	return result;
				1979
				1980	onError:
				1981	Py_XDECREF(str);
				1982	return NULL;
				1983	}
				1984
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	1985	/* --- Decimal Encoder ---------------------------------------------------- */
				1986
				1987	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				1988	int length,
				1989	char *output,
				1990	const char *errors)
				1991	{
				1992	Py_UNICODE p, end;
				1993
				1994	if (output == NULL) {
				1995	PyErr_BadArgument();
				1996	return -1;
				1997	}
				1998
				1999	p = s;
				2000	end = s + length;
				2001	while (p < end) {
				2002	register Py_UNICODE ch = *p++;
				2003	int decimal;
				2004
				2005	if (Py_UNICODE_ISSPACE(ch)) {
				2006	*output++ = ' ';
				2007	continue;
				2008	}
				2009	decimal = Py_UNICODE_TODECIMAL(ch);
				2010	if (decimal >= 0) {
				2011	*output++ = '0' + decimal;
				2012	continue;
				2013	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2014	if (0 < ch && ch < 256) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2015	*output++ = ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2016	continue;
				2017	}
				2018	/* All other characters are considered invalid */
				2019	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2020	PyErr_SetString(PyExc_ValueError,
				2021	"invalid decimal Unicode string");
				2022	goto onError;
				2023	}
				2024	else if (strcmp(errors, "ignore") == 0)
				2025	continue;
				2026	else if (strcmp(errors, "replace") == 0) {
				2027	*output++ = '?';
				2028	continue;
				2029	}
				2030	}
				2031	/* 0-terminate the output string */
				2032	*output++ = '\0';
				2033	return 0;
				2034
				2035	onError:
				2036	return -1;
				2037	}
				2038
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2039	/* --- Helpers ------------------------------------------------------------ */
				2040
				2041	static
				2042	int count(PyUnicodeObject *self,
				2043	int start,
				2044	int end,
				2045	PyUnicodeObject *substring)
				2046	{
				2047	int count = 0;
				2048
				2049	end -= substring->length;
				2050
				2051	while (start <= end)
				2052	if (Py_UNICODE_MATCH(self, start, substring)) {
				2053	count++;
				2054	start += substring->length;
				2055	} else
				2056	start++;
				2057
				2058	return count;
				2059	}
				2060
				2061	int PyUnicode_Count(PyObject *str,
				2062	PyObject *substr,
				2063	int start,
				2064	int end)
				2065	{
				2066	int result;
				2067
				2068	str = PyUnicode_FromObject(str);
				2069	if (str == NULL)
				2070	return -1;
				2071	substr = PyUnicode_FromObject(substr);
				2072	if (substr == NULL) {
				2073	Py_DECREF(substr);
				2074	return -1;
				2075	}
				2076
				2077	result = count((PyUnicodeObject *)str,
				2078	start, end,
				2079	(PyUnicodeObject *)substr);
				2080
				2081	Py_DECREF(str);
				2082	Py_DECREF(substr);
				2083	return result;
				2084	}
				2085
				2086	static
				2087	int findstring(PyUnicodeObject *self,
				2088	PyUnicodeObject *substring,
				2089	int start,
				2090	int end,
				2091	int direction)
				2092	{
				2093	if (start < 0)
				2094	start += self->length;
				2095	if (start < 0)
				2096	start = 0;
				2097
				2098	if (substring->length == 0)
				2099	return start;
				2100
				2101	if (end > self->length)
				2102	end = self->length;
				2103	if (end < 0)
				2104	end += self->length;
				2105	if (end < 0)
				2106	end = 0;
				2107
				2108	end -= substring->length;
				2109
				2110	if (direction < 0) {
				2111	for (; end >= start; end--)
				2112	if (Py_UNICODE_MATCH(self, end, substring))
				2113	return end;
				2114	} else {
				2115	for (; start <= end; start++)
				2116	if (Py_UNICODE_MATCH(self, start, substring))
				2117	return start;
				2118	}
				2119
				2120	return -1;
				2121	}
				2122
				2123	int PyUnicode_Find(PyObject *str,
				2124	PyObject *substr,
				2125	int start,
				2126	int end,
				2127	int direction)
				2128	{
				2129	int result;
				2130
				2131	str = PyUnicode_FromObject(str);
				2132	if (str == NULL)
				2133	return -1;
				2134	substr = PyUnicode_FromObject(substr);
				2135	if (substr == NULL) {
				2136	Py_DECREF(substr);
				2137	return -1;
				2138	}
				2139
				2140	result = findstring((PyUnicodeObject *)str,
				2141	(PyUnicodeObject *)substr,
				2142	start, end, direction);
				2143	Py_DECREF(str);
				2144	Py_DECREF(substr);
				2145	return result;
				2146	}
				2147
				2148	static
				2149	int tailmatch(PyUnicodeObject *self,
				2150	PyUnicodeObject *substring,
				2151	int start,
				2152	int end,
				2153	int direction)
				2154	{
				2155	if (start < 0)
				2156	start += self->length;
				2157	if (start < 0)
				2158	start = 0;
				2159
				2160	if (substring->length == 0)
				2161	return 1;
				2162
				2163	if (end > self->length)
				2164	end = self->length;
				2165	if (end < 0)
				2166	end += self->length;
				2167	if (end < 0)
				2168	end = 0;
				2169
				2170	end -= substring->length;
				2171	if (end < start)
				2172	return 0;
				2173
				2174	if (direction > 0) {
				2175	if (Py_UNICODE_MATCH(self, end, substring))
				2176	return 1;
				2177	} else {
				2178	if (Py_UNICODE_MATCH(self, start, substring))
				2179	return 1;
				2180	}
				2181
				2182	return 0;
				2183	}
				2184
				2185	int PyUnicode_Tailmatch(PyObject *str,
				2186	PyObject *substr,
				2187	int start,
				2188	int end,
				2189	int direction)
				2190	{
				2191	int result;
				2192
				2193	str = PyUnicode_FromObject(str);
				2194	if (str == NULL)
				2195	return -1;
				2196	substr = PyUnicode_FromObject(substr);
				2197	if (substr == NULL) {
				2198	Py_DECREF(substr);
				2199	return -1;
				2200	}
				2201
				2202	result = tailmatch((PyUnicodeObject *)str,
				2203	(PyUnicodeObject *)substr,
				2204	start, end, direction);
				2205	Py_DECREF(str);
				2206	Py_DECREF(substr);
				2207	return result;
				2208	}
				2209
				2210	static
				2211	const Py_UNICODE findchar(const Py_UNICODE s,
				2212	int size,
				2213	Py_UNICODE ch)
				2214	{
				2215	/* like wcschr, but doesn't stop at NULL characters */
				2216
				2217	while (size-- > 0) {
				2218	if (*s == ch)
				2219	return s;
				2220	s++;
				2221	}
				2222
				2223	return NULL;
				2224	}
				2225
				2226	/* Apply fixfct filter to the Unicode object self and return a
				2227	reference to the modified object */
				2228
				2229	static
				2230	PyObject fixup(PyUnicodeObject self,
				2231	int (fixfct)(PyUnicodeObject s))
				2232	{
				2233
				2234	PyUnicodeObject *u;
				2235
				2236	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2237	self->length);
				2238	if (u == NULL)
				2239	return NULL;
				2240	if (!fixfct(u)) {
				2241	/* fixfct should return TRUE if it modified the buffer. If
				2242	FALSE, return a reference to the original buffer instead
				2243	(to save space, not time) */
				2244	Py_INCREF(self);
				2245	Py_DECREF(u);
				2246	return (PyObject*) self;
				2247	}
				2248	return (PyObject*) u;
				2249	}
				2250
				2251	static
				2252	int fixupper(PyUnicodeObject *self)
				2253	{
				2254	int len = self->length;
				2255	Py_UNICODE *s = self->str;
				2256	int status = 0;
				2257
				2258	while (len-- > 0) {
				2259	register Py_UNICODE ch;
				2260
				2261	ch = Py_UNICODE_TOUPPER(*s);
				2262	if (ch != *s) {
				2263	status = 1;
				2264	*s = ch;
				2265	}
				2266	s++;
				2267	}
				2268
				2269	return status;
				2270	}
				2271
				2272	static
				2273	int fixlower(PyUnicodeObject *self)
				2274	{
				2275	int len = self->length;
				2276	Py_UNICODE *s = self->str;
				2277	int status = 0;
				2278
				2279	while (len-- > 0) {
				2280	register Py_UNICODE ch;
				2281
				2282	ch = Py_UNICODE_TOLOWER(*s);
				2283	if (ch != *s) {
				2284	status = 1;
				2285	*s = ch;
				2286	}
				2287	s++;
				2288	}
				2289
				2290	return status;
				2291	}
				2292
				2293	static
				2294	int fixswapcase(PyUnicodeObject *self)
				2295	{
				2296	int len = self->length;
				2297	Py_UNICODE *s = self->str;
				2298	int status = 0;
				2299
				2300	while (len-- > 0) {
				2301	if (Py_UNICODE_ISUPPER(*s)) {
				2302	s = Py_UNICODE_TOLOWER(s);
				2303	status = 1;
				2304	} else if (Py_UNICODE_ISLOWER(*s)) {
				2305	s = Py_UNICODE_TOUPPER(s);
				2306	status = 1;
				2307	}
				2308	s++;
				2309	}
				2310
				2311	return status;
				2312	}
				2313
				2314	static
				2315	int fixcapitalize(PyUnicodeObject *self)
				2316	{
				2317	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2318	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2319	return 1;
				2320	}
				2321	return 0;
				2322	}
				2323
				2324	static
				2325	int fixtitle(PyUnicodeObject *self)
				2326	{
				2327	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2328	register Py_UNICODE *e;
				2329	int previous_is_cased;
				2330
				2331	/* Shortcut for single character strings */
				2332	if (PyUnicode_GET_SIZE(self) == 1) {
				2333	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2334	if (*p != ch) {
				2335	*p = ch;
				2336	return 1;
				2337	}
				2338	else
				2339	return 0;
				2340	}
				2341
				2342	e = p + PyUnicode_GET_SIZE(self);
				2343	previous_is_cased = 0;
				2344	for (; p < e; p++) {
				2345	register const Py_UNICODE ch = *p;
				2346
				2347	if (previous_is_cased)
				2348	*p = Py_UNICODE_TOLOWER(ch);
				2349	else
				2350	*p = Py_UNICODE_TOTITLE(ch);
				2351
				2352	if (Py_UNICODE_ISLOWER(ch) \|\|
				2353	Py_UNICODE_ISUPPER(ch) \|\|
				2354	Py_UNICODE_ISTITLE(ch))
				2355	previous_is_cased = 1;
				2356	else
				2357	previous_is_cased = 0;
				2358	}
				2359	return 1;
				2360	}
				2361
				2362	PyObject PyUnicode_Join(PyObject separator,
				2363	PyObject *seq)
				2364	{
				2365	Py_UNICODE *sep;
				2366	int seplen;
				2367	PyUnicodeObject *res = NULL;
				2368	int reslen = 0;
				2369	Py_UNICODE *p;
				2370	int seqlen = 0;
				2371	int sz = 100;
				2372	int i;
				2373
				2374	seqlen = PySequence_Length(seq);
				2375	if (seqlen < 0 && PyErr_Occurred())
				2376	return NULL;
				2377
				2378	if (separator == NULL) {
				2379	Py_UNICODE blank = ' ';
				2380	sep = &blank;
				2381	seplen = 1;
				2382	}
				2383	else {
				2384	separator = PyUnicode_FromObject(separator);
				2385	if (separator == NULL)
				2386	return NULL;
				2387	sep = PyUnicode_AS_UNICODE(separator);
				2388	seplen = PyUnicode_GET_SIZE(separator);
				2389	}
				2390
				2391	res = _PyUnicode_New(sz);
				2392	if (res == NULL)
				2393	goto onError;
				2394	p = PyUnicode_AS_UNICODE(res);
				2395	reslen = 0;
				2396
				2397	for (i = 0; i < seqlen; i++) {
				2398	int itemlen;
				2399	PyObject *item;
				2400
				2401	item = PySequence_GetItem(seq, i);
				2402	if (item == NULL)
				2403	goto onError;
				2404	if (!PyUnicode_Check(item)) {
				2405	PyObject *v;
				2406	v = PyUnicode_FromObject(item);
				2407	Py_DECREF(item);
				2408	item = v;
				2409	if (item == NULL)
				2410	goto onError;
				2411	}
				2412	itemlen = PyUnicode_GET_SIZE(item);
				2413	while (reslen + itemlen + seplen >= sz) {
				2414	if (_PyUnicode_Resize(res, sz*2))
				2415	goto onError;
				2416	sz *= 2;
				2417	p = PyUnicode_AS_UNICODE(res) + reslen;
				2418	}
				2419	if (i > 0) {
				2420	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2421	p += seplen;
				2422	reslen += seplen;
				2423	}
				2424	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2425	p += itemlen;
				2426	reslen += itemlen;
				2427	Py_DECREF(item);
				2428	}
				2429	if (_PyUnicode_Resize(res, reslen))
				2430	goto onError;
				2431
				2432	Py_XDECREF(separator);
				2433	return (PyObject *)res;
				2434
				2435	onError:
				2436	Py_XDECREF(separator);
				2437	Py_DECREF(res);
				2438	return NULL;
				2439	}
				2440
				2441	static
				2442	PyUnicodeObject pad(PyUnicodeObject self,
				2443	int left,
				2444	int right,
				2445	Py_UNICODE fill)
				2446	{
				2447	PyUnicodeObject *u;
				2448
				2449	if (left < 0)
				2450	left = 0;
				2451	if (right < 0)
				2452	right = 0;
				2453
				2454	if (left == 0 && right == 0) {
				2455	Py_INCREF(self);
				2456	return self;
				2457	}
				2458
				2459	u = _PyUnicode_New(left + self->length + right);
				2460	if (u) {
				2461	if (left)
				2462	Py_UNICODE_FILL(u->str, fill, left);
				2463	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2464	if (right)
				2465	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2466	}
				2467
				2468	return u;
				2469	}
				2470
				2471	#define SPLIT_APPEND(data, left, right) \
				2472	str = PyUnicode_FromUnicode(data + left, right - left); \
				2473	if (!str) \
				2474	goto onError; \
				2475	if (PyList_Append(list, str)) { \
				2476	Py_DECREF(str); \
				2477	goto onError; \
				2478	} \
				2479	else \
				2480	Py_DECREF(str);
				2481
				2482	static
				2483	PyObject split_whitespace(PyUnicodeObject self,
				2484	PyObject *list,
				2485	int maxcount)
				2486	{
				2487	register int i;
				2488	register int j;
				2489	int len = self->length;
				2490	PyObject *str;
				2491
				2492	for (i = j = 0; i < len; ) {
				2493	/* find a token */
				2494	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2495	i++;
				2496	j = i;
				2497	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2498	i++;
				2499	if (j < i) {
				2500	if (maxcount-- <= 0)
				2501	break;
				2502	SPLIT_APPEND(self->str, j, i);
				2503	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2504	i++;
				2505	j = i;
				2506	}
				2507	}
				2508	if (j < len) {
				2509	SPLIT_APPEND(self->str, j, len);
				2510	}
				2511	return list;
				2512
				2513	onError:
				2514	Py_DECREF(list);
				2515	return NULL;
				2516	}
				2517
				2518	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2519	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2520	{
				2521	register int i;
				2522	register int j;
				2523	int len;
				2524	PyObject *list;
				2525	PyObject *str;
				2526	Py_UNICODE *data;
				2527
				2528	string = PyUnicode_FromObject(string);
				2529	if (string == NULL)
				2530	return NULL;
				2531	data = PyUnicode_AS_UNICODE(string);
				2532	len = PyUnicode_GET_SIZE(string);
				2533
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2534	list = PyList_New(0);
				2535	if (!list)
				2536	goto onError;
				2537
				2538	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2539	int eol;
				2540
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2541	/* Find a line and append it */
				2542	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2543	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2544
				2545	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2546	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2547	if (i < len) {
				2548	if (data[i] == '\r' && i + 1 < len &&
				2549	data[i+1] == '\n')
				2550	i += 2;
				2551	else
				2552	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2553	if (keepends)
				2554	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2555	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2556	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2557	j = i;
				2558	}
				2559	if (j < len) {
				2560	SPLIT_APPEND(data, j, len);
				2561	}
				2562
				2563	Py_DECREF(string);
				2564	return list;
				2565
				2566	onError:
				2567	Py_DECREF(list);
				2568	Py_DECREF(string);
				2569	return NULL;
				2570	}
				2571
				2572	static
				2573	PyObject split_char(PyUnicodeObject self,
				2574	PyObject *list,
				2575	Py_UNICODE ch,
				2576	int maxcount)
				2577	{
				2578	register int i;
				2579	register int j;
				2580	int len = self->length;
				2581	PyObject *str;
				2582
				2583	for (i = j = 0; i < len; ) {
				2584	if (self->str[i] == ch) {
				2585	if (maxcount-- <= 0)
				2586	break;
				2587	SPLIT_APPEND(self->str, j, i);
				2588	i = j = i + 1;
				2589	} else
				2590	i++;
				2591	}
				2592	if (j <= len) {
				2593	SPLIT_APPEND(self->str, j, len);
				2594	}
				2595	return list;
				2596
				2597	onError:
				2598	Py_DECREF(list);
				2599	return NULL;
				2600	}
				2601
				2602	static
				2603	PyObject split_substring(PyUnicodeObject self,
				2604	PyObject *list,
				2605	PyUnicodeObject *substring,
				2606	int maxcount)
				2607	{
				2608	register int i;
				2609	register int j;
				2610	int len = self->length;
				2611	int sublen = substring->length;
				2612	PyObject *str;
				2613
				2614	for (i = j = 0; i < len - sublen; ) {
				2615	if (Py_UNICODE_MATCH(self, i, substring)) {
				2616	if (maxcount-- <= 0)
				2617	break;
				2618	SPLIT_APPEND(self->str, j, i);
				2619	i = j = i + sublen;
				2620	} else
				2621	i++;
				2622	}
				2623	if (j <= len) {
				2624	SPLIT_APPEND(self->str, j, len);
				2625	}
				2626	return list;
				2627
				2628	onError:
				2629	Py_DECREF(list);
				2630	return NULL;
				2631	}
				2632
				2633	#undef SPLIT_APPEND
				2634
				2635	static
				2636	PyObject split(PyUnicodeObject self,
				2637	PyUnicodeObject *substring,
				2638	int maxcount)
				2639	{
				2640	PyObject *list;
				2641
				2642	if (maxcount < 0)
				2643	maxcount = INT_MAX;
				2644
				2645	list = PyList_New(0);
				2646	if (!list)
				2647	return NULL;
				2648
				2649	if (substring == NULL)
				2650	return split_whitespace(self,list,maxcount);
				2651
				2652	else if (substring->length == 1)
				2653	return split_char(self,list,substring->str[0],maxcount);
				2654
				2655	else if (substring->length == 0) {
				2656	Py_DECREF(list);
				2657	PyErr_SetString(PyExc_ValueError, "empty separator");
				2658	return NULL;
				2659	}
				2660	else
				2661	return split_substring(self,list,substring,maxcount);
				2662	}
				2663
				2664	static
				2665	PyObject strip(PyUnicodeObject self,
				2666	int left,
				2667	int right)
				2668	{
				2669	Py_UNICODE *p = self->str;
				2670	int start = 0;
				2671	int end = self->length;
				2672
				2673	if (left)
				2674	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2675	start++;
				2676
				2677	if (right)
				2678	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2679	end--;
				2680
				2681	if (start == 0 && end == self->length) {
				2682	/* couldn't strip anything off, return original string */
				2683	Py_INCREF(self);
				2684	return (PyObject*) self;
				2685	}
				2686
				2687	return (PyObject*) PyUnicode_FromUnicode(
				2688	self->str + start,
				2689	end - start
				2690	);
				2691	}
				2692
				2693	static
				2694	PyObject replace(PyUnicodeObject self,
				2695	PyUnicodeObject *str1,
				2696	PyUnicodeObject *str2,
				2697	int maxcount)
				2698	{
				2699	PyUnicodeObject *u;
				2700
				2701	if (maxcount < 0)
				2702	maxcount = INT_MAX;
				2703
				2704	if (str1->length == 1 && str2->length == 1) {
				2705	int i;
				2706
				2707	/* replace characters */
				2708	if (!findchar(self->str, self->length, str1->str[0])) {
				2709	/* nothing to replace, return original string */
				2710	Py_INCREF(self);
				2711	u = self;
				2712	} else {
				2713	Py_UNICODE u1 = str1->str[0];
				2714	Py_UNICODE u2 = str2->str[0];
				2715
				2716	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2717	self->str,
				2718	self->length
				2719	);
				2720	if (u)
				2721	for (i = 0; i < u->length; i++)
				2722	if (u->str[i] == u1) {
				2723	if (--maxcount < 0)
				2724	break;
				2725	u->str[i] = u2;
				2726	}
				2727	}
				2728
				2729	} else {
				2730	int n, i;
				2731	Py_UNICODE *p;
				2732
				2733	/* replace strings */
				2734	n = count(self, 0, self->length, str1);
				2735	if (n > maxcount)
				2736	n = maxcount;
				2737	if (n == 0) {
				2738	/* nothing to replace, return original string */
				2739	Py_INCREF(self);
				2740	u = self;
				2741	} else {
				2742	u = _PyUnicode_New(
				2743	self->length + n * (str2->length - str1->length));
				2744	if (u) {
				2745	i = 0;
				2746	p = u->str;
				2747	while (i <= self->length - str1->length)
				2748	if (Py_UNICODE_MATCH(self, i, str1)) {
				2749	/* replace string segment */
				2750	Py_UNICODE_COPY(p, str2->str, str2->length);
				2751	p += str2->length;
				2752	i += str1->length;
				2753	if (--n <= 0) {
				2754	/* copy remaining part */
				2755	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2756	break;
				2757	}
				2758	} else
				2759	*p++ = self->str[i++];
				2760	}
				2761	}
				2762	}
				2763
				2764	return (PyObject *) u;
				2765	}
				2766
				2767	/* --- Unicode Object Methods --------------------------------------------- */
				2768
				2769	static char title__doc__[] =
				2770	"S.title() -> unicode\n\
				2771	\n\
				2772	Return a titlecased version of S, i.e. words start with title case\n\
				2773	characters, all remaining cased characters have lower case.";
				2774
				2775	static PyObject*
				2776	unicode_title(PyUnicodeObject self, PyObject args)
				2777	{
				2778	if (!PyArg_NoArgs(args))
				2779	return NULL;
				2780	return fixup(self, fixtitle);
				2781	}
				2782
				2783	static char capitalize__doc__[] =
				2784	"S.capitalize() -> unicode\n\
				2785	\n\
				2786	Return a capitalized version of S, i.e. make the first character\n\
				2787	have upper case.";
				2788
				2789	static PyObject*
				2790	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2791	{
				2792	if (!PyArg_NoArgs(args))
				2793	return NULL;
				2794	return fixup(self, fixcapitalize);
				2795	}
				2796
				2797	#if 0
				2798	static char capwords__doc__[] =
				2799	"S.capwords() -> unicode\n\
				2800	\n\
				2801	Apply .capitalize() to all words in S and return the result with\n\
				2802	normalized whitespace (all whitespace strings are replaced by ' ').";
				2803
				2804	static PyObject*
				2805	unicode_capwords(PyUnicodeObject self, PyObject args)
				2806	{
				2807	PyObject *list;
				2808	PyObject *item;
				2809	int i;
				2810
				2811	if (!PyArg_NoArgs(args))
				2812	return NULL;
				2813
				2814	/* Split into words */
				2815	list = split(self, NULL, -1);
				2816	if (!list)
				2817	return NULL;
				2818
				2819	/* Capitalize each word */
				2820	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2821	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2822	fixcapitalize);
				2823	if (item == NULL)
				2824	goto onError;
				2825	Py_DECREF(PyList_GET_ITEM(list, i));
				2826	PyList_SET_ITEM(list, i, item);
				2827	}
				2828
				2829	/* Join the words to form a new string */
				2830	item = PyUnicode_Join(NULL, list);
				2831
				2832	onError:
				2833	Py_DECREF(list);
				2834	return (PyObject *)item;
				2835	}
				2836	#endif
				2837
				2838	static char center__doc__[] =
				2839	"S.center(width) -> unicode\n\
				2840	\n\
				2841	Return S centered in a Unicode string of length width. Padding is done\n\
				2842	using spaces.";
				2843
				2844	static PyObject *
				2845	unicode_center(PyUnicodeObject self, PyObject args)
				2846	{
				2847	int marg, left;
				2848	int width;
				2849
				2850	if (!PyArg_ParseTuple(args, "i:center", &width))
				2851	return NULL;
				2852
				2853	if (self->length >= width) {
				2854	Py_INCREF(self);
				2855	return (PyObject*) self;
				2856	}
				2857
				2858	marg = width - self->length;
				2859	left = marg / 2 + (marg & width & 1);
				2860
				2861	return (PyObject*) pad(self, left, marg - left, ' ');
				2862	}
				2863
				2864	static int
				2865	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2866	{
				2867	int len1, len2;
				2868	Py_UNICODE *s1 = str1->str;
				2869	Py_UNICODE *s2 = str2->str;
				2870
				2871	len1 = str1->length;
				2872	len2 = str2->length;
				2873
				2874	while (len1 > 0 && len2 > 0) {
				2875	int cmp = (s1++) - (s2++);
				2876	if (cmp)
				2877	/* This should make Christian happy! */
				2878	return (cmp < 0) ? -1 : (cmp != 0);
				2879	len1--, len2--;
				2880	}
				2881
				2882	return (len1 < len2) ? -1 : (len1 != len2);
				2883	}
				2884
				2885	int PyUnicode_Compare(PyObject *left,
				2886	PyObject *right)
				2887	{
				2888	PyUnicodeObject u = NULL, v = NULL;
				2889	int result;
				2890
				2891	/* Coerce the two arguments */
				2892	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2893	if (u == NULL)
				2894	goto onError;
				2895	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2896	if (v == NULL)
				2897	goto onError;
				2898
				2899	/* Shortcut for emtpy or interned objects */
				2900	if (v == u) {
				2901	Py_DECREF(u);
				2902	Py_DECREF(v);
				2903	return 0;
				2904	}
				2905
				2906	result = unicode_compare(u, v);
				2907
				2908	Py_DECREF(u);
				2909	Py_DECREF(v);
				2910	return result;
				2911
				2912	onError:
				2913	Py_XDECREF(u);
				2914	Py_XDECREF(v);
				2915	return -1;
				2916	}
				2917
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2918	int PyUnicode_Contains(PyObject *container,
				2919	PyObject *element)
				2920	{
				2921	PyUnicodeObject u = NULL, v = NULL;
				2922	int result;
				2923	register const Py_UNICODE p, e;
				2924	register Py_UNICODE ch;
				2925
				2926	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2927	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2928	if (v == NULL)
				2929	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2930	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2931	if (u == NULL) {
				2932	Py_DECREF(v);
				2933	goto onError;
				2934	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2935
				2936	/* Check v in u */
				2937	if (PyUnicode_GET_SIZE(v) != 1) {
				2938	PyErr_SetString(PyExc_TypeError,
				2939	"string member test needs char left operand");
				2940	goto onError;
				2941	}
				2942	ch = *PyUnicode_AS_UNICODE(v);
				2943	p = PyUnicode_AS_UNICODE(u);
				2944	e = p + PyUnicode_GET_SIZE(u);
				2945	result = 0;
				2946	while (p < e) {
				2947	if (*p++ == ch) {
				2948	result = 1;
				2949	break;
				2950	}
				2951	}
				2952
				2953	Py_DECREF(u);
				2954	Py_DECREF(v);
				2955	return result;
				2956
				2957	onError:
				2958	Py_XDECREF(u);
				2959	Py_XDECREF(v);
				2960	return -1;
				2961	}
				2962
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2963	/* Concat to string or Unicode object giving a new Unicode object. */
				2964
				2965	PyObject PyUnicode_Concat(PyObject left,
				2966	PyObject *right)
				2967	{
				2968	PyUnicodeObject u = NULL, v = NULL, *w;
				2969
				2970	/* Coerce the two arguments */
				2971	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2972	if (u == NULL)
				2973	goto onError;
				2974	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2975	if (v == NULL)
				2976	goto onError;
				2977
				2978	/* Shortcuts */
				2979	if (v == unicode_empty) {
				2980	Py_DECREF(v);
				2981	return (PyObject *)u;
				2982	}
				2983	if (u == unicode_empty) {
				2984	Py_DECREF(u);
				2985	return (PyObject *)v;
				2986	}
				2987
				2988	/* Concat the two Unicode strings */
				2989	w = _PyUnicode_New(u->length + v->length);
				2990	if (w == NULL)
				2991	goto onError;
				2992	Py_UNICODE_COPY(w->str, u->str, u->length);
				2993	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				2994
				2995	Py_DECREF(u);
				2996	Py_DECREF(v);
				2997	return (PyObject *)w;
				2998
				2999	onError:
				3000	Py_XDECREF(u);
				3001	Py_XDECREF(v);
				3002	return NULL;
				3003	}
				3004
				3005	static char count__doc__[] =
				3006	"S.count(sub[, start[, end]]) -> int\n\
				3007	\n\
				3008	Return the number of occurrences of substring sub in Unicode string\n\
				3009	S[start:end]. Optional arguments start and end are\n\
				3010	interpreted as in slice notation.";
				3011
				3012	static PyObject *
				3013	unicode_count(PyUnicodeObject self, PyObject args)
				3014	{
				3015	PyUnicodeObject *substring;
				3016	int start = 0;
				3017	int end = INT_MAX;
				3018	PyObject *result;
				3019
				3020	if (!PyArg_ParseTuple(args, "O\|ii:count", &substring, &start, &end))
				3021	return NULL;
				3022
				3023	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3024	(PyObject *)substring);
				3025	if (substring == NULL)
				3026	return NULL;
				3027
				3028	if (substring->length == 0) {
				3029	Py_DECREF(substring);
				3030	return PyInt_FromLong((long) 0);
				3031	}
				3032
				3033	if (start < 0)
				3034	start += self->length;
				3035	if (start < 0)
				3036	start = 0;
				3037	if (end > self->length)
				3038	end = self->length;
				3039	if (end < 0)
				3040	end += self->length;
				3041	if (end < 0)
				3042	end = 0;
				3043
				3044	result = PyInt_FromLong((long) count(self, start, end, substring));
				3045
				3046	Py_DECREF(substring);
				3047	return result;
				3048	}
				3049
				3050	static char encode__doc__[] =
				3051	"S.encode([encoding[,errors]]) -> string\n\
				3052	\n\
				3053	Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
				3054	errors may be given to set a different error handling scheme. Default\n\
				3055	is 'strict' meaning that encoding errors raise a ValueError. Other\n\
				3056	possible values are 'ignore' and 'replace'.";
				3057
				3058	static PyObject *
				3059	unicode_encode(PyUnicodeObject self, PyObject args)
				3060	{
				3061	char *encoding = NULL;
				3062	char *errors = NULL;
				3063	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3064	return NULL;
				3065	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3066	}
				3067
				3068	static char expandtabs__doc__[] =
				3069	"S.expandtabs([tabsize]) -> unicode\n\
				3070	\n\
				3071	Return a copy of S where all tab characters are expanded using spaces.\n\
				3072	If tabsize is not given, a tab size of 8 characters is assumed.";
				3073
				3074	static PyObject*
				3075	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3076	{
				3077	Py_UNICODE *e;
				3078	Py_UNICODE *p;
				3079	Py_UNICODE *q;
				3080	int i, j;
				3081	PyUnicodeObject *u;
				3082	int tabsize = 8;
				3083
				3084	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3085	return NULL;
				3086
				3087	/* First pass: determine size of ouput string */
				3088	i = j = 0;
				3089	e = self->str + self->length;
				3090	for (p = self->str; p < e; p++)
				3091	if (*p == '\t') {
				3092	if (tabsize > 0)
				3093	j += tabsize - (j % tabsize);
				3094	}
				3095	else {
				3096	j++;
				3097	if (p == '\n' \|\| p == '\r') {
				3098	i += j;
				3099	j = 0;
				3100	}
				3101	}
				3102
				3103	/* Second pass: create output string and fill it */
				3104	u = _PyUnicode_New(i + j);
				3105	if (!u)
				3106	return NULL;
				3107
				3108	j = 0;
				3109	q = u->str;
				3110
				3111	for (p = self->str; p < e; p++)
				3112	if (*p == '\t') {
				3113	if (tabsize > 0) {
				3114	i = tabsize - (j % tabsize);
				3115	j += i;
				3116	while (i--)
				3117	*q++ = ' ';
				3118	}
				3119	}
				3120	else {
				3121	j++;
				3122	q++ = p;
				3123	if (p == '\n' \|\| p == '\r')
				3124	j = 0;
				3125	}
				3126
				3127	return (PyObject*) u;
				3128	}
				3129
				3130	static char find__doc__[] =
				3131	"S.find(sub [,start [,end]]) -> int\n\
				3132	\n\
				3133	Return the lowest index in S where substring sub is found,\n\
				3134	such that sub is contained within s[start,end]. Optional\n\
				3135	arguments start and end are interpreted as in slice notation.\n\
				3136	\n\
				3137	Return -1 on failure.";
				3138
				3139	static PyObject *
				3140	unicode_find(PyUnicodeObject self, PyObject args)
				3141	{
				3142	PyUnicodeObject *substring;
				3143	int start = 0;
				3144	int end = INT_MAX;
				3145	PyObject *result;
				3146
				3147	if (!PyArg_ParseTuple(args, "O\|ii:find", &substring, &start, &end))
				3148	return NULL;
				3149	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3150	(PyObject *)substring);
				3151	if (substring == NULL)
				3152	return NULL;
				3153
				3154	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3155
				3156	Py_DECREF(substring);
				3157	return result;
				3158	}
				3159
				3160	static PyObject *
				3161	unicode_getitem(PyUnicodeObject *self, int index)
				3162	{
				3163	if (index < 0 \|\| index >= self->length) {
				3164	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3165	return NULL;
				3166	}
				3167
				3168	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3169	}
				3170
				3171	static long
				3172	unicode_hash(PyUnicodeObject *self)
				3173	{
				3174	long hash;
				3175	PyObject *utf8;
				3176
				3177	/* Since Unicode objects compare equal to their UTF-8 string
				3178	counterparts, they should also use the UTF-8 strings as basis
				3179	for their hash value. This is needed to assure that strings and
				3180	Unicode objects behave in the same way as dictionary
				3181	keys. Unfortunately, this costs some performance and also some
				3182	memory if the cached UTF-8 representation is not used later
				3183	on. */
				3184	if (self->hash != -1)
				3185	return self->hash;
				3186	utf8 = utf8_string(self, NULL);
				3187	if (utf8 == NULL)
				3188	return -1;
				3189	hash = PyObject_Hash(utf8);
				3190	if (hash == -1)
				3191	return -1;
				3192	self->hash = hash;
				3193	return hash;
				3194	}
				3195
				3196	static char index__doc__[] =
				3197	"S.index(sub [,start [,end]]) -> int\n\
				3198	\n\
				3199	Like S.find() but raise ValueError when the substring is not found.";
				3200
				3201	static PyObject *
				3202	unicode_index(PyUnicodeObject self, PyObject args)
				3203	{
				3204	int result;
				3205	PyUnicodeObject *substring;
				3206	int start = 0;
				3207	int end = INT_MAX;
				3208
				3209	if (!PyArg_ParseTuple(args, "O\|ii:index", &substring, &start, &end))
				3210	return NULL;
				3211
				3212	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3213	(PyObject *)substring);
				3214	if (substring == NULL)
				3215	return NULL;
				3216
				3217	result = findstring(self, substring, start, end, 1);
				3218
				3219	Py_DECREF(substring);
				3220	if (result < 0) {
				3221	PyErr_SetString(PyExc_ValueError, "substring not found");
				3222	return NULL;
				3223	}
				3224	return PyInt_FromLong(result);
				3225	}
				3226
				3227	static char islower__doc__[] =
				3228	"S.islower() -> int\n\
				3229	\n\
				3230	Return 1 if all cased characters in S are lowercase and there is\n\
				3231	at least one cased character in S, 0 otherwise.";
				3232
				3233	static PyObject*
				3234	unicode_islower(PyUnicodeObject self, PyObject args)
				3235	{
				3236	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3237	register const Py_UNICODE *e;
				3238	int cased;
				3239
				3240	if (!PyArg_NoArgs(args))
				3241	return NULL;
				3242
				3243	/* Shortcut for single character strings */
				3244	if (PyUnicode_GET_SIZE(self) == 1)
				3245	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3246
				3247	e = p + PyUnicode_GET_SIZE(self);
				3248	cased = 0;
				3249	for (; p < e; p++) {
				3250	register const Py_UNICODE ch = *p;
				3251
				3252	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3253	return PyInt_FromLong(0);
				3254	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3255	cased = 1;
				3256	}
				3257	return PyInt_FromLong(cased);
				3258	}
				3259
				3260	static char isupper__doc__[] =
				3261	"S.isupper() -> int\n\
				3262	\n\
				3263	Return 1 if all cased characters in S are uppercase and there is\n\
				3264	at least one cased character in S, 0 otherwise.";
				3265
				3266	static PyObject*
				3267	unicode_isupper(PyUnicodeObject self, PyObject args)
				3268	{
				3269	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3270	register const Py_UNICODE *e;
				3271	int cased;
				3272
				3273	if (!PyArg_NoArgs(args))
				3274	return NULL;
				3275
				3276	/* Shortcut for single character strings */
				3277	if (PyUnicode_GET_SIZE(self) == 1)
				3278	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3279
				3280	e = p + PyUnicode_GET_SIZE(self);
				3281	cased = 0;
				3282	for (; p < e; p++) {
				3283	register const Py_UNICODE ch = *p;
				3284
				3285	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3286	return PyInt_FromLong(0);
				3287	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3288	cased = 1;
				3289	}
				3290	return PyInt_FromLong(cased);
				3291	}
				3292
				3293	static char istitle__doc__[] =
				3294	"S.istitle() -> int\n\
				3295	\n\
				3296	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3297	may only follow uncased characters and lowercase characters only cased\n\
				3298	ones. Return 0 otherwise.";
				3299
				3300	static PyObject*
				3301	unicode_istitle(PyUnicodeObject self, PyObject args)
				3302	{
				3303	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3304	register const Py_UNICODE *e;
				3305	int cased, previous_is_cased;
				3306
				3307	if (!PyArg_NoArgs(args))
				3308	return NULL;
				3309
				3310	/* Shortcut for single character strings */
				3311	if (PyUnicode_GET_SIZE(self) == 1)
				3312	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3313	(Py_UNICODE_ISUPPER(*p) != 0));
				3314
				3315	e = p + PyUnicode_GET_SIZE(self);
				3316	cased = 0;
				3317	previous_is_cased = 0;
				3318	for (; p < e; p++) {
				3319	register const Py_UNICODE ch = *p;
				3320
				3321	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3322	if (previous_is_cased)
				3323	return PyInt_FromLong(0);
				3324	previous_is_cased = 1;
				3325	cased = 1;
				3326	}
				3327	else if (Py_UNICODE_ISLOWER(ch)) {
				3328	if (!previous_is_cased)
				3329	return PyInt_FromLong(0);
				3330	previous_is_cased = 1;
				3331	cased = 1;
				3332	}
				3333	else
				3334	previous_is_cased = 0;
				3335	}
				3336	return PyInt_FromLong(cased);
				3337	}
				3338
				3339	static char isspace__doc__[] =
				3340	"S.isspace() -> int\n\
				3341	\n\
				3342	Return 1 if there are only whitespace characters in S,\n\
				3343	0 otherwise.";
				3344
				3345	static PyObject*
				3346	unicode_isspace(PyUnicodeObject self, PyObject args)
				3347	{
				3348	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3349	register const Py_UNICODE *e;
				3350
				3351	if (!PyArg_NoArgs(args))
				3352	return NULL;
				3353
				3354	/* Shortcut for single character strings */
				3355	if (PyUnicode_GET_SIZE(self) == 1 &&
				3356	Py_UNICODE_ISSPACE(*p))
				3357	return PyInt_FromLong(1);
				3358
				3359	e = p + PyUnicode_GET_SIZE(self);
				3360	for (; p < e; p++) {
				3361	if (!Py_UNICODE_ISSPACE(*p))
				3362	return PyInt_FromLong(0);
				3363	}
				3364	return PyInt_FromLong(1);
				3365	}
				3366
				3367	static char isdecimal__doc__[] =
				3368	"S.isdecimal() -> int\n\
				3369	\n\
				3370	Return 1 if there are only decimal characters in S,\n\
				3371	0 otherwise.";
				3372
				3373	static PyObject*
				3374	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3375	{
				3376	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3377	register const Py_UNICODE *e;
				3378
				3379	if (!PyArg_NoArgs(args))
				3380	return NULL;
				3381
				3382	/* Shortcut for single character strings */
				3383	if (PyUnicode_GET_SIZE(self) == 1 &&
				3384	Py_UNICODE_ISDECIMAL(*p))
				3385	return PyInt_FromLong(1);
				3386
				3387	e = p + PyUnicode_GET_SIZE(self);
				3388	for (; p < e; p++) {
				3389	if (!Py_UNICODE_ISDECIMAL(*p))
				3390	return PyInt_FromLong(0);
				3391	}
				3392	return PyInt_FromLong(1);
				3393	}
				3394
				3395	static char isdigit__doc__[] =
				3396	"S.isdigit() -> int\n\
				3397	\n\
				3398	Return 1 if there are only digit characters in S,\n\
				3399	0 otherwise.";
				3400
				3401	static PyObject*
				3402	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3403	{
				3404	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3405	register const Py_UNICODE *e;
				3406
				3407	if (!PyArg_NoArgs(args))
				3408	return NULL;
				3409
				3410	/* Shortcut for single character strings */
				3411	if (PyUnicode_GET_SIZE(self) == 1 &&
				3412	Py_UNICODE_ISDIGIT(*p))
				3413	return PyInt_FromLong(1);
				3414
				3415	e = p + PyUnicode_GET_SIZE(self);
				3416	for (; p < e; p++) {
				3417	if (!Py_UNICODE_ISDIGIT(*p))
				3418	return PyInt_FromLong(0);
				3419	}
				3420	return PyInt_FromLong(1);
				3421	}
				3422
				3423	static char isnumeric__doc__[] =
				3424	"S.isnumeric() -> int\n\
				3425	\n\
				3426	Return 1 if there are only numeric characters in S,\n\
				3427	0 otherwise.";
				3428
				3429	static PyObject*
				3430	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3431	{
				3432	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3433	register const Py_UNICODE *e;
				3434
				3435	if (!PyArg_NoArgs(args))
				3436	return NULL;
				3437
				3438	/* Shortcut for single character strings */
				3439	if (PyUnicode_GET_SIZE(self) == 1 &&
				3440	Py_UNICODE_ISNUMERIC(*p))
				3441	return PyInt_FromLong(1);
				3442
				3443	e = p + PyUnicode_GET_SIZE(self);
				3444	for (; p < e; p++) {
				3445	if (!Py_UNICODE_ISNUMERIC(*p))
				3446	return PyInt_FromLong(0);
				3447	}
				3448	return PyInt_FromLong(1);
				3449	}
				3450
				3451	static char join__doc__[] =
				3452	"S.join(sequence) -> unicode\n\
				3453	\n\
				3454	Return a string which is the concatenation of the strings in the\n\
				3455	sequence. The separator between elements is S.";
				3456
				3457	static PyObject*
				3458	unicode_join(PyUnicodeObject self, PyObject args)
				3459	{
				3460	PyObject *data;
				3461	if (!PyArg_ParseTuple(args, "O:join", &data))
				3462	return NULL;
				3463
				3464	return PyUnicode_Join((PyObject *)self, data);
				3465	}
				3466
				3467	static int
				3468	unicode_length(PyUnicodeObject *self)
				3469	{
				3470	return self->length;
				3471	}
				3472
				3473	static char ljust__doc__[] =
				3474	"S.ljust(width) -> unicode\n\
				3475	\n\
				3476	Return S left justified in a Unicode string of length width. Padding is\n\
				3477	done using spaces.";
				3478
				3479	static PyObject *
				3480	unicode_ljust(PyUnicodeObject self, PyObject args)
				3481	{
				3482	int width;
				3483	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3484	return NULL;
				3485
				3486	if (self->length >= width) {
				3487	Py_INCREF(self);
				3488	return (PyObject*) self;
				3489	}
				3490
				3491	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3492	}
				3493
				3494	static char lower__doc__[] =
				3495	"S.lower() -> unicode\n\
				3496	\n\
				3497	Return a copy of the string S converted to lowercase.";
				3498
				3499	static PyObject*
				3500	unicode_lower(PyUnicodeObject self, PyObject args)
				3501	{
				3502	if (!PyArg_NoArgs(args))
				3503	return NULL;
				3504	return fixup(self, fixlower);
				3505	}
				3506
				3507	static char lstrip__doc__[] =
				3508	"S.lstrip() -> unicode\n\
				3509	\n\
				3510	Return a copy of the string S with leading whitespace removed.";
				3511
				3512	static PyObject *
				3513	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3514	{
				3515	if (!PyArg_NoArgs(args))
				3516	return NULL;
				3517	return strip(self, 1, 0);
				3518	}
				3519
				3520	static PyObject*
				3521	unicode_repeat(PyUnicodeObject *str, int len)
				3522	{
				3523	PyUnicodeObject *u;
				3524	Py_UNICODE *p;
				3525
				3526	if (len < 0)
				3527	len = 0;
				3528
				3529	if (len == 1) {
				3530	/* no repeat, return original string */
				3531	Py_INCREF(str);
				3532	return (PyObject*) str;
				3533	}
				3534
				3535	u = _PyUnicode_New(len * str->length);
				3536	if (!u)
				3537	return NULL;
				3538
				3539	p = u->str;
				3540
				3541	while (len-- > 0) {
				3542	Py_UNICODE_COPY(p, str->str, str->length);
				3543	p += str->length;
				3544	}
				3545
				3546	return (PyObject*) u;
				3547	}
				3548
				3549	PyObject PyUnicode_Replace(PyObject obj,
				3550	PyObject *subobj,
				3551	PyObject *replobj,
				3552	int maxcount)
				3553	{
				3554	PyObject *self;
				3555	PyObject *str1;
				3556	PyObject *str2;
				3557	PyObject *result;
				3558
				3559	self = PyUnicode_FromObject(obj);
				3560	if (self == NULL)
				3561	return NULL;
				3562	str1 = PyUnicode_FromObject(subobj);
				3563	if (str1 == NULL) {
				3564	Py_DECREF(self);
				3565	return NULL;
				3566	}
				3567	str2 = PyUnicode_FromObject(replobj);
				3568	if (str2 == NULL) {
				3569	Py_DECREF(self);
				3570	Py_DECREF(str1);
				3571	return NULL;
				3572	}
				3573	result = replace((PyUnicodeObject *)self,
				3574	(PyUnicodeObject *)str1,
				3575	(PyUnicodeObject *)str2,
				3576	maxcount);
				3577	Py_DECREF(self);
				3578	Py_DECREF(str1);
				3579	Py_DECREF(str2);
				3580	return result;
				3581	}
				3582
				3583	static char replace__doc__[] =
				3584	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3585	\n\
				3586	Return a copy of S with all occurrences of substring\n\
				3587	old replaced by new. If the optional argument maxsplit is\n\
				3588	given, only the first maxsplit occurrences are replaced.";
				3589
				3590	static PyObject*
				3591	unicode_replace(PyUnicodeObject self, PyObject args)
				3592	{
				3593	PyUnicodeObject *str1;
				3594	PyUnicodeObject *str2;
				3595	int maxcount = -1;
				3596	PyObject *result;
				3597
				3598	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3599	return NULL;
				3600	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3601	if (str1 == NULL)
				3602	return NULL;
				3603	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3604	if (str2 == NULL)
				3605	return NULL;
				3606
				3607	result = replace(self, str1, str2, maxcount);
				3608
				3609	Py_DECREF(str1);
				3610	Py_DECREF(str2);
				3611	return result;
				3612	}
				3613
				3614	static
				3615	PyObject unicode_repr(PyObject unicode)
				3616	{
				3617	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3618	PyUnicode_GET_SIZE(unicode),
				3619	1);
				3620	}
				3621
				3622	static char rfind__doc__[] =
				3623	"S.rfind(sub [,start [,end]]) -> int\n\
				3624	\n\
				3625	Return the highest index in S where substring sub is found,\n\
				3626	such that sub is contained within s[start,end]. Optional\n\
				3627	arguments start and end are interpreted as in slice notation.\n\
				3628	\n\
				3629	Return -1 on failure.";
				3630
				3631	static PyObject *
				3632	unicode_rfind(PyUnicodeObject self, PyObject args)
				3633	{
				3634	PyUnicodeObject *substring;
				3635	int start = 0;
				3636	int end = INT_MAX;
				3637	PyObject *result;
				3638
				3639	if (!PyArg_ParseTuple(args, "O\|ii:rfind", &substring, &start, &end))
				3640	return NULL;
				3641	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3642	(PyObject *)substring);
				3643	if (substring == NULL)
				3644	return NULL;
				3645
				3646	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3647
				3648	Py_DECREF(substring);
				3649	return result;
				3650	}
				3651
				3652	static char rindex__doc__[] =
				3653	"S.rindex(sub [,start [,end]]) -> int\n\
				3654	\n\
				3655	Like S.rfind() but raise ValueError when the substring is not found.";
				3656
				3657	static PyObject *
				3658	unicode_rindex(PyUnicodeObject self, PyObject args)
				3659	{
				3660	int result;
				3661	PyUnicodeObject *substring;
				3662	int start = 0;
				3663	int end = INT_MAX;
				3664
				3665	if (!PyArg_ParseTuple(args, "O\|ii:rindex", &substring, &start, &end))
				3666	return NULL;
				3667	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3668	(PyObject *)substring);
				3669	if (substring == NULL)
				3670	return NULL;
				3671
				3672	result = findstring(self, substring, start, end, -1);
				3673
				3674	Py_DECREF(substring);
				3675	if (result < 0) {
				3676	PyErr_SetString(PyExc_ValueError, "substring not found");
				3677	return NULL;
				3678	}
				3679	return PyInt_FromLong(result);
				3680	}
				3681
				3682	static char rjust__doc__[] =
				3683	"S.rjust(width) -> unicode\n\
				3684	\n\
				3685	Return S right justified in a Unicode string of length width. Padding is\n\
				3686	done using spaces.";
				3687
				3688	static PyObject *
				3689	unicode_rjust(PyUnicodeObject self, PyObject args)
				3690	{
				3691	int width;
				3692	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3693	return NULL;
				3694
				3695	if (self->length >= width) {
				3696	Py_INCREF(self);
				3697	return (PyObject*) self;
				3698	}
				3699
				3700	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3701	}
				3702
				3703	static char rstrip__doc__[] =
				3704	"S.rstrip() -> unicode\n\
				3705	\n\
				3706	Return a copy of the string S with trailing whitespace removed.";
				3707
				3708	static PyObject *
				3709	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3710	{
				3711	if (!PyArg_NoArgs(args))
				3712	return NULL;
				3713	return strip(self, 0, 1);
				3714	}
				3715
				3716	static PyObject*
				3717	unicode_slice(PyUnicodeObject *self, int start, int end)
				3718	{
				3719	/* standard clamping */
				3720	if (start < 0)
				3721	start = 0;
				3722	if (end < 0)
				3723	end = 0;
				3724	if (end > self->length)
				3725	end = self->length;
				3726	if (start == 0 && end == self->length) {
				3727	/* full slice, return original string */
				3728	Py_INCREF(self);
				3729	return (PyObject*) self;
				3730	}
				3731	if (start > end)
				3732	start = end;
				3733	/* copy slice */
				3734	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3735	end - start);
				3736	}
				3737
				3738	PyObject PyUnicode_Split(PyObject s,
				3739	PyObject *sep,
				3740	int maxsplit)
				3741	{
				3742	PyObject *result;
				3743
				3744	s = PyUnicode_FromObject(s);
				3745	if (s == NULL)
				3746	return NULL;
				3747	if (sep != NULL) {
				3748	sep = PyUnicode_FromObject(sep);
				3749	if (sep == NULL) {
				3750	Py_DECREF(s);
				3751	return NULL;
				3752	}
				3753	}
				3754
				3755	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3756
				3757	Py_DECREF(s);
				3758	Py_XDECREF(sep);
				3759	return result;
				3760	}
				3761
				3762	static char split__doc__[] =
				3763	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3764	\n\
				3765	Return a list of the words in S, using sep as the\n\
				3766	delimiter string. If maxsplit is given, at most maxsplit\n\
				3767	splits are done. If sep is not specified, any whitespace string\n\
				3768	is a separator.";
				3769
				3770	static PyObject*
				3771	unicode_split(PyUnicodeObject self, PyObject args)
				3772	{
				3773	PyObject *substring = Py_None;
				3774	int maxcount = -1;
				3775
				3776	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3777	return NULL;
				3778
				3779	if (substring == Py_None)
				3780	return split(self, NULL, maxcount);
				3781	else if (PyUnicode_Check(substring))
				3782	return split(self, (PyUnicodeObject *)substring, maxcount);
				3783	else
				3784	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3785	}
				3786
				3787	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3788	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3789	\n\
				3790	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3791	Line breaks are not included in the resulting list unless keepends\n\
				3792	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3793
				3794	static PyObject*
				3795	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3796	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3797	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3798
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3799	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3800	return NULL;
				3801
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3802	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3803	}
				3804
				3805	static
				3806	PyObject unicode_str(PyUnicodeObject self)
				3807	{
				3808	return PyUnicode_AsUTF8String((PyObject *)self);
				3809	}
				3810
				3811	static char strip__doc__[] =
				3812	"S.strip() -> unicode\n\
				3813	\n\
				3814	Return a copy of S with leading and trailing whitespace removed.";
				3815
				3816	static PyObject *
				3817	unicode_strip(PyUnicodeObject self, PyObject args)
				3818	{
				3819	if (!PyArg_NoArgs(args))
				3820	return NULL;
				3821	return strip(self, 1, 1);
				3822	}
				3823
				3824	static char swapcase__doc__[] =
				3825	"S.swapcase() -> unicode\n\
				3826	\n\
				3827	Return a copy of S with uppercase characters converted to lowercase\n\
				3828	and vice versa.";
				3829
				3830	static PyObject*
				3831	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3832	{
				3833	if (!PyArg_NoArgs(args))
				3834	return NULL;
				3835	return fixup(self, fixswapcase);
				3836	}
				3837
				3838	static char translate__doc__[] =
				3839	"S.translate(table) -> unicode\n\
				3840	\n\
				3841	Return a copy of the string S, where all characters have been mapped\n\
				3842	through the given translation table, which must be a mapping of\n\
				3843	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3844	are left untouched. Characters mapped to None are deleted.";
				3845
				3846	static PyObject*
				3847	unicode_translate(PyUnicodeObject self, PyObject args)
				3848	{
				3849	PyObject *table;
				3850
				3851	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3852	return NULL;
				3853	return PyUnicode_TranslateCharmap(self->str,
				3854	self->length,
				3855	table,
				3856	"ignore");
				3857	}
				3858
				3859	static char upper__doc__[] =
				3860	"S.upper() -> unicode\n\
				3861	\n\
				3862	Return a copy of S converted to uppercase.";
				3863
				3864	static PyObject*
				3865	unicode_upper(PyUnicodeObject self, PyObject args)
				3866	{
				3867	if (!PyArg_NoArgs(args))
				3868	return NULL;
				3869	return fixup(self, fixupper);
				3870	}
				3871
				#if 0
				static char zfill__doc__[] =
				    "S.zfill(width) -> unicode\n"
				    "\n"
				    "Pad a numeric string x with zeros on the left, to fill a field\n"
				    "of the specified width. The string x is never truncated.";

				/* S.zfill(width) -- currently compiled out.

				   If S is already at least `width` long, S itself is returned (new
				   reference).  Otherwise S is left-padded with '0' via pad(); if the
				   original string started with '+' or '-', the sign is moved in front
				   of the zeros.  Returns NULL if pad() fails. */
				static PyObject *
				unicode_zfill(PyUnicodeObject *self, PyObject *args)
				{
				    int fill;
				    PyUnicodeObject *u;

				    int width;
				    if (!PyArg_ParseTuple(args, "i:zfill", &width))
				        return NULL;

				    if (self->length >= width) {
				        Py_INCREF(self);
				        return (PyObject*) self;
				    }

				    fill = width - self->length;

				    u = pad(self, fill, 0, '0');
				    if (u == NULL)
				        return NULL;    /* pad() failed; do not touch u->str */

				    /* u->str[fill] is the first character of the original string */
				    if (u->str[fill] == '+' || u->str[fill] == '-') {
				        /* move sign to beginning of string */
				        u->str[0] = u->str[fill];
				        u->str[fill] = '0';
				    }

				    return (PyObject*) u;
				}
				#endif
				3907
				#if 0
				/* Debugging aid (compiled out): return the current value of
				   unicode_freelist_size as a Python int.  Takes no arguments. */
				static PyObject*
				unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
				{
				    if (!PyArg_NoArgs(args))
				        return NULL;
				    return PyInt_FromLong(unicode_freelist_size);
				}
				#endif
				3917
				3918	static char startswith__doc__[] =
				3919	"S.startswith(prefix[, start[, end]]) -> int\n\
				3920	\n\
				3921	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3922	optional start, test S beginning at that position. With optional end, stop\n\
				3923	comparing S at that position.";
				3924
				3925	static PyObject *
				3926	unicode_startswith(PyUnicodeObject *self,
				3927	PyObject *args)
				3928	{
				3929	PyUnicodeObject *substring;
				3930	int start = 0;
				3931	int end = INT_MAX;
				3932	PyObject *result;
				3933
				3934	if (!PyArg_ParseTuple(args, "O\|ii:startswith", &substring, &start, &end))
				3935	return NULL;
				3936	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3937	(PyObject *)substring);
				3938	if (substring == NULL)
				3939	return NULL;
				3940
				3941	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				3942
				3943	Py_DECREF(substring);
				3944	return result;
				3945	}
				3946
				3947
				3948	static char endswith__doc__[] =
				3949	"S.endswith(suffix[, start[, end]]) -> int\n\
				3950	\n\
				3951	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				3952	optional start, test S beginning at that position. With optional end, stop\n\
				3953	comparing S at that position.";
				3954
				3955	static PyObject *
				3956	unicode_endswith(PyUnicodeObject *self,
				3957	PyObject *args)
				3958	{
				3959	PyUnicodeObject *substring;
				3960	int start = 0;
				3961	int end = INT_MAX;
				3962	PyObject *result;
				3963
				3964	if (!PyArg_ParseTuple(args, "O\|ii:endswith", &substring, &start, &end))
				3965	return NULL;
				3966	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3967	(PyObject *)substring);
				3968	if (substring == NULL)
				3969	return NULL;
				3970
				3971	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				3972
				3973	Py_DECREF(substring);
				3974	return result;
				3975	}
				3976
				3977
				3978	static PyMethodDef unicode_methods[] = {
				3979
				3980	/* Order is according to common usage: often used methods should
				3981	appear first, since lookup is done sequentially. */
				3982
				3983	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				3984	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				3985	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				3986	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				3987	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				3988	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				3989	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				3990	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				3991	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				3992	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				3993	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				3994	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				3995	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				3996	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				3997	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				3998	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				3999	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4000	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4001	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4002	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4003	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4004	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4005	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4006	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4007	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4008	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4009	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4010	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4011	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4012	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4013	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4014	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4015	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				4016	#if 0
				4017	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4018	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4019	#endif
				4020
				4021	#if 0
				4022	/* This one is just used for debugging the implementation. */
				4023	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4024	#endif
				4025
				4026	{NULL, NULL}
				4027	};
				4028
				4029	static PyObject *
				4030	unicode_getattr(PyUnicodeObject self, char name)
				4031	{
				4032	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4033	}
				4034
				4035	static PySequenceMethods unicode_as_sequence = {
				4036	(inquiry) unicode_length, /* sq_length */
				4037	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4038	(intargfunc) unicode_repeat, /* sq_repeat */
				4039	(intargfunc) unicode_getitem, /* sq_item */
				4040	(intintargfunc) unicode_slice, /* sq_slice */
				4041	0, /* sq_ass_item */
				4042	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4043	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4044	};
				4045
				4046	static int
				4047	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4048	int index,
				4049	const void **ptr)
				4050	{
				4051	if (index != 0) {
				4052	PyErr_SetString(PyExc_SystemError,
				4053	"accessing non-existent unicode segment");
				4054	return -1;
				4055	}
				4056	ptr = (void ) self->str;
				4057	return PyUnicode_GET_DATA_SIZE(self);
				4058	}
				4059
				4060	static int
				4061	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4062	const void **ptr)
				4063	{
				4064	PyErr_SetString(PyExc_TypeError,
				4065	"cannot use unicode as modifyable buffer");
				4066	return -1;
				4067	}
				4068
				4069	static int
				4070	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4071	int *lenp)
				4072	{
				4073	if (lenp)
				4074	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4075	return 1;
				4076	}
				4077
				4078	static int
				4079	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4080	int index,
				4081	const void **ptr)
				4082	{
				4083	PyObject *str;
				4084
				4085	if (index != 0) {
				4086	PyErr_SetString(PyExc_SystemError,
				4087	"accessing non-existent unicode segment");
				4088	return -1;
				4089	}
				4090	str = utf8_string(self, NULL);
				4091	if (str == NULL)
				4092	return -1;
				4093	ptr = (void ) PyString_AS_STRING(str);
				4094	return PyString_GET_SIZE(str);
				4095	}
				4096
				4097	/* Helpers for PyUnicode_Format() */
				4098
				4099	static PyObject *
				4100	getnextarg(args, arglen, p_argidx)
				4101	PyObject *args;
				4102	int arglen;
				4103	int *p_argidx;
				4104	{
				4105	int argidx = *p_argidx;
				4106	if (argidx < arglen) {
				4107	(*p_argidx)++;
				4108	if (arglen < 0)
				4109	return args;
				4110	else
				4111	return PyTuple_GetItem(args, argidx);
				4112	}
				4113	PyErr_SetString(PyExc_TypeError,
				4114	"not enough arguments for format string");
				4115	return NULL;
				4116	}
				4117
				4118	#define F_LJUST (1<<0)
				4119	#define F_SIGN (1<<1)
				4120	#define F_BLANK (1<<2)
				4121	#define F_ALT (1<<3)
				4122	#define F_ZERO (1<<4)
				4123
				4124	static
				4125	#ifdef HAVE_STDARG_PROTOTYPES
				4126	int usprintf(register Py_UNICODE buffer, char format, ...)
				4127	#else
				4128	int usprintf(va_alist) va_dcl
				4129	#endif
				4130	{
				4131	register int i;
				4132	int len;
				4133	va_list va;
				4134	char *charbuffer;
				4135	#ifdef HAVE_STDARG_PROTOTYPES
				4136	va_start(va, format);
				4137	#else
				4138	Py_UNICODE *args;
				4139	char *format;
				4140
				4141	va_start(va);
				4142	buffer = va_arg(va, Py_UNICODE *);
				4143	format = va_arg(va, char *);
				4144	#endif
				4145
				4146	/* First, format the string as char array, then expand to Py_UNICODE
				4147	array. */
				4148	charbuffer = (char *)buffer;
				4149	len = vsprintf(charbuffer, format, va);
				4150	for (i = len - 1; i >= 0; i--)
				4151	buffer[i] = (Py_UNICODE) charbuffer[i];
				4152
				4153	va_end(va);
				4154	return len;
				4155	}
				4156
				4157	static int
				4158	formatfloat(Py_UNICODE *buf,
				4159	int flags,
				4160	int prec,
				4161	int type,
				4162	PyObject *v)
				4163	{
				4164	char fmt[20];
				4165	double x;
				4166
				4167	x = PyFloat_AsDouble(v);
				4168	if (x == -1.0 && PyErr_Occurred())
				4169	return -1;
				4170	if (prec < 0)
				4171	prec = 6;
				4172	if (prec > 50)
				4173	prec = 50; /* Arbitrary limitation */
				4174	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4175	type = 'g';
				4176	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4177	return usprintf(buf, fmt, x);
				4178	}
				4179
				4180	static int
				4181	formatint(Py_UNICODE *buf,
				4182	int flags,
				4183	int prec,
				4184	int type,
				4185	PyObject *v)
				4186	{
				4187	char fmt[20];
				4188	long x;
				4189
				4190	x = PyInt_AsLong(v);
				4191	if (x == -1 && PyErr_Occurred())
				4192	return -1;
				4193	if (prec < 0)
				4194	prec = 1;
				4195	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4196	return usprintf(buf, fmt, x);
				4197	}
				4198
				4199	static int
				4200	formatchar(Py_UNICODE *buf,
				4201	PyObject *v)
				4202	{
				4203	if (PyUnicode_Check(v))
				4204	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4205
				4206	else if (PyString_Check(v))
				4207	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4208
				4209	else {
				4210	/* Integer input truncated to a character */
				4211	long x;
				4212	x = PyInt_AsLong(v);
				4213	if (x == -1 && PyErr_Occurred())
				4214	return -1;
				4215	buf[0] = (char) x;
				4216	}
				4217	buf[1] = '\0';
				4218	return 1;
				4219	}
				4220
				4221	PyObject PyUnicode_Format(PyObject format,
				4222	PyObject *args)
				4223	{
				4224	Py_UNICODE fmt, res;
				4225	int fmtcnt, rescnt, reslen, arglen, argidx;
				4226	int args_owned = 0;
				4227	PyUnicodeObject *result = NULL;
				4228	PyObject *dict = NULL;
				4229	PyObject *uformat;
				4230
				4231	if (format == NULL \|\| args == NULL) {
				4232	PyErr_BadInternalCall();
				4233	return NULL;
				4234	}
				4235	uformat = PyUnicode_FromObject(format);
				4236	fmt = PyUnicode_AS_UNICODE(uformat);
				4237	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4238
				4239	reslen = rescnt = fmtcnt + 100;
				4240	result = _PyUnicode_New(reslen);
				4241	if (result == NULL)
				4242	goto onError;
				4243	res = PyUnicode_AS_UNICODE(result);
				4244
				4245	if (PyTuple_Check(args)) {
				4246	arglen = PyTuple_Size(args);
				4247	argidx = 0;
				4248	}
				4249	else {
				4250	arglen = -1;
				4251	argidx = -2;
				4252	}
				4253	if (args->ob_type->tp_as_mapping)
				4254	dict = args;
				4255
				4256	while (--fmtcnt >= 0) {
				4257	if (*fmt != '%') {
				4258	if (--rescnt < 0) {
				4259	rescnt = fmtcnt + 100;
				4260	reslen += rescnt;
				4261	if (_PyUnicode_Resize(result, reslen) < 0)
				4262	return NULL;
				4263	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4264	--rescnt;
				4265	}
				4266	res++ = fmt++;
				4267	}
				4268	else {
				4269	/* Got a format specifier */
				4270	int flags = 0;
				4271	int width = -1;
				4272	int prec = -1;
				4273	int size = 0;
				4274	Py_UNICODE c = '\0';
				4275	Py_UNICODE fill;
				4276	PyObject *v = NULL;
				4277	PyObject *temp = NULL;
				4278	Py_UNICODE *buf;
				4279	Py_UNICODE sign;
				4280	int len;
				4281	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4282
				4283	fmt++;
				4284	if (*fmt == '(') {
				4285	Py_UNICODE *keystart;
				4286	int keylen;
				4287	PyObject *key;
				4288	int pcount = 1;
				4289
				4290	if (dict == NULL) {
				4291	PyErr_SetString(PyExc_TypeError,
				4292	"format requires a mapping");
				4293	goto onError;
				4294	}
				4295	++fmt;
				4296	--fmtcnt;
				4297	keystart = fmt;
				4298	/* Skip over balanced parentheses */
				4299	while (pcount > 0 && --fmtcnt >= 0) {
				4300	if (*fmt == ')')
				4301	--pcount;
				4302	else if (*fmt == '(')
				4303	++pcount;
				4304	fmt++;
				4305	}
				4306	keylen = fmt - keystart - 1;
				4307	if (fmtcnt < 0 \|\| pcount > 0) {
				4308	PyErr_SetString(PyExc_ValueError,
				4309	"incomplete format key");
				4310	goto onError;
				4311	}
				4312	/* keys are converted to strings (using UTF-8) and
				4313	then looked up since Python uses strings to hold
				4314	variables names etc. in its namespaces and we
				4315	wouldn't want to break common idioms. The
				4316	alternative would be using Unicode objects for the
				4317	lookup but u"abc" and "abc" have different hash
				4318	values (on purpose). */
				4319	key = PyUnicode_EncodeUTF8(keystart,
				4320	keylen,
				4321	NULL);
				4322	if (key == NULL)
				4323	goto onError;
				4324	if (args_owned) {
				4325	Py_DECREF(args);
				4326	args_owned = 0;
				4327	}
				4328	args = PyObject_GetItem(dict, key);
				4329	Py_DECREF(key);
				4330	if (args == NULL) {
				4331	goto onError;
				4332	}
				4333	args_owned = 1;
				4334	arglen = -1;
				4335	argidx = -2;
				4336	}
				4337	while (--fmtcnt >= 0) {
				4338	switch (c = *fmt++) {
				4339	case '-': flags \|= F_LJUST; continue;
				4340	case '+': flags \|= F_SIGN; continue;
				4341	case ' ': flags \|= F_BLANK; continue;
				4342	case '#': flags \|= F_ALT; continue;
				4343	case '0': flags \|= F_ZERO; continue;
				4344	}
				4345	break;
				4346	}
				4347	if (c == '*') {
				4348	v = getnextarg(args, arglen, &argidx);
				4349	if (v == NULL)
				4350	goto onError;
				4351	if (!PyInt_Check(v)) {
				4352	PyErr_SetString(PyExc_TypeError,
				4353	"* wants int");
				4354	goto onError;
				4355	}
				4356	width = PyInt_AsLong(v);
				4357	if (width < 0) {
				4358	flags \|= F_LJUST;
				4359	width = -width;
				4360	}
				4361	if (--fmtcnt >= 0)
				4362	c = *fmt++;
				4363	}
				4364	else if (c >= '0' && c <= '9') {
				4365	width = c - '0';
				4366	while (--fmtcnt >= 0) {
				4367	c = *fmt++;
				4368	if (c < '0' \|\| c > '9')
				4369	break;
				4370	if ((width*10) / 10 != width) {
				4371	PyErr_SetString(PyExc_ValueError,
				4372	"width too big");
				4373	goto onError;
				4374	}
				4375	width = width*10 + (c - '0');
				4376	}
				4377	}
				4378	if (c == '.') {
				4379	prec = 0;
				4380	if (--fmtcnt >= 0)
				4381	c = *fmt++;
				4382	if (c == '*') {
				4383	v = getnextarg(args, arglen, &argidx);
				4384	if (v == NULL)
				4385	goto onError;
				4386	if (!PyInt_Check(v)) {
				4387	PyErr_SetString(PyExc_TypeError,
				4388	"* wants int");
				4389	goto onError;
				4390	}
				4391	prec = PyInt_AsLong(v);
				4392	if (prec < 0)
				4393	prec = 0;
				4394	if (--fmtcnt >= 0)
				4395	c = *fmt++;
				4396	}
				4397	else if (c >= '0' && c <= '9') {
				4398	prec = c - '0';
				4399	while (--fmtcnt >= 0) {
				4400	c = Py_CHARMASK(*fmt++);
				4401	if (c < '0' \|\| c > '9')
				4402	break;
				4403	if ((prec*10) / 10 != prec) {
				4404	PyErr_SetString(PyExc_ValueError,
				4405	"prec too big");
				4406	goto onError;
				4407	}
				4408	prec = prec*10 + (c - '0');
				4409	}
				4410	}
				4411	} /* prec */
				4412	if (fmtcnt >= 0) {
				4413	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4414	size = c;
				4415	if (--fmtcnt >= 0)
				4416	c = *fmt++;
				4417	}
				4418	}
				4419	if (fmtcnt < 0) {
				4420	PyErr_SetString(PyExc_ValueError,
				4421	"incomplete format");
				4422	goto onError;
				4423	}
				4424	if (c != '%') {
				4425	v = getnextarg(args, arglen, &argidx);
				4426	if (v == NULL)
				4427	goto onError;
				4428	}
				4429	sign = 0;
				4430	fill = ' ';
				4431	switch (c) {
				4432
				4433	case '%':
				4434	buf = tmpbuf;
				4435	buf[0] = '%';
				4436	len = 1;
				4437	break;
				4438
				4439	case 's':
				4440	case 'r':
				4441	if (PyUnicode_Check(v) && c == 's') {
				4442	temp = v;
				4443	Py_INCREF(temp);
				4444	}
				4445	else {
				4446	PyObject *unicode;
				4447	if (c == 's')
				4448	temp = PyObject_Str(v);
				4449	else
				4450	temp = PyObject_Repr(v);
				4451	if (temp == NULL)
				4452	goto onError;
				4453	if (!PyString_Check(temp)) {
				4454	/* XXX Note: this should never happen, since
				4455	PyObject_Repr() and PyObject_Str() assure
				4456	this */
				4457	Py_DECREF(temp);
				4458	PyErr_SetString(PyExc_TypeError,
				4459	"%s argument has non-string str()");
				4460	goto onError;
				4461	}
				4462	unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
				4463	PyString_GET_SIZE(temp),
				4464	"strict");
				4465	Py_DECREF(temp);
				4466	temp = unicode;
				4467	if (temp == NULL)
				4468	goto onError;
				4469	}
				4470	buf = PyUnicode_AS_UNICODE(temp);
				4471	len = PyUnicode_GET_SIZE(temp);
				4472	if (prec >= 0 && len > prec)
				4473	len = prec;
				4474	break;
				4475
				4476	case 'i':
				4477	case 'd':
				4478	case 'u':
				4479	case 'o':
				4480	case 'x':
				4481	case 'X':
				4482	if (c == 'i')
				4483	c = 'd';
				4484	buf = tmpbuf;
				4485	len = formatint(buf, flags, prec, c, v);
				4486	if (len < 0)
				4487	goto onError;
				4488	sign = (c == 'd');
				4489	if (flags & F_ZERO) {
				4490	fill = '0';
				4491	if ((flags&F_ALT) &&
				4492	(c == 'x' \|\| c == 'X') &&
				4493	buf[0] == '0' && buf[1] == c) {
				4494	res++ = buf++;
				4495	res++ = buf++;
				4496	rescnt -= 2;
				4497	len -= 2;
				4498	width -= 2;
				4499	if (width < 0)
				4500	width = 0;
				4501	}
				4502	}
				4503	break;
				4504
				4505	case 'e':
				4506	case 'E':
				4507	case 'f':
				4508	case 'g':
				4509	case 'G':
				4510	buf = tmpbuf;
				4511	len = formatfloat(buf, flags, prec, c, v);
				4512	if (len < 0)
				4513	goto onError;
				4514	sign = 1;
				4515	if (flags&F_ZERO)
				4516	fill = '0';
				4517	break;
				4518
				4519	case 'c':
				4520	buf = tmpbuf;
				4521	len = formatchar(buf, v);
				4522	if (len < 0)
				4523	goto onError;
				4524	break;
				4525
				4526	default:
				4527	PyErr_Format(PyExc_ValueError,
				4528	"unsupported format character '%c' (0x%x)",
				4529	c, c);
				4530	goto onError;
				4531	}
				4532	if (sign) {
				4533	if (buf == '-' \|\| buf == '+') {
				4534	sign = *buf++;
				4535	len--;
				4536	}
				4537	else if (flags & F_SIGN)
				4538	sign = '+';
				4539	else if (flags & F_BLANK)
				4540	sign = ' ';
				4541	else
				4542	sign = 0;
				4543	}
				4544	if (width < len)
				4545	width = len;
				4546	if (rescnt < width + (sign != 0)) {
				4547	reslen -= rescnt;
				4548	rescnt = width + fmtcnt + 100;
				4549	reslen += rescnt;
				4550	if (_PyUnicode_Resize(result, reslen) < 0)
				4551	return NULL;
				4552	res = PyUnicode_AS_UNICODE(result)
				4553	+ reslen - rescnt;
				4554	}
				4555	if (sign) {
				4556	if (fill != ' ')
				4557	*res++ = sign;
				4558	rescnt--;
				4559	if (width > len)
				4560	width--;
				4561	}
				4562	if (width > len && !(flags & F_LJUST)) {
				4563	do {
				4564	--rescnt;
				4565	*res++ = fill;
				4566	} while (--width > len);
				4567	}
				4568	if (sign && fill == ' ')
				4569	*res++ = sign;
				4570	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4571	res += len;
				4572	rescnt -= len;
				4573	while (--width >= len) {
				4574	--rescnt;
				4575	*res++ = ' ';
				4576	}
				4577	if (dict && (argidx < arglen) && c != '%') {
				4578	PyErr_SetString(PyExc_TypeError,
				4579	"not all arguments converted");
				4580	goto onError;
				4581	}
				4582	Py_XDECREF(temp);
				4583	} /* '%' */
				4584	} /* until end */
				4585	if (argidx < arglen && !dict) {
				4586	PyErr_SetString(PyExc_TypeError,
				4587	"not all arguments converted");
				4588	goto onError;
				4589	}
				4590
				4591	if (args_owned) {
				4592	Py_DECREF(args);
				4593	}
				4594	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4595	if (_PyUnicode_Resize(result, reslen - rescnt))
				4596	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4597	return (PyObject *)result;
				4598
				4599	onError:
				4600	Py_XDECREF(result);
				4601	Py_DECREF(uformat);
				4602	if (args_owned) {
				4603	Py_DECREF(args);
				4604	}
				4605	return NULL;
				4606	}
				4607
				4608	static PyBufferProcs unicode_as_buffer = {
				4609	(getreadbufferproc) unicode_buffer_getreadbuf,
				4610	(getwritebufferproc) unicode_buffer_getwritebuf,
				4611	(getsegcountproc) unicode_buffer_getsegcount,
				4612	(getcharbufferproc) unicode_buffer_getcharbuf,
				4613	};
				4614
				4615	PyTypeObject PyUnicode_Type = {
				4616	PyObject_HEAD_INIT(&PyType_Type)
				4617	0, /* ob_size */
				4618	"unicode", /* tp_name */
				4619	sizeof(PyUnicodeObject), /* tp_size */
				4620	0, /* tp_itemsize */
				4621	/* Slots */
				4622	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4623	0, /* tp_print */
				4624	(getattrfunc)unicode_getattr, /* tp_getattr */
				4625	0, /* tp_setattr */
				4626	(cmpfunc) unicode_compare, /* tp_compare */
				4627	(reprfunc) unicode_repr, /* tp_repr */
				4628	0, /* tp_as_number */
				4629	&unicode_as_sequence, /* tp_as_sequence */
				4630	0, /* tp_as_mapping */
				4631	(hashfunc) unicode_hash, /* tp_hash*/
				4632	0, /* tp_call*/
				4633	(reprfunc) unicode_str, /* tp_str */
				4634	(getattrofunc) NULL, /* tp_getattro */
				4635	(setattrofunc) NULL, /* tp_setattro */
				4636	&unicode_as_buffer, /* tp_as_buffer */
				4637	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4638	};
				4639
				4640	/* Initialize the Unicode implementation */
				4641
				4642	void _PyUnicode_Init()
				4643	{
				4644	/* Doublecheck the configuration... */
				4645	if (sizeof(Py_UNICODE) != 2)
				4646	Py_FatalError("Unicode configuration error: "
				4647	"sizeof(Py_UNICODE) != 2 bytes");
				4648
				4649	unicode_empty = _PyUnicode_New(0);
				4650	}
				4651
				4652	/* Finalize the Unicode implementation */
				4653
				4654	void
				4655	_PyUnicode_Fini()
				4656	{
				4657	PyUnicodeObject *u = unicode_freelist;
				4658
				4659	while (u != NULL) {
				4660	PyUnicodeObject *v = u;
				4661	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4662	if (v->str)
				4663	free(v->str);
				4664	Py_XDECREF(v->utf8str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4665	free(v);
				4666	}
				4667	Py_XDECREF(unicode_empty);
				4668	}