Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 7a68dd40104d51b31583b5f30004ec3d9afa33ba [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
				4	modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
				5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
				111	/* --- Globals ------------------------------------------------------------ */
				112
				113	/* The empty Unicode object */
				114	static PyUnicodeObject *unicode_empty = NULL;
				115
				116	/* Free list for Unicode objects */
				117	static PyUnicodeObject *unicode_freelist = NULL;
				118	static int unicode_freelist_size = 0;
				119
				120	/* --- Unicode Object ----------------------------------------------------- */
				121
				122	static
				123	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				124	int length)
				125	{
				126	void *oldstr;
				127
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	130	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	131
				132	/* Resizing unicode_empty is not allowed. */
				133	if (unicode == unicode_empty) {
				134	PyErr_SetString(PyExc_SystemError,
				135	"can't resize empty unicode object");
				136	return -1;
				137	}
				138
				139	/* We allocate one more byte to make sure the string is
				140	Ux0000 terminated -- XXX is this needed ? */
				141	oldstr = unicode->str;
				142	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				143	if (!unicode->str) {
				144	unicode->str = oldstr;
				145	PyErr_NoMemory();
				146	return -1;
				147	}
				148	unicode->str[length] = 0;
				149	unicode->length = length;
				150
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	151	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	152	/* Reset the object caches */
				153	if (unicode->utf8str) {
				154	Py_DECREF(unicode->utf8str);
				155	unicode->utf8str = NULL;
				156	}
				157	unicode->hash = -1;
				158
				159	return 0;
				160	}
				161
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	162	int PyUnicode_Resize(PyObject **unicode,
				163	int length)
				164	{
				165	PyUnicodeObject *v;
				166
				167	if (unicode == NULL) {
				168	PyErr_BadInternalCall();
				169	return -1;
				170	}
				171	v = (PyUnicodeObject )unicode;
				172	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				173	PyErr_BadInternalCall();
				174	return -1;
				175	}
				176	return _PyUnicode_Resize(v, length);
				177	}
				178
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	179	/* We allocate one more byte to make sure the string is
				180	Ux0000 terminated -- XXX is this needed ?
				181
				182	XXX This allocator could further be enhanced by assuring that the
				183	free list never reduces its size below 1.
				184
				185	*/
				186
				187	static
				188	PyUnicodeObject *_PyUnicode_New(int length)
				189	{
				190	register PyUnicodeObject *unicode;
				191
				192	/* Optimization for empty strings */
				193	if (length == 0 && unicode_empty != NULL) {
				194	Py_INCREF(unicode_empty);
				195	return unicode_empty;
				196	}
				197
				198	/* Unicode freelist & memory allocation */
				199	if (unicode_freelist) {
				200	unicode = unicode_freelist;
				201	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				202	unicode_freelist_size--;
				203	unicode->ob_type = &PyUnicode_Type;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	204	_Py_NewReference((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	205	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	206	/* Keep-Alive optimization: we only upsize the buffer,
				207	never downsize it. */
				208	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	209	_PyUnicode_Resize(unicode, length)) {
				210	free(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	211	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	212	}
				213	}
				214	else
				215	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				216	}
				217	else {
				218	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				219	if (unicode == NULL)
				220	return NULL;
				221	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				222	}
				223
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	224	if (!unicode->str) {
				225	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	226	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	227	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	unicode->str[length] = 0;
				229	unicode->length = length;
				230	unicode->hash = -1;
				231	unicode->utf8str = NULL;
				232	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	233
				234	onError:
				235	_Py_ForgetReference((PyObject *)unicode);
				236	PyMem_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	237	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	238	}
				239
				240	static
				241	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				242	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	244	/* Keep-Alive optimization */
				245	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	246	free(unicode->str);
				247	unicode->str = NULL;
				248	unicode->length = 0;
				249	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	250	if (unicode->utf8str) {
				251	Py_DECREF(unicode->utf8str);
				252	unicode->utf8str = NULL;
				253	}
				254	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	255	(PyUnicodeObject *)unicode = unicode_freelist;
				256	unicode_freelist = unicode;
				257	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	}
				259	else {
				260	free(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	261	Py_XDECREF(unicode->utf8str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	PyMem_DEL(unicode);
				263	}
				264	}
				265
				266	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				267	int size)
				268	{
				269	PyUnicodeObject *unicode;
				270
				271	unicode = _PyUnicode_New(size);
				272	if (!unicode)
				273	return NULL;
				274
				275	/* Copy the Unicode data into the new object */
				276	if (u != NULL)
				277	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				278
				279	return (PyObject *)unicode;
				280	}
				281
				282	#ifdef HAVE_WCHAR_H
				283
				284	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				285	int size)
				286	{
				287	PyUnicodeObject *unicode;
				288
				289	if (w == NULL) {
				290	PyErr_BadInternalCall();
				291	return NULL;
				292	}
				293
				294	unicode = _PyUnicode_New(size);
				295	if (!unicode)
				296	return NULL;
				297
				298	/* Copy the wchar_t data into the new object */
				299	#ifdef HAVE_USABLE_WCHAR_T
				300	memcpy(unicode->str, w, size * sizeof(wchar_t));
				301	#else
				302	{
				303	register Py_UNICODE *u;
				304	register int i;
				305	u = PyUnicode_AS_UNICODE(unicode);
				306	for (i = size; i >= 0; i--)
				307	u++ = w++;
				308	}
				309	#endif
				310
				311	return (PyObject *)unicode;
				312	}
				313
				314	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				315	register wchar_t *w,
				316	int size)
				317	{
				318	if (unicode == NULL) {
				319	PyErr_BadInternalCall();
				320	return -1;
				321	}
				322	if (size > PyUnicode_GET_SIZE(unicode))
				323	size = PyUnicode_GET_SIZE(unicode);
				324	#ifdef HAVE_USABLE_WCHAR_T
				325	memcpy(w, unicode->str, size * sizeof(wchar_t));
				326	#else
				327	{
				328	register Py_UNICODE *u;
				329	register int i;
				330	u = PyUnicode_AS_UNICODE(unicode);
				331	for (i = size; i >= 0; i--)
				332	w++ = u++;
				333	}
				334	#endif
				335
				336	return size;
				337	}
				338
				339	#endif
				340
				341	PyObject PyUnicode_FromObject(register PyObject obj)
				342	{
				343	const char *s;
				344	int len;
				345
				346	if (obj == NULL) {
				347	PyErr_BadInternalCall();
				348	return NULL;
				349	}
				350	else if (PyUnicode_Check(obj)) {
				351	Py_INCREF(obj);
				352	return obj;
				353	}
				354	else if (PyString_Check(obj)) {
				355	s = PyString_AS_STRING(obj);
				356	len = PyString_GET_SIZE(obj);
				357	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	358	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				359	/* Overwrite the error message with something more useful in
				360	case of a TypeError. */
				361	if (PyErr_ExceptionMatches(PyExc_TypeError))
				362	PyErr_SetString(PyExc_TypeError,
				363	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	364	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	365	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	366	if (len == 0) {
				367	Py_INCREF(unicode_empty);
				368	return (PyObject *)unicode_empty;
				369	}
				370	return PyUnicode_DecodeUTF8(s, len, "strict");
				371	}
				372
				373	PyObject PyUnicode_Decode(const char s,
				374	int size,
				375	const char *encoding,
				376	const char *errors)
				377	{
				378	PyObject buffer = NULL, unicode;
				379
				380	/* Shortcut for the default encoding UTF-8 */
				381	if (encoding == NULL \|\|
				382	(strcmp(encoding, "utf-8") == 0))
				383	return PyUnicode_DecodeUTF8(s, size, errors);
				384
				385	/* Decode via the codec registry */
				386	buffer = PyBuffer_FromMemory((void *)s, size);
				387	if (buffer == NULL)
				388	goto onError;
				389	unicode = PyCodec_Decode(buffer, encoding, errors);
				390	if (unicode == NULL)
				391	goto onError;
				392	if (!PyUnicode_Check(unicode)) {
				393	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	394	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	395	unicode->ob_type->tp_name);
				396	Py_DECREF(unicode);
				397	goto onError;
				398	}
				399	Py_DECREF(buffer);
				400	return unicode;
				401
				402	onError:
				403	Py_XDECREF(buffer);
				404	return NULL;
				405	}
				406
				407	PyObject PyUnicode_Encode(const Py_UNICODE s,
				408	int size,
				409	const char *encoding,
				410	const char *errors)
				411	{
				412	PyObject v, unicode;
				413
				414	unicode = PyUnicode_FromUnicode(s, size);
				415	if (unicode == NULL)
				416	return NULL;
				417	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				418	Py_DECREF(unicode);
				419	return v;
				420	}
				421
				422	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				423	const char *encoding,
				424	const char *errors)
				425	{
				426	PyObject *v;
				427
				428	if (!PyUnicode_Check(unicode)) {
				429	PyErr_BadArgument();
				430	goto onError;
				431	}
				432	/* Shortcut for the default encoding UTF-8 */
				433	if ((encoding == NULL \|\|
				434	(strcmp(encoding, "utf-8") == 0)) &&
				435	errors == NULL)
				436	return PyUnicode_AsUTF8String(unicode);
				437
				438	/* Encode via the codec registry */
				439	v = PyCodec_Encode(unicode, encoding, errors);
				440	if (v == NULL)
				441	goto onError;
				442	/* XXX Should we really enforce this ? */
				443	if (!PyString_Check(v)) {
				444	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	445	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	446	v->ob_type->tp_name);
				447	Py_DECREF(v);
				448	goto onError;
				449	}
				450	return v;
				451
				452	onError:
				453	return NULL;
				454	}
				455
				456	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				457	{
				458	if (!PyUnicode_Check(unicode)) {
				459	PyErr_BadArgument();
				460	goto onError;
				461	}
				462	return PyUnicode_AS_UNICODE(unicode);
				463
				464	onError:
				465	return NULL;
				466	}
				467
				468	int PyUnicode_GetSize(PyObject *unicode)
				469	{
				470	if (!PyUnicode_Check(unicode)) {
				471	PyErr_BadArgument();
				472	goto onError;
				473	}
				474	return PyUnicode_GET_SIZE(unicode);
				475
				476	onError:
				477	return -1;
				478	}
				479
				480	/* --- UTF-8 Codec -------------------------------------------------------- */
				481
				/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details

   Rows cover 16 byte values each: 0x00-0x7F are single bytes,
   0x80-0xBF are continuation bytes (never a valid prefix), 0xC0-0xDF
   start 2-byte, 0xE0-0xEF 3-byte, 0xF0-0xF7 4-byte, 0xF8-0xFB 5-byte
   and 0xFC-0xFD 6-byte sequences; 0xFE and 0xFF are illegal. */
static
char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
				503
				504	static
				505	int utf8_decoding_error(const char **source,
				506	Py_UNICODE **dest,
				507	const char *errors,
				508	const char *details)
				509	{
				510	if ((errors == NULL) \|\|
				511	(strcmp(errors,"strict") == 0)) {
				512	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	513	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	514	details);
				515	return -1;
				516	}
				517	else if (strcmp(errors,"ignore") == 0) {
				518	(*source)++;
				519	return 0;
				520	}
				521	else if (strcmp(errors,"replace") == 0) {
				522	(*source)++;
				523	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				524	(*dest)++;
				525	return 0;
				526	}
				527	else {
				528	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	529	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	530	errors);
				531	return -1;
				532	}
				533	}
				534
				535	#define UTF8_ERROR(details) do { \
				536	if (utf8_decoding_error(&s, &p, errors, details)) \
				537	goto onError; \
				538	continue; \
				539	} while (0)
				540
				541	PyObject PyUnicode_DecodeUTF8(const char s,
				542	int size,
				543	const char *errors)
				544	{
				545	int n;
				546	const char *e;
				547	PyUnicodeObject *unicode;
				548	Py_UNICODE *p;
				549
				550	/* Note: size will always be longer than the resulting Unicode
				551	character count */
				552	unicode = _PyUnicode_New(size);
				553	if (!unicode)
				554	return NULL;
				555	if (size == 0)
				556	return (PyObject *)unicode;
				557
				558	/* Unpack UTF-8 encoded data */
				559	p = unicode->str;
				560	e = s + size;
				561
				562	while (s < e) {
				563	register Py_UNICODE ch = (unsigned char)*s;
				564
				565	if (ch < 0x80) {
				566	*p++ = ch;
				567	s++;
				568	continue;
				569	}
				570
				571	n = utf8_code_length[ch];
				572
				573	if (s + n > e)
				574	UTF8_ERROR("unexpected end of data");
				575
				576	switch (n) {
				577
				578	case 0:
				579	UTF8_ERROR("unexpected code byte");
				580	break;
				581
				582	case 1:
				583	UTF8_ERROR("internal error");
				584	break;
				585
				586	case 2:
				587	if ((s[1] & 0xc0) != 0x80)
				588	UTF8_ERROR("invalid data");
				589	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				590	if (ch < 0x80)
				591	UTF8_ERROR("illegal encoding");
				592	else
				593	*p++ = ch;
				594	break;
				595
				596	case 3:
				597	if ((s[1] & 0xc0) != 0x80 \|\|
				598	(s[2] & 0xc0) != 0x80)
				599	UTF8_ERROR("invalid data");
				600	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				601	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				602	UTF8_ERROR("illegal encoding");
				603	else
				604	*p++ = ch;
				605	break;
				606
				607	default:
				608	/* Other sizes are only needed for UCS-4 */
				609	UTF8_ERROR("unsupported Unicode code range");
				610	}
				611	s += n;
				612	}
				613
				614	/* Adjust length */
				615	if (_PyUnicode_Resize(unicode, p - unicode->str))
				616	goto onError;
				617
				618	return (PyObject *)unicode;
				619
				620	onError:
				621	Py_DECREF(unicode);
				622	return NULL;
				623	}
				624
				625	#undef UTF8_ERROR
				626
				627	static
				628	int utf8_encoding_error(const Py_UNICODE **source,
				629	char **dest,
				630	const char *errors,
				631	const char *details)
				632	{
				633	if ((errors == NULL) \|\|
				634	(strcmp(errors,"strict") == 0)) {
				635	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	636	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	637	details);
				638	return -1;
				639	}
				640	else if (strcmp(errors,"ignore") == 0) {
				641	return 0;
				642	}
				643	else if (strcmp(errors,"replace") == 0) {
				644	**dest = '?';
				645	(*dest)++;
				646	return 0;
				647	}
				648	else {
				649	PyErr_Format(PyExc_ValueError,
				650	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	651	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	652	errors);
				653	return -1;
				654	}
				655	}
				656
				657	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				658	int size,
				659	const char *errors)
				660	{
				661	PyObject *v;
				662	char *p;
				663	char *q;
				664
				665	v = PyString_FromStringAndSize(NULL, 3 * size);
				666	if (v == NULL)
				667	return NULL;
				668	if (size == 0)
				669	goto done;
				670
				671	p = q = PyString_AS_STRING(v);
				672	while (size-- > 0) {
				673	Py_UNICODE ch = *s++;
				674	if (ch < 0x80)
				675	*p++ = (char) ch;
				676	else if (ch < 0x0800) {
				677	*p++ = 0xc0 \| (ch >> 6);
				678	*p++ = 0x80 \| (ch & 0x3f);
				679	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				680	/* These byte ranges are reserved for UTF-16 surrogate
				681	bytes which the Python implementation currently does
				682	not support. */
				683	printf("code range problem: U+%04x\n", ch);
				684	if (utf8_encoding_error(&s, &p, errors,
				685	"unsupported code range"))
				686	goto onError;
				687	} else {
				688	*p++ = 0xe0 \| (ch >> 12);
				689	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				690	*p++ = 0x80 \| (ch & 0x3f);
				691	}
				692	}
				693	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	694	if (_PyString_Resize(&v, p - q))
				695	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	696
				697	done:
				698	return v;
				699
				700	onError:
				701	Py_DECREF(v);
				702	return NULL;
				703	}
				704
				705	/* Return a Python string holding the UTF-8 encoded value of the
				706	Unicode object.
				707
				708	The resulting string is cached in the Unicode object for subsequent
				709	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	710	the character buffer interface and will live (at least) as long as
				711	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	712
				713	The refcount of the string is not incremented.
				714
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	715	* Exported for internal use by the interpreter only !!! *
				716
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	717	*/
				718
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	719	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	720	const char *errors)
				721	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	722	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	723
				724	if (v)
				725	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	726	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				727	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	728	errors);
				729	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	730	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	731	return v;
				732	}
				733
				734	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				735	{
				736	PyObject *str;
				737
				738	if (!PyUnicode_Check(unicode)) {
				739	PyErr_BadArgument();
				740	return NULL;
				741	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	742	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	743	if (str == NULL)
				744	return NULL;
				745	Py_INCREF(str);
				746	return str;
				747	}
				748
				749	/* --- UTF-16 Codec ------------------------------------------------------- */
				750
				751	static
				752	int utf16_decoding_error(const Py_UNICODE **source,
				753	Py_UNICODE **dest,
				754	const char *errors,
				755	const char *details)
				756	{
				757	if ((errors == NULL) \|\|
				758	(strcmp(errors,"strict") == 0)) {
				759	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	760	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	761	details);
				762	return -1;
				763	}
				764	else if (strcmp(errors,"ignore") == 0) {
				765	return 0;
				766	}
				767	else if (strcmp(errors,"replace") == 0) {
				768	if (dest) {
				769	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				770	(*dest)++;
				771	}
				772	return 0;
				773	}
				774	else {
				775	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	776	"UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	777	errors);
				778	return -1;
				779	}
				780	}
				781
				782	#define UTF16_ERROR(details) do { \
				783	if (utf16_decoding_error(&q, &p, errors, details)) \
				784	goto onError; \
				785	continue; \
				786	} while(0)
				787
				788	PyObject PyUnicode_DecodeUTF16(const char s,
				789	int size,
				790	const char *errors,
				791	int *byteorder)
				792	{
				793	PyUnicodeObject *unicode;
				794	Py_UNICODE *p;
				795	const Py_UNICODE q, e;
				796	int bo = 0;
				797
				798	/* size should be an even number */
				799	if (size % sizeof(Py_UNICODE) != 0) {
				800	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				801	return NULL;
				802	/* The remaining input chars are ignored if we fall through
				803	here... */
				804	}
				805
				806	/* Note: size will always be longer than the resulting Unicode
				807	character count */
				808	unicode = _PyUnicode_New(size);
				809	if (!unicode)
				810	return NULL;
				811	if (size == 0)
				812	return (PyObject *)unicode;
				813
				814	/* Unpack UTF-16 encoded data */
				815	p = unicode->str;
				816	q = (Py_UNICODE *)s;
				817	e = q + (size / sizeof(Py_UNICODE));
				818
				819	if (byteorder)
				820	bo = *byteorder;
				821
				822	while (q < e) {
				823	register Py_UNICODE ch = *q++;
				824
				825	/* Check for BOM marks (U+FEFF) in the input and adjust
				826	current byte order setting accordingly. Swap input
				827	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				828	!) */
				829	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				830	if (ch == 0xFEFF) {
				831	bo = -1;
				832	continue;
				833	} else if (ch == 0xFFFE) {
				834	bo = 1;
				835	continue;
				836	}
				837	if (bo == 1)
				838	ch = (ch >> 8) \| (ch << 8);
				839	#else
				840	if (ch == 0xFEFF) {
				841	bo = 1;
				842	continue;
				843	} else if (ch == 0xFFFE) {
				844	bo = -1;
				845	continue;
				846	}
				847	if (bo == -1)
				848	ch = (ch >> 8) \| (ch << 8);
				849	#endif
				850	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				851	*p++ = ch;
				852	continue;
				853	}
				854
				855	/* UTF-16 code pair: */
				856	if (q >= e)
				857	UTF16_ERROR("unexpected end of data");
				858	if (0xDC00 <= q && q <= 0xDFFF) {
				859	q++;
				860	if (0xD800 <= q && q <= 0xDBFF)
				861	/* This is valid data (a UTF-16 surrogate pair), but
				862	we are not able to store this information since our
				863	Py_UNICODE type only has 16 bits... this might
				864	change someday, even though it's unlikely. */
				865	UTF16_ERROR("code pairs are not supported");
				866	else
				867	continue;
				868	}
				869	UTF16_ERROR("illegal encoding");
				870	}
				871
				872	if (byteorder)
				873	*byteorder = bo;
				874
				875	/* Adjust length */
				876	if (_PyUnicode_Resize(unicode, p - unicode->str))
				877	goto onError;
				878
				879	return (PyObject *)unicode;
				880
				881	onError:
				882	Py_DECREF(unicode);
				883	return NULL;
				884	}
				885
				886	#undef UTF16_ERROR
				887
				888	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				889	int size,
				890	const char *errors,
				891	int byteorder)
				892	{
				893	PyObject *v;
				894	Py_UNICODE *p;
				895	char *q;
				896
				897	/* We don't create UTF-16 pairs... */
				898	v = PyString_FromStringAndSize(NULL,
				899	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				900	if (v == NULL)
				901	return NULL;
				902	if (size == 0)
				903	goto done;
				904
				905	q = PyString_AS_STRING(v);
				906	p = (Py_UNICODE *)q;
				907
				908	if (byteorder == 0)
				909	*p++ = 0xFEFF;
				910	if (byteorder == 0 \|\|
				911	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				912	byteorder == -1
				913	#else
				914	byteorder == 1
				915	#endif
				916	)
				917	memcpy(p, s, size * sizeof(Py_UNICODE));
				918	else
				919	while (size-- > 0) {
				920	Py_UNICODE ch = *s++;
				921	*p++ = (ch >> 8) \| (ch << 8);
				922	}
				923	done:
				924	return v;
				925	}
				926
				927	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				928	{
				929	if (!PyUnicode_Check(unicode)) {
				930	PyErr_BadArgument();
				931	return NULL;
				932	}
				933	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				934	PyUnicode_GET_SIZE(unicode),
				935	NULL,
				936	0);
				937	}
				938
				939	/* --- Unicode Escape Codec ----------------------------------------------- */
				940
				941	static
				942	int unicodeescape_decoding_error(const char **source,
				943	unsigned int *x,
				944	const char *errors,
				945	const char *details)
				946	{
				947	if ((errors == NULL) \|\|
				948	(strcmp(errors,"strict") == 0)) {
				949	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	950	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951	details);
				952	return -1;
				953	}
				954	else if (strcmp(errors,"ignore") == 0) {
				955	return 0;
				956	}
				957	else if (strcmp(errors,"replace") == 0) {
				958	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				959	return 0;
				960	}
				961	else {
				962	PyErr_Format(PyExc_ValueError,
				963	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	964	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	965	errors);
				966	return -1;
				967	}
				968	}
				969
				970	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				971	int size,
				972	const char *errors)
				973	{
				974	PyUnicodeObject *v;
				975	Py_UNICODE p = NULL, buf = NULL;
				976	const char *end;
				977
				978	/* Escaped strings will always be longer than the resulting
				979	Unicode string, so we start with size here and then reduce the
				980	length after conversion to the true value. */
				981	v = _PyUnicode_New(size);
				982	if (v == NULL)
				983	goto onError;
				984	if (size == 0)
				985	return (PyObject *)v;
				986	p = buf = PyUnicode_AS_UNICODE(v);
				987	end = s + size;
				988	while (s < end) {
				989	unsigned char c;
				990	unsigned int x;
				991	int i;
				992
				993	/* Non-escape characters are interpreted as Unicode ordinals */
				994	if (*s != '\\') {
				995	p++ = (unsigned char)s++;
				996	continue;
				997	}
				998
				999	/* \ - Escapes */
				1000	s++;
				1001	switch (*s++) {
				1002
				1003	/* \x escapes */
				1004	case '\n': break;
				1005	case '\\': *p++ = '\\'; break;
				1006	case '\'': *p++ = '\''; break;
				1007	case '\"': *p++ = '\"'; break;
				1008	case 'b': *p++ = '\b'; break;
				1009	case 'f': p++ = '\014'; break; / FF */
				1010	case 't': *p++ = '\t'; break;
				1011	case 'n': *p++ = '\n'; break;
				1012	case 'r': *p++ = '\r'; break;
				1013	case 'v': p++ = '\013'; break; / VT */
				1014	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1015
				1016	/* \OOO (octal) escapes */
				1017	case '0': case '1': case '2': case '3':
				1018	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1019	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1020	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1021	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1022	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1023	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1024	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1025	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1026	break;
				1027
				1028	/* \xXXXX escape with 0-4 hex digits */
				1029	case 'x':
				1030	x = 0;
				1031	c = (unsigned char)*s;
				1032	if (isxdigit(c)) {
				1033	do {
				1034	x = (x<<4) & ~0xF;
				1035	if ('0' <= c && c <= '9')
				1036	x += c - '0';
				1037	else if ('a' <= c && c <= 'f')
				1038	x += 10 + c - 'a';
				1039	else
				1040	x += 10 + c - 'A';
				1041	c = (unsigned char)*++s;
				1042	} while (isxdigit(c));
				1043	*p++ = x;
				1044	} else {
				1045	*p++ = '\\';
				1046	*p++ = (unsigned char)s[-1];
				1047	}
				1048	break;
				1049
				1050	/* \uXXXX with 4 hex digits */
				1051	case 'u':
				1052	for (x = 0, i = 0; i < 4; i++) {
				1053	c = (unsigned char)s[i];
				1054	if (!isxdigit(c)) {
				1055	if (unicodeescape_decoding_error(&s, &x, errors,
				1056	"truncated \\uXXXX"))
				1057	goto onError;
				1058	i++;
				1059	break;
				1060	}
				1061	x = (x<<4) & ~0xF;
				1062	if (c >= '0' && c <= '9')
				1063	x += c - '0';
				1064	else if (c >= 'a' && c <= 'f')
				1065	x += 10 + c - 'a';
				1066	else
				1067	x += 10 + c - 'A';
				1068	}
				1069	s += i;
				1070	*p++ = x;
				1071	break;
				1072
				1073	default:
				1074	*p++ = '\\';
				1075	*p++ = (unsigned char)s[-1];
				1076	break;
				1077	}
				1078	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1079	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1080	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1081	return (PyObject *)v;
				1082
				1083	onError:
				1084	Py_XDECREF(v);
				1085	return NULL;
				1086	}
				1087
				1088	/* Return a Unicode-Escape string version of the Unicode object.
				1089
				1090	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1091	appropriate.
				1092
				1093	*/
				1094
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1095	static const Py_UNICODE findchar(const Py_UNICODE s,
				1096	int size,
				1097	Py_UNICODE ch);
				1098
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1099	static
				1100	PyObject unicodeescape_string(const Py_UNICODE s,
				1101	int size,
				1102	int quotes)
				1103	{
				1104	PyObject *repr;
				1105	char *p;
				1106	char *q;
				1107
				1108	static const char *hexdigit = "0123456789ABCDEF";
				1109
				1110	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1111	if (repr == NULL)
				1112	return NULL;
				1113
				1114	p = q = PyString_AS_STRING(repr);
				1115
				1116	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1117	*p++ = 'u';
				1118	*p++ = (findchar(s, size, '\'') &&
				1119	!findchar(s, size, '"')) ? '"' : '\'';
				1120	}
				1121	while (size-- > 0) {
				1122	Py_UNICODE ch = *s++;
				1123	/* Escape quotes */
				1124	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1125	*p++ = '\\';
				1126	*p++ = (char) ch;
				1127	}
				1128	/* Map 16-bit characters to '\uxxxx' */
				1129	else if (ch >= 256) {
				1130	*p++ = '\\';
				1131	*p++ = 'u';
				1132	*p++ = hexdigit[(ch >> 12) & 0xf];
				1133	*p++ = hexdigit[(ch >> 8) & 0xf];
				1134	*p++ = hexdigit[(ch >> 4) & 0xf];
				1135	*p++ = hexdigit[ch & 15];
				1136	}
				1137	/* Map non-printable US ASCII to '\ooo' */
				1138	else if (ch < ' ' \|\| ch >= 128) {
				1139	*p++ = '\\';
				1140	*p++ = hexdigit[(ch >> 6) & 7];
				1141	*p++ = hexdigit[(ch >> 3) & 7];
				1142	*p++ = hexdigit[ch & 7];
				1143	}
				1144	/* Copy everything else as-is */
				1145	else
				1146	*p++ = (char) ch;
				1147	}
				1148	if (quotes)
				1149	*p++ = q[1];
				1150
				1151	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1152	if (_PyString_Resize(&repr, p - q))
				1153	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1154
				1155	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1156
				1157	onError:
				1158	Py_DECREF(repr);
				1159	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1160	}
				1161
				1162	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1163	int size)
				1164	{
				1165	return unicodeescape_string(s, size, 0);
				1166	}
				1167
				1168	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1169	{
				1170	if (!PyUnicode_Check(unicode)) {
				1171	PyErr_BadArgument();
				1172	return NULL;
				1173	}
				1174	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1175	PyUnicode_GET_SIZE(unicode));
				1176	}
				1177
				1178	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1179
				1180	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1181	int size,
				1182	const char *errors)
				1183	{
				1184	PyUnicodeObject *v;
				1185	Py_UNICODE p, buf;
				1186	const char *end;
				1187	const char *bs;
				1188
				1189	/* Escaped strings will always be longer than the resulting
				1190	Unicode string, so we start with size here and then reduce the
				1191	length after conversion to the true value. */
				1192	v = _PyUnicode_New(size);
				1193	if (v == NULL)
				1194	goto onError;
				1195	if (size == 0)
				1196	return (PyObject *)v;
				1197	p = buf = PyUnicode_AS_UNICODE(v);
				1198	end = s + size;
				1199	while (s < end) {
				1200	unsigned char c;
				1201	unsigned int x;
				1202	int i;
				1203
				1204	/* Non-escape characters are interpreted as Unicode ordinals */
				1205	if (*s != '\\') {
				1206	p++ = (unsigned char)s++;
				1207	continue;
				1208	}
				1209
				1210	/* \u-escapes are only interpreted iff the number of leading
				1211	backslashes if odd */
				1212	bs = s;
				1213	for (;s < end;) {
				1214	if (*s != '\\')
				1215	break;
				1216	p++ = (unsigned char)s++;
				1217	}
				1218	if (((s - bs) & 1) == 0 \|\|
				1219	s >= end \|\|
				1220	*s != 'u') {
				1221	continue;
				1222	}
				1223	p--;
				1224	s++;
				1225
				1226	/* \uXXXX with 4 hex digits */
				1227	for (x = 0, i = 0; i < 4; i++) {
				1228	c = (unsigned char)s[i];
				1229	if (!isxdigit(c)) {
				1230	if (unicodeescape_decoding_error(&s, &x, errors,
				1231	"truncated \\uXXXX"))
				1232	goto onError;
				1233	i++;
				1234	break;
				1235	}
				1236	x = (x<<4) & ~0xF;
				1237	if (c >= '0' && c <= '9')
				1238	x += c - '0';
				1239	else if (c >= 'a' && c <= 'f')
				1240	x += 10 + c - 'a';
				1241	else
				1242	x += 10 + c - 'A';
				1243	}
				1244	s += i;
				1245	*p++ = x;
				1246	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1247	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1248	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1249	return (PyObject *)v;
				1250
				1251	onError:
				1252	Py_XDECREF(v);
				1253	return NULL;
				1254	}
				1255
				1256	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1257	int size)
				1258	{
				1259	PyObject *repr;
				1260	char *p;
				1261	char *q;
				1262
				1263	static const char *hexdigit = "0123456789ABCDEF";
				1264
				1265	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1266	if (repr == NULL)
				1267	return NULL;
				1268
				1269	p = q = PyString_AS_STRING(repr);
				1270	while (size-- > 0) {
				1271	Py_UNICODE ch = *s++;
				1272	/* Map 16-bit characters to '\uxxxx' */
				1273	if (ch >= 256) {
				1274	*p++ = '\\';
				1275	*p++ = 'u';
				1276	*p++ = hexdigit[(ch >> 12) & 0xf];
				1277	*p++ = hexdigit[(ch >> 8) & 0xf];
				1278	*p++ = hexdigit[(ch >> 4) & 0xf];
				1279	*p++ = hexdigit[ch & 15];
				1280	}
				1281	/* Copy everything else as-is */
				1282	else
				1283	*p++ = (char) ch;
				1284	}
				1285	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1286	if (_PyString_Resize(&repr, p - q))
				1287	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288
				1289	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1290
				1291	onError:
				1292	Py_DECREF(repr);
				1293	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1294	}
				1295
				1296	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1297	{
				1298	if (!PyUnicode_Check(unicode)) {
				1299	PyErr_BadArgument();
				1300	return NULL;
				1301	}
				1302	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1303	PyUnicode_GET_SIZE(unicode));
				1304	}
				1305
				1306	/* --- Latin-1 Codec ------------------------------------------------------ */
				1307
				1308	PyObject PyUnicode_DecodeLatin1(const char s,
				1309	int size,
				1310	const char *errors)
				1311	{
				1312	PyUnicodeObject *v;
				1313	Py_UNICODE *p;
				1314
				1315	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1316	v = _PyUnicode_New(size);
				1317	if (v == NULL)
				1318	goto onError;
				1319	if (size == 0)
				1320	return (PyObject *)v;
				1321	p = PyUnicode_AS_UNICODE(v);
				1322	while (size-- > 0)
				1323	p++ = (unsigned char)s++;
				1324	return (PyObject *)v;
				1325
				1326	onError:
				1327	Py_XDECREF(v);
				1328	return NULL;
				1329	}
				1330
				1331	static
				1332	int latin1_encoding_error(const Py_UNICODE **source,
				1333	char **dest,
				1334	const char *errors,
				1335	const char *details)
				1336	{
				1337	if ((errors == NULL) \|\|
				1338	(strcmp(errors,"strict") == 0)) {
				1339	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1340	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1341	details);
				1342	return -1;
				1343	}
				1344	else if (strcmp(errors,"ignore") == 0) {
				1345	return 0;
				1346	}
				1347	else if (strcmp(errors,"replace") == 0) {
				1348	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1349	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1350	return 0;
				1351	}
				1352	else {
				1353	PyErr_Format(PyExc_ValueError,
				1354	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1355	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1356	errors);
				1357	return -1;
				1358	}
				1359	}
				1360
				1361	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1362	int size,
				1363	const char *errors)
				1364	{
				1365	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1366	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1367	repr = PyString_FromStringAndSize(NULL, size);
				1368	if (repr == NULL)
				1369	return NULL;
				1370
				1371	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1372	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1373	while (size-- > 0) {
				1374	Py_UNICODE ch = *p++;
				1375	if (ch >= 256) {
				1376	if (latin1_encoding_error(&p, &s, errors,
				1377	"ordinal not in range(256)"))
				1378	goto onError;
				1379	}
				1380	else
				1381	*s++ = (char)ch;
				1382	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1383	/* Resize if error handling skipped some characters */
				1384	if (s - start < PyString_GET_SIZE(repr))
				1385	if (_PyString_Resize(&repr, s - start))
				1386	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1387	return repr;
				1388
				1389	onError:
				1390	Py_DECREF(repr);
				1391	return NULL;
				1392	}
				1393
				1394	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1395	{
				1396	if (!PyUnicode_Check(unicode)) {
				1397	PyErr_BadArgument();
				1398	return NULL;
				1399	}
				1400	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1401	PyUnicode_GET_SIZE(unicode),
				1402	NULL);
				1403	}
				1404
				1405	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1406
				1407	static
				1408	int ascii_decoding_error(const char **source,
				1409	Py_UNICODE **dest,
				1410	const char *errors,
				1411	const char *details)
				1412	{
				1413	if ((errors == NULL) \|\|
				1414	(strcmp(errors,"strict") == 0)) {
				1415	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1416	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1417	details);
				1418	return -1;
				1419	}
				1420	else if (strcmp(errors,"ignore") == 0) {
				1421	return 0;
				1422	}
				1423	else if (strcmp(errors,"replace") == 0) {
				1424	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1425	(*dest)++;
				1426	return 0;
				1427	}
				1428	else {
				1429	PyErr_Format(PyExc_ValueError,
				1430	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1431	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1432	errors);
				1433	return -1;
				1434	}
				1435	}
				1436
				1437	PyObject PyUnicode_DecodeASCII(const char s,
				1438	int size,
				1439	const char *errors)
				1440	{
				1441	PyUnicodeObject *v;
				1442	Py_UNICODE *p;
				1443
				1444	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1445	v = _PyUnicode_New(size);
				1446	if (v == NULL)
				1447	goto onError;
				1448	if (size == 0)
				1449	return (PyObject *)v;
				1450	p = PyUnicode_AS_UNICODE(v);
				1451	while (size-- > 0) {
				1452	register unsigned char c;
				1453
				1454	c = (unsigned char)*s++;
				1455	if (c < 128)
				1456	*p++ = c;
				1457	else if (ascii_decoding_error(&s, &p, errors,
				1458	"ordinal not in range(128)"))
				1459	goto onError;
				1460	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1461	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1462	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1463	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1464	return (PyObject *)v;
				1465
				1466	onError:
				1467	Py_XDECREF(v);
				1468	return NULL;
				1469	}
				1470
				1471	static
				1472	int ascii_encoding_error(const Py_UNICODE **source,
				1473	char **dest,
				1474	const char *errors,
				1475	const char *details)
				1476	{
				1477	if ((errors == NULL) \|\|
				1478	(strcmp(errors,"strict") == 0)) {
				1479	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1480	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1481	details);
				1482	return -1;
				1483	}
				1484	else if (strcmp(errors,"ignore") == 0) {
				1485	return 0;
				1486	}
				1487	else if (strcmp(errors,"replace") == 0) {
				1488	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1489	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1490	return 0;
				1491	}
				1492	else {
				1493	PyErr_Format(PyExc_ValueError,
				1494	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1495	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1496	errors);
				1497	return -1;
				1498	}
				1499	}
				1500
				1501	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1502	int size,
				1503	const char *errors)
				1504	{
				1505	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1506	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1507	repr = PyString_FromStringAndSize(NULL, size);
				1508	if (repr == NULL)
				1509	return NULL;
				1510
				1511	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1512	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1513	while (size-- > 0) {
				1514	Py_UNICODE ch = *p++;
				1515	if (ch >= 128) {
				1516	if (ascii_encoding_error(&p, &s, errors,
				1517	"ordinal not in range(128)"))
				1518	goto onError;
				1519	}
				1520	else
				1521	*s++ = (char)ch;
				1522	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1523	/* Resize if error handling skipped some characters */
				1524	if (s - start < PyString_GET_SIZE(repr))
				1525	if (_PyString_Resize(&repr, s - start))
				1526	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1527	return repr;
				1528
				1529	onError:
				1530	Py_DECREF(repr);
				1531	return NULL;
				1532	}
				1533
				1534	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1535	{
				1536	if (!PyUnicode_Check(unicode)) {
				1537	PyErr_BadArgument();
				1538	return NULL;
				1539	}
				1540	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1541	PyUnicode_GET_SIZE(unicode),
				1542	NULL);
				1543	}
				1544
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1545	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1546
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1547	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1548
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1549	PyObject PyUnicode_DecodeMBCS(const char s,
				1550	int size,
				1551	const char *errors)
				1552	{
				1553	PyUnicodeObject *v;
				1554	Py_UNICODE *p;
				1555
				1556	/* First get the size of the result */
				1557	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	4e751c3	2000-05-03 12:27:22 +0000	[diff] [blame]	1558	if (usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1559	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1560
				1561	v = _PyUnicode_New(usize);
				1562	if (v == NULL)
				1563	return NULL;
				1564	if (usize == 0)
				1565	return (PyObject *)v;
				1566	p = PyUnicode_AS_UNICODE(v);
				1567	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1568	Py_DECREF(v);
				1569	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1570	}
				1571
				1572	return (PyObject *)v;
				1573	}
				1574
				1575	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1576	int size,
				1577	const char *errors)
				1578	{
				1579	PyObject *repr;
				1580	char *s;
				1581
				1582	/* First get the size of the result */
Guido van Rossum	4e751c3	2000-05-03 12:27:22 +0000	[diff] [blame]	1583	DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1584	if (mbcssize==0)
				1585	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1586
				1587	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1588	if (repr == NULL)
				1589	return NULL;
				1590	if (mbcssize==0)
				1591	return repr;
				1592
				1593	/* Do the conversion */
				1594	s = PyString_AS_STRING(repr);
				1595	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1596	Py_DECREF(repr);
				1597	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1598	}
				1599	return repr;
				1600	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1601
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1602	#endif /* MS_WIN32 */
				1603
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1604	/* --- Character Mapping Codec -------------------------------------------- */
				1605
				1606	static
				1607	int charmap_decoding_error(const char **source,
				1608	Py_UNICODE **dest,
				1609	const char *errors,
				1610	const char *details)
				1611	{
				1612	if ((errors == NULL) \|\|
				1613	(strcmp(errors,"strict") == 0)) {
				1614	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1615	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1616	details);
				1617	return -1;
				1618	}
				1619	else if (strcmp(errors,"ignore") == 0) {
				1620	return 0;
				1621	}
				1622	else if (strcmp(errors,"replace") == 0) {
				1623	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1624	(*dest)++;
				1625	return 0;
				1626	}
				1627	else {
				1628	PyErr_Format(PyExc_ValueError,
				1629	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1630	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1631	errors);
				1632	return -1;
				1633	}
				1634	}
				1635
				1636	PyObject PyUnicode_DecodeCharmap(const char s,
				1637	int size,
				1638	PyObject *mapping,
				1639	const char *errors)
				1640	{
				1641	PyUnicodeObject *v;
				1642	Py_UNICODE *p;
				1643
				1644	/* Default to Latin-1 */
				1645	if (mapping == NULL)
				1646	return PyUnicode_DecodeLatin1(s, size, errors);
				1647
				1648	v = _PyUnicode_New(size);
				1649	if (v == NULL)
				1650	goto onError;
				1651	if (size == 0)
				1652	return (PyObject *)v;
				1653	p = PyUnicode_AS_UNICODE(v);
				1654	while (size-- > 0) {
				1655	unsigned char ch = *s++;
				1656	PyObject w, x;
				1657
				1658	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1659	w = PyInt_FromLong((long)ch);
				1660	if (w == NULL)
				1661	goto onError;
				1662	x = PyObject_GetItem(mapping, w);
				1663	Py_DECREF(w);
				1664	if (x == NULL) {
				1665	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1666	/* No mapping found: default to Latin-1 mapping */
				1667	PyErr_Clear();
				1668	*p++ = (Py_UNICODE)ch;
				1669	continue;
				1670	}
				1671	goto onError;
				1672	}
				1673
				1674	/* Apply mapping */
				1675	if (PyInt_Check(x)) {
				1676	int value = PyInt_AS_LONG(x);
				1677	if (value < 0 \|\| value > 65535) {
				1678	PyErr_SetString(PyExc_TypeError,
				1679	"character mapping must be in range(65336)");
				1680	Py_DECREF(x);
				1681	goto onError;
				1682	}
				1683	*p++ = (Py_UNICODE)value;
				1684	}
				1685	else if (x == Py_None) {
				1686	/* undefined mapping */
				1687	if (charmap_decoding_error(&s, &p, errors,
				1688	"character maps to <undefined>")) {
				1689	Py_DECREF(x);
				1690	goto onError;
				1691	}
				1692	}
				1693	else if (PyUnicode_Check(x)) {
				1694	if (PyUnicode_GET_SIZE(x) != 1) {
				1695	/* 1-n mapping */
				1696	PyErr_SetString(PyExc_NotImplementedError,
				1697	"1-n mappings are currently not implemented");
				1698	Py_DECREF(x);
				1699	goto onError;
				1700	}
				1701	p++ = PyUnicode_AS_UNICODE(x);
				1702	}
				1703	else {
				1704	/* wrong return value */
				1705	PyErr_SetString(PyExc_TypeError,
				1706	"character mapping must return integer, None or unicode");
				1707	Py_DECREF(x);
				1708	goto onError;
				1709	}
				1710	Py_DECREF(x);
				1711	}
				1712	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1713	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1714	goto onError;
				1715	return (PyObject *)v;
				1716
				1717	onError:
				1718	Py_XDECREF(v);
				1719	return NULL;
				1720	}
				1721
				1722	static
				1723	int charmap_encoding_error(const Py_UNICODE **source,
				1724	char **dest,
				1725	const char *errors,
				1726	const char *details)
				1727	{
				1728	if ((errors == NULL) \|\|
				1729	(strcmp(errors,"strict") == 0)) {
				1730	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1731	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1732	details);
				1733	return -1;
				1734	}
				1735	else if (strcmp(errors,"ignore") == 0) {
				1736	return 0;
				1737	}
				1738	else if (strcmp(errors,"replace") == 0) {
				1739	**dest = '?';
				1740	(*dest)++;
				1741	return 0;
				1742	}
				1743	else {
				1744	PyErr_Format(PyExc_ValueError,
				1745	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1746	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1747	errors);
				1748	return -1;
				1749	}
				1750	}
				1751
				1752	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1753	int size,
				1754	PyObject *mapping,
				1755	const char *errors)
				1756	{
				1757	PyObject *v;
				1758	char *s;
				1759
				1760	/* Default to Latin-1 */
				1761	if (mapping == NULL)
				1762	return PyUnicode_EncodeLatin1(p, size, errors);
				1763
				1764	v = PyString_FromStringAndSize(NULL, size);
				1765	if (v == NULL)
				1766	return NULL;
				1767	s = PyString_AS_STRING(v);
				1768	while (size-- > 0) {
				1769	Py_UNICODE ch = *p++;
				1770	PyObject w, x;
				1771
				1772	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1773	w = PyInt_FromLong((long)ch);
				1774	if (w == NULL)
				1775	goto onError;
				1776	x = PyObject_GetItem(mapping, w);
				1777	Py_DECREF(w);
				1778	if (x == NULL) {
				1779	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1780	/* No mapping found: default to Latin-1 mapping if possible */
				1781	PyErr_Clear();
				1782	if (ch < 256) {
				1783	*s++ = (char)ch;
				1784	continue;
				1785	}
				1786	else if (!charmap_encoding_error(&p, &s, errors,
				1787	"missing character mapping"))
				1788	continue;
				1789	}
				1790	goto onError;
				1791	}
				1792
				1793	/* Apply mapping */
				1794	if (PyInt_Check(x)) {
				1795	int value = PyInt_AS_LONG(x);
				1796	if (value < 0 \|\| value > 255) {
				1797	PyErr_SetString(PyExc_TypeError,
				1798	"character mapping must be in range(256)");
				1799	Py_DECREF(x);
				1800	goto onError;
				1801	}
				1802	*s++ = (char)value;
				1803	}
				1804	else if (x == Py_None) {
				1805	/* undefined mapping */
				1806	if (charmap_encoding_error(&p, &s, errors,
				1807	"character maps to <undefined>")) {
				1808	Py_DECREF(x);
				1809	goto onError;
				1810	}
				1811	}
				1812	else if (PyString_Check(x)) {
				1813	if (PyString_GET_SIZE(x) != 1) {
				1814	/* 1-n mapping */
				1815	PyErr_SetString(PyExc_NotImplementedError,
				1816	"1-n mappings are currently not implemented");
				1817	Py_DECREF(x);
				1818	goto onError;
				1819	}
				1820	s++ = PyString_AS_STRING(x);
				1821	}
				1822	else {
				1823	/* wrong return value */
				1824	PyErr_SetString(PyExc_TypeError,
				1825	"character mapping must return integer, None or unicode");
				1826	Py_DECREF(x);
				1827	goto onError;
				1828	}
				1829	Py_DECREF(x);
				1830	}
				1831	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1832	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1833	goto onError;
				1834	return v;
				1835
				1836	onError:
				1837	Py_DECREF(v);
				1838	return NULL;
				1839	}
				1840
				1841	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1842	PyObject *mapping)
				1843	{
				1844	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1845	PyErr_BadArgument();
				1846	return NULL;
				1847	}
				1848	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1849	PyUnicode_GET_SIZE(unicode),
				1850	mapping,
				1851	NULL);
				1852	}
				1853
				1854	static
				1855	int translate_error(const Py_UNICODE **source,
				1856	Py_UNICODE **dest,
				1857	const char *errors,
				1858	const char *details)
				1859	{
				1860	if ((errors == NULL) \|\|
				1861	(strcmp(errors,"strict") == 0)) {
				1862	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1863	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1864	details);
				1865	return -1;
				1866	}
				1867	else if (strcmp(errors,"ignore") == 0) {
				1868	return 0;
				1869	}
				1870	else if (strcmp(errors,"replace") == 0) {
				1871	**dest = '?';
				1872	(*dest)++;
				1873	return 0;
				1874	}
				1875	else {
				1876	PyErr_Format(PyExc_ValueError,
				1877	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1878	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1879	errors);
				1880	return -1;
				1881	}
				1882	}
				1883
				1884	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1885	int size,
				1886	PyObject *mapping,
				1887	const char *errors)
				1888	{
				1889	PyUnicodeObject *v;
				1890	Py_UNICODE *p;
				1891
				1892	if (mapping == NULL) {
				1893	PyErr_BadArgument();
				1894	return NULL;
				1895	}
				1896
				1897	/* Output will never be longer than input */
				1898	v = _PyUnicode_New(size);
				1899	if (v == NULL)
				1900	goto onError;
				1901	if (size == 0)
				1902	goto done;
				1903	p = PyUnicode_AS_UNICODE(v);
				1904	while (size-- > 0) {
				1905	Py_UNICODE ch = *s++;
				1906	PyObject w, x;
				1907
				1908	/* Get mapping */
				1909	w = PyInt_FromLong(ch);
				1910	if (w == NULL)
				1911	goto onError;
				1912	x = PyObject_GetItem(mapping, w);
				1913	Py_DECREF(w);
				1914	if (x == NULL) {
				1915	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1916	/* No mapping found: default to 1-1 mapping */
				1917	PyErr_Clear();
				1918	*p++ = ch;
				1919	continue;
				1920	}
				1921	goto onError;
				1922	}
				1923
				1924	/* Apply mapping */
				1925	if (PyInt_Check(x))
				1926	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1927	else if (x == Py_None) {
				1928	/* undefined mapping */
				1929	if (translate_error(&s, &p, errors,
				1930	"character maps to <undefined>")) {
				1931	Py_DECREF(x);
				1932	goto onError;
				1933	}
				1934	}
				1935	else if (PyUnicode_Check(x)) {
				1936	if (PyUnicode_GET_SIZE(x) != 1) {
				1937	/* 1-n mapping */
				1938	PyErr_SetString(PyExc_NotImplementedError,
				1939	"1-n mappings are currently not implemented");
				1940	Py_DECREF(x);
				1941	goto onError;
				1942	}
				1943	p++ = PyUnicode_AS_UNICODE(x);
				1944	}
				1945	else {
				1946	/* wrong return value */
				1947	PyErr_SetString(PyExc_TypeError,
				1948	"translate mapping must return integer, None or unicode");
				1949	Py_DECREF(x);
				1950	goto onError;
				1951	}
				1952	Py_DECREF(x);
				1953	}
				1954	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1955	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1956	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1957
				1958	done:
				1959	return (PyObject *)v;
				1960
				1961	onError:
				1962	Py_XDECREF(v);
				1963	return NULL;
				1964	}
				1965
				1966	PyObject PyUnicode_Translate(PyObject str,
				1967	PyObject *mapping,
				1968	const char *errors)
				1969	{
				1970	PyObject *result;
				1971
				1972	str = PyUnicode_FromObject(str);
				1973	if (str == NULL)
				1974	goto onError;
				1975	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				1976	PyUnicode_GET_SIZE(str),
				1977	mapping,
				1978	errors);
				1979	Py_DECREF(str);
				1980	return result;
				1981
				1982	onError:
				1983	Py_XDECREF(str);
				1984	return NULL;
				1985	}
				1986
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	1987	/* --- Decimal Encoder ---------------------------------------------------- */
				1988
				1989	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				1990	int length,
				1991	char *output,
				1992	const char *errors)
				1993	{
				1994	Py_UNICODE p, end;
				1995
				1996	if (output == NULL) {
				1997	PyErr_BadArgument();
				1998	return -1;
				1999	}
				2000
				2001	p = s;
				2002	end = s + length;
				2003	while (p < end) {
				2004	register Py_UNICODE ch = *p++;
				2005	int decimal;
				2006
				2007	if (Py_UNICODE_ISSPACE(ch)) {
				2008	*output++ = ' ';
				2009	continue;
				2010	}
				2011	decimal = Py_UNICODE_TODECIMAL(ch);
				2012	if (decimal >= 0) {
				2013	*output++ = '0' + decimal;
				2014	continue;
				2015	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2016	if (0 < ch && ch < 256) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2017	*output++ = ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2018	continue;
				2019	}
				2020	/* All other characters are considered invalid */
				2021	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2022	PyErr_SetString(PyExc_ValueError,
				2023	"invalid decimal Unicode string");
				2024	goto onError;
				2025	}
				2026	else if (strcmp(errors, "ignore") == 0)
				2027	continue;
				2028	else if (strcmp(errors, "replace") == 0) {
				2029	*output++ = '?';
				2030	continue;
				2031	}
				2032	}
				2033	/* 0-terminate the output string */
				2034	*output++ = '\0';
				2035	return 0;
				2036
				2037	onError:
				2038	return -1;
				2039	}
				2040
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2041	/* --- Helpers ------------------------------------------------------------ */
				2042
				2043	static
				2044	int count(PyUnicodeObject *self,
				2045	int start,
				2046	int end,
				2047	PyUnicodeObject *substring)
				2048	{
				2049	int count = 0;
				2050
				2051	end -= substring->length;
				2052
				2053	while (start <= end)
				2054	if (Py_UNICODE_MATCH(self, start, substring)) {
				2055	count++;
				2056	start += substring->length;
				2057	} else
				2058	start++;
				2059
				2060	return count;
				2061	}
				2062
				2063	int PyUnicode_Count(PyObject *str,
				2064	PyObject *substr,
				2065	int start,
				2066	int end)
				2067	{
				2068	int result;
				2069
				2070	str = PyUnicode_FromObject(str);
				2071	if (str == NULL)
				2072	return -1;
				2073	substr = PyUnicode_FromObject(substr);
				2074	if (substr == NULL) {
				2075	Py_DECREF(substr);
				2076	return -1;
				2077	}
				2078
				2079	result = count((PyUnicodeObject *)str,
				2080	start, end,
				2081	(PyUnicodeObject *)substr);
				2082
				2083	Py_DECREF(str);
				2084	Py_DECREF(substr);
				2085	return result;
				2086	}
				2087
				2088	static
				2089	int findstring(PyUnicodeObject *self,
				2090	PyUnicodeObject *substring,
				2091	int start,
				2092	int end,
				2093	int direction)
				2094	{
				2095	if (start < 0)
				2096	start += self->length;
				2097	if (start < 0)
				2098	start = 0;
				2099
				2100	if (substring->length == 0)
				2101	return start;
				2102
				2103	if (end > self->length)
				2104	end = self->length;
				2105	if (end < 0)
				2106	end += self->length;
				2107	if (end < 0)
				2108	end = 0;
				2109
				2110	end -= substring->length;
				2111
				2112	if (direction < 0) {
				2113	for (; end >= start; end--)
				2114	if (Py_UNICODE_MATCH(self, end, substring))
				2115	return end;
				2116	} else {
				2117	for (; start <= end; start++)
				2118	if (Py_UNICODE_MATCH(self, start, substring))
				2119	return start;
				2120	}
				2121
				2122	return -1;
				2123	}
				2124
				2125	int PyUnicode_Find(PyObject *str,
				2126	PyObject *substr,
				2127	int start,
				2128	int end,
				2129	int direction)
				2130	{
				2131	int result;
				2132
				2133	str = PyUnicode_FromObject(str);
				2134	if (str == NULL)
				2135	return -1;
				2136	substr = PyUnicode_FromObject(substr);
				2137	if (substr == NULL) {
				2138	Py_DECREF(substr);
				2139	return -1;
				2140	}
				2141
				2142	result = findstring((PyUnicodeObject *)str,
				2143	(PyUnicodeObject *)substr,
				2144	start, end, direction);
				2145	Py_DECREF(str);
				2146	Py_DECREF(substr);
				2147	return result;
				2148	}
				2149
				2150	static
				2151	int tailmatch(PyUnicodeObject *self,
				2152	PyUnicodeObject *substring,
				2153	int start,
				2154	int end,
				2155	int direction)
				2156	{
				2157	if (start < 0)
				2158	start += self->length;
				2159	if (start < 0)
				2160	start = 0;
				2161
				2162	if (substring->length == 0)
				2163	return 1;
				2164
				2165	if (end > self->length)
				2166	end = self->length;
				2167	if (end < 0)
				2168	end += self->length;
				2169	if (end < 0)
				2170	end = 0;
				2171
				2172	end -= substring->length;
				2173	if (end < start)
				2174	return 0;
				2175
				2176	if (direction > 0) {
				2177	if (Py_UNICODE_MATCH(self, end, substring))
				2178	return 1;
				2179	} else {
				2180	if (Py_UNICODE_MATCH(self, start, substring))
				2181	return 1;
				2182	}
				2183
				2184	return 0;
				2185	}
				2186
				2187	int PyUnicode_Tailmatch(PyObject *str,
				2188	PyObject *substr,
				2189	int start,
				2190	int end,
				2191	int direction)
				2192	{
				2193	int result;
				2194
				2195	str = PyUnicode_FromObject(str);
				2196	if (str == NULL)
				2197	return -1;
				2198	substr = PyUnicode_FromObject(substr);
				2199	if (substr == NULL) {
				2200	Py_DECREF(substr);
				2201	return -1;
				2202	}
				2203
				2204	result = tailmatch((PyUnicodeObject *)str,
				2205	(PyUnicodeObject *)substr,
				2206	start, end, direction);
				2207	Py_DECREF(str);
				2208	Py_DECREF(substr);
				2209	return result;
				2210	}
				2211
				2212	static
				2213	const Py_UNICODE findchar(const Py_UNICODE s,
				2214	int size,
				2215	Py_UNICODE ch)
				2216	{
				2217	/* like wcschr, but doesn't stop at NULL characters */
				2218
				2219	while (size-- > 0) {
				2220	if (*s == ch)
				2221	return s;
				2222	s++;
				2223	}
				2224
				2225	return NULL;
				2226	}
				2227
				2228	/* Apply fixfct filter to the Unicode object self and return a
				2229	reference to the modified object */
				2230
				2231	static
				2232	PyObject fixup(PyUnicodeObject self,
				2233	int (fixfct)(PyUnicodeObject s))
				2234	{
				2235
				2236	PyUnicodeObject *u;
				2237
				2238	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2239	self->length);
				2240	if (u == NULL)
				2241	return NULL;
				2242	if (!fixfct(u)) {
				2243	/* fixfct should return TRUE if it modified the buffer. If
				2244	FALSE, return a reference to the original buffer instead
				2245	(to save space, not time) */
				2246	Py_INCREF(self);
				2247	Py_DECREF(u);
				2248	return (PyObject*) self;
				2249	}
				2250	return (PyObject*) u;
				2251	}
				2252
				2253	static
				2254	int fixupper(PyUnicodeObject *self)
				2255	{
				2256	int len = self->length;
				2257	Py_UNICODE *s = self->str;
				2258	int status = 0;
				2259
				2260	while (len-- > 0) {
				2261	register Py_UNICODE ch;
				2262
				2263	ch = Py_UNICODE_TOUPPER(*s);
				2264	if (ch != *s) {
				2265	status = 1;
				2266	*s = ch;
				2267	}
				2268	s++;
				2269	}
				2270
				2271	return status;
				2272	}
				2273
				2274	static
				2275	int fixlower(PyUnicodeObject *self)
				2276	{
				2277	int len = self->length;
				2278	Py_UNICODE *s = self->str;
				2279	int status = 0;
				2280
				2281	while (len-- > 0) {
				2282	register Py_UNICODE ch;
				2283
				2284	ch = Py_UNICODE_TOLOWER(*s);
				2285	if (ch != *s) {
				2286	status = 1;
				2287	*s = ch;
				2288	}
				2289	s++;
				2290	}
				2291
				2292	return status;
				2293	}
				2294
				2295	static
				2296	int fixswapcase(PyUnicodeObject *self)
				2297	{
				2298	int len = self->length;
				2299	Py_UNICODE *s = self->str;
				2300	int status = 0;
				2301
				2302	while (len-- > 0) {
				2303	if (Py_UNICODE_ISUPPER(*s)) {
				2304	s = Py_UNICODE_TOLOWER(s);
				2305	status = 1;
				2306	} else if (Py_UNICODE_ISLOWER(*s)) {
				2307	s = Py_UNICODE_TOUPPER(s);
				2308	status = 1;
				2309	}
				2310	s++;
				2311	}
				2312
				2313	return status;
				2314	}
				2315
				2316	static
				2317	int fixcapitalize(PyUnicodeObject *self)
				2318	{
				2319	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2320	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2321	return 1;
				2322	}
				2323	return 0;
				2324	}
				2325
				2326	static
				2327	int fixtitle(PyUnicodeObject *self)
				2328	{
				2329	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2330	register Py_UNICODE *e;
				2331	int previous_is_cased;
				2332
				2333	/* Shortcut for single character strings */
				2334	if (PyUnicode_GET_SIZE(self) == 1) {
				2335	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2336	if (*p != ch) {
				2337	*p = ch;
				2338	return 1;
				2339	}
				2340	else
				2341	return 0;
				2342	}
				2343
				2344	e = p + PyUnicode_GET_SIZE(self);
				2345	previous_is_cased = 0;
				2346	for (; p < e; p++) {
				2347	register const Py_UNICODE ch = *p;
				2348
				2349	if (previous_is_cased)
				2350	*p = Py_UNICODE_TOLOWER(ch);
				2351	else
				2352	*p = Py_UNICODE_TOTITLE(ch);
				2353
				2354	if (Py_UNICODE_ISLOWER(ch) \|\|
				2355	Py_UNICODE_ISUPPER(ch) \|\|
				2356	Py_UNICODE_ISTITLE(ch))
				2357	previous_is_cased = 1;
				2358	else
				2359	previous_is_cased = 0;
				2360	}
				2361	return 1;
				2362	}
				2363
				2364	PyObject PyUnicode_Join(PyObject separator,
				2365	PyObject *seq)
				2366	{
				2367	Py_UNICODE *sep;
				2368	int seplen;
				2369	PyUnicodeObject *res = NULL;
				2370	int reslen = 0;
				2371	Py_UNICODE *p;
				2372	int seqlen = 0;
				2373	int sz = 100;
				2374	int i;
				2375
				2376	seqlen = PySequence_Length(seq);
				2377	if (seqlen < 0 && PyErr_Occurred())
				2378	return NULL;
				2379
				2380	if (separator == NULL) {
				2381	Py_UNICODE blank = ' ';
				2382	sep = &blank;
				2383	seplen = 1;
				2384	}
				2385	else {
				2386	separator = PyUnicode_FromObject(separator);
				2387	if (separator == NULL)
				2388	return NULL;
				2389	sep = PyUnicode_AS_UNICODE(separator);
				2390	seplen = PyUnicode_GET_SIZE(separator);
				2391	}
				2392
				2393	res = _PyUnicode_New(sz);
				2394	if (res == NULL)
				2395	goto onError;
				2396	p = PyUnicode_AS_UNICODE(res);
				2397	reslen = 0;
				2398
				2399	for (i = 0; i < seqlen; i++) {
				2400	int itemlen;
				2401	PyObject *item;
				2402
				2403	item = PySequence_GetItem(seq, i);
				2404	if (item == NULL)
				2405	goto onError;
				2406	if (!PyUnicode_Check(item)) {
				2407	PyObject *v;
				2408	v = PyUnicode_FromObject(item);
				2409	Py_DECREF(item);
				2410	item = v;
				2411	if (item == NULL)
				2412	goto onError;
				2413	}
				2414	itemlen = PyUnicode_GET_SIZE(item);
				2415	while (reslen + itemlen + seplen >= sz) {
				2416	if (_PyUnicode_Resize(res, sz*2))
				2417	goto onError;
				2418	sz *= 2;
				2419	p = PyUnicode_AS_UNICODE(res) + reslen;
				2420	}
				2421	if (i > 0) {
				2422	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2423	p += seplen;
				2424	reslen += seplen;
				2425	}
				2426	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2427	p += itemlen;
				2428	reslen += itemlen;
				2429	Py_DECREF(item);
				2430	}
				2431	if (_PyUnicode_Resize(res, reslen))
				2432	goto onError;
				2433
				2434	Py_XDECREF(separator);
				2435	return (PyObject *)res;
				2436
				2437	onError:
				2438	Py_XDECREF(separator);
				2439	Py_DECREF(res);
				2440	return NULL;
				2441	}
				2442
				2443	static
				2444	PyUnicodeObject pad(PyUnicodeObject self,
				2445	int left,
				2446	int right,
				2447	Py_UNICODE fill)
				2448	{
				2449	PyUnicodeObject *u;
				2450
				2451	if (left < 0)
				2452	left = 0;
				2453	if (right < 0)
				2454	right = 0;
				2455
				2456	if (left == 0 && right == 0) {
				2457	Py_INCREF(self);
				2458	return self;
				2459	}
				2460
				2461	u = _PyUnicode_New(left + self->length + right);
				2462	if (u) {
				2463	if (left)
				2464	Py_UNICODE_FILL(u->str, fill, left);
				2465	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2466	if (right)
				2467	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2468	}
				2469
				2470	return u;
				2471	}
				2472
				2473	#define SPLIT_APPEND(data, left, right) \
				2474	str = PyUnicode_FromUnicode(data + left, right - left); \
				2475	if (!str) \
				2476	goto onError; \
				2477	if (PyList_Append(list, str)) { \
				2478	Py_DECREF(str); \
				2479	goto onError; \
				2480	} \
				2481	else \
				2482	Py_DECREF(str);
				2483
				2484	static
				2485	PyObject split_whitespace(PyUnicodeObject self,
				2486	PyObject *list,
				2487	int maxcount)
				2488	{
				2489	register int i;
				2490	register int j;
				2491	int len = self->length;
				2492	PyObject *str;
				2493
				2494	for (i = j = 0; i < len; ) {
				2495	/* find a token */
				2496	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2497	i++;
				2498	j = i;
				2499	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2500	i++;
				2501	if (j < i) {
				2502	if (maxcount-- <= 0)
				2503	break;
				2504	SPLIT_APPEND(self->str, j, i);
				2505	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2506	i++;
				2507	j = i;
				2508	}
				2509	}
				2510	if (j < len) {
				2511	SPLIT_APPEND(self->str, j, len);
				2512	}
				2513	return list;
				2514
				2515	onError:
				2516	Py_DECREF(list);
				2517	return NULL;
				2518	}
				2519
				2520	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2521	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2522	{
				2523	register int i;
				2524	register int j;
				2525	int len;
				2526	PyObject *list;
				2527	PyObject *str;
				2528	Py_UNICODE *data;
				2529
				2530	string = PyUnicode_FromObject(string);
				2531	if (string == NULL)
				2532	return NULL;
				2533	data = PyUnicode_AS_UNICODE(string);
				2534	len = PyUnicode_GET_SIZE(string);
				2535
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2536	list = PyList_New(0);
				2537	if (!list)
				2538	goto onError;
				2539
				2540	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2541	int eol;
				2542
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2543	/* Find a line and append it */
				2544	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2545	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2546
				2547	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2548	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2549	if (i < len) {
				2550	if (data[i] == '\r' && i + 1 < len &&
				2551	data[i+1] == '\n')
				2552	i += 2;
				2553	else
				2554	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2555	if (keepends)
				2556	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2557	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2558	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2559	j = i;
				2560	}
				2561	if (j < len) {
				2562	SPLIT_APPEND(data, j, len);
				2563	}
				2564
				2565	Py_DECREF(string);
				2566	return list;
				2567
				2568	onError:
				2569	Py_DECREF(list);
				2570	Py_DECREF(string);
				2571	return NULL;
				2572	}
				2573
				2574	static
				2575	PyObject split_char(PyUnicodeObject self,
				2576	PyObject *list,
				2577	Py_UNICODE ch,
				2578	int maxcount)
				2579	{
				2580	register int i;
				2581	register int j;
				2582	int len = self->length;
				2583	PyObject *str;
				2584
				2585	for (i = j = 0; i < len; ) {
				2586	if (self->str[i] == ch) {
				2587	if (maxcount-- <= 0)
				2588	break;
				2589	SPLIT_APPEND(self->str, j, i);
				2590	i = j = i + 1;
				2591	} else
				2592	i++;
				2593	}
				2594	if (j <= len) {
				2595	SPLIT_APPEND(self->str, j, len);
				2596	}
				2597	return list;
				2598
				2599	onError:
				2600	Py_DECREF(list);
				2601	return NULL;
				2602	}
				2603
				2604	static
				2605	PyObject split_substring(PyUnicodeObject self,
				2606	PyObject *list,
				2607	PyUnicodeObject *substring,
				2608	int maxcount)
				2609	{
				2610	register int i;
				2611	register int j;
				2612	int len = self->length;
				2613	int sublen = substring->length;
				2614	PyObject *str;
				2615
				2616	for (i = j = 0; i < len - sublen; ) {
				2617	if (Py_UNICODE_MATCH(self, i, substring)) {
				2618	if (maxcount-- <= 0)
				2619	break;
				2620	SPLIT_APPEND(self->str, j, i);
				2621	i = j = i + sublen;
				2622	} else
				2623	i++;
				2624	}
				2625	if (j <= len) {
				2626	SPLIT_APPEND(self->str, j, len);
				2627	}
				2628	return list;
				2629
				2630	onError:
				2631	Py_DECREF(list);
				2632	return NULL;
				2633	}
				2634
				2635	#undef SPLIT_APPEND
				2636
				2637	static
				2638	PyObject split(PyUnicodeObject self,
				2639	PyUnicodeObject *substring,
				2640	int maxcount)
				2641	{
				2642	PyObject *list;
				2643
				2644	if (maxcount < 0)
				2645	maxcount = INT_MAX;
				2646
				2647	list = PyList_New(0);
				2648	if (!list)
				2649	return NULL;
				2650
				2651	if (substring == NULL)
				2652	return split_whitespace(self,list,maxcount);
				2653
				2654	else if (substring->length == 1)
				2655	return split_char(self,list,substring->str[0],maxcount);
				2656
				2657	else if (substring->length == 0) {
				2658	Py_DECREF(list);
				2659	PyErr_SetString(PyExc_ValueError, "empty separator");
				2660	return NULL;
				2661	}
				2662	else
				2663	return split_substring(self,list,substring,maxcount);
				2664	}
				2665
				2666	static
				2667	PyObject strip(PyUnicodeObject self,
				2668	int left,
				2669	int right)
				2670	{
				2671	Py_UNICODE *p = self->str;
				2672	int start = 0;
				2673	int end = self->length;
				2674
				2675	if (left)
				2676	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2677	start++;
				2678
				2679	if (right)
				2680	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2681	end--;
				2682
				2683	if (start == 0 && end == self->length) {
				2684	/* couldn't strip anything off, return original string */
				2685	Py_INCREF(self);
				2686	return (PyObject*) self;
				2687	}
				2688
				2689	return (PyObject*) PyUnicode_FromUnicode(
				2690	self->str + start,
				2691	end - start
				2692	);
				2693	}
				2694
				2695	static
				2696	PyObject replace(PyUnicodeObject self,
				2697	PyUnicodeObject *str1,
				2698	PyUnicodeObject *str2,
				2699	int maxcount)
				2700	{
				2701	PyUnicodeObject *u;
				2702
				2703	if (maxcount < 0)
				2704	maxcount = INT_MAX;
				2705
				2706	if (str1->length == 1 && str2->length == 1) {
				2707	int i;
				2708
				2709	/* replace characters */
				2710	if (!findchar(self->str, self->length, str1->str[0])) {
				2711	/* nothing to replace, return original string */
				2712	Py_INCREF(self);
				2713	u = self;
				2714	} else {
				2715	Py_UNICODE u1 = str1->str[0];
				2716	Py_UNICODE u2 = str2->str[0];
				2717
				2718	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2719	self->str,
				2720	self->length
				2721	);
				2722	if (u)
				2723	for (i = 0; i < u->length; i++)
				2724	if (u->str[i] == u1) {
				2725	if (--maxcount < 0)
				2726	break;
				2727	u->str[i] = u2;
				2728	}
				2729	}
				2730
				2731	} else {
				2732	int n, i;
				2733	Py_UNICODE *p;
				2734
				2735	/* replace strings */
				2736	n = count(self, 0, self->length, str1);
				2737	if (n > maxcount)
				2738	n = maxcount;
				2739	if (n == 0) {
				2740	/* nothing to replace, return original string */
				2741	Py_INCREF(self);
				2742	u = self;
				2743	} else {
				2744	u = _PyUnicode_New(
				2745	self->length + n * (str2->length - str1->length));
				2746	if (u) {
				2747	i = 0;
				2748	p = u->str;
				2749	while (i <= self->length - str1->length)
				2750	if (Py_UNICODE_MATCH(self, i, str1)) {
				2751	/* replace string segment */
				2752	Py_UNICODE_COPY(p, str2->str, str2->length);
				2753	p += str2->length;
				2754	i += str1->length;
				2755	if (--n <= 0) {
				2756	/* copy remaining part */
				2757	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2758	break;
				2759	}
				2760	} else
				2761	*p++ = self->str[i++];
				2762	}
				2763	}
				2764	}
				2765
				2766	return (PyObject *) u;
				2767	}
				2768
				2769	/* --- Unicode Object Methods --------------------------------------------- */
				2770
				2771	static char title__doc__[] =
				2772	"S.title() -> unicode\n\
				2773	\n\
				2774	Return a titlecased version of S, i.e. words start with title case\n\
				2775	characters, all remaining cased characters have lower case.";
				2776
				2777	static PyObject*
				2778	unicode_title(PyUnicodeObject self, PyObject args)
				2779	{
				2780	if (!PyArg_NoArgs(args))
				2781	return NULL;
				2782	return fixup(self, fixtitle);
				2783	}
				2784
				2785	static char capitalize__doc__[] =
				2786	"S.capitalize() -> unicode\n\
				2787	\n\
				2788	Return a capitalized version of S, i.e. make the first character\n\
				2789	have upper case.";
				2790
				2791	static PyObject*
				2792	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2793	{
				2794	if (!PyArg_NoArgs(args))
				2795	return NULL;
				2796	return fixup(self, fixcapitalize);
				2797	}
				2798
				2799	#if 0
				2800	static char capwords__doc__[] =
				2801	"S.capwords() -> unicode\n\
				2802	\n\
				2803	Apply .capitalize() to all words in S and return the result with\n\
				2804	normalized whitespace (all whitespace strings are replaced by ' ').";
				2805
				2806	static PyObject*
				2807	unicode_capwords(PyUnicodeObject self, PyObject args)
				2808	{
				2809	PyObject *list;
				2810	PyObject *item;
				2811	int i;
				2812
				2813	if (!PyArg_NoArgs(args))
				2814	return NULL;
				2815
				2816	/* Split into words */
				2817	list = split(self, NULL, -1);
				2818	if (!list)
				2819	return NULL;
				2820
				2821	/* Capitalize each word */
				2822	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2823	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2824	fixcapitalize);
				2825	if (item == NULL)
				2826	goto onError;
				2827	Py_DECREF(PyList_GET_ITEM(list, i));
				2828	PyList_SET_ITEM(list, i, item);
				2829	}
				2830
				2831	/* Join the words to form a new string */
				2832	item = PyUnicode_Join(NULL, list);
				2833
				2834	onError:
				2835	Py_DECREF(list);
				2836	return (PyObject *)item;
				2837	}
				2838	#endif
				2839
				2840	static char center__doc__[] =
				2841	"S.center(width) -> unicode\n\
				2842	\n\
				2843	Return S centered in a Unicode string of length width. Padding is done\n\
				2844	using spaces.";
				2845
				2846	static PyObject *
				2847	unicode_center(PyUnicodeObject self, PyObject args)
				2848	{
				2849	int marg, left;
				2850	int width;
				2851
				2852	if (!PyArg_ParseTuple(args, "i:center", &width))
				2853	return NULL;
				2854
				2855	if (self->length >= width) {
				2856	Py_INCREF(self);
				2857	return (PyObject*) self;
				2858	}
				2859
				2860	marg = width - self->length;
				2861	left = marg / 2 + (marg & width & 1);
				2862
				2863	return (PyObject*) pad(self, left, marg - left, ' ');
				2864	}
				2865
				2866	static int
				2867	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2868	{
				2869	int len1, len2;
				2870	Py_UNICODE *s1 = str1->str;
				2871	Py_UNICODE *s2 = str2->str;
				2872
				2873	len1 = str1->length;
				2874	len2 = str2->length;
				2875
				2876	while (len1 > 0 && len2 > 0) {
				2877	int cmp = (s1++) - (s2++);
				2878	if (cmp)
				2879	/* This should make Christian happy! */
				2880	return (cmp < 0) ? -1 : (cmp != 0);
				2881	len1--, len2--;
				2882	}
				2883
				2884	return (len1 < len2) ? -1 : (len1 != len2);
				2885	}
				2886
				2887	int PyUnicode_Compare(PyObject *left,
				2888	PyObject *right)
				2889	{
				2890	PyUnicodeObject u = NULL, v = NULL;
				2891	int result;
				2892
				2893	/* Coerce the two arguments */
				2894	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2895	if (u == NULL)
				2896	goto onError;
				2897	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2898	if (v == NULL)
				2899	goto onError;
				2900
				2901	/* Shortcut for emtpy or interned objects */
				2902	if (v == u) {
				2903	Py_DECREF(u);
				2904	Py_DECREF(v);
				2905	return 0;
				2906	}
				2907
				2908	result = unicode_compare(u, v);
				2909
				2910	Py_DECREF(u);
				2911	Py_DECREF(v);
				2912	return result;
				2913
				2914	onError:
				2915	Py_XDECREF(u);
				2916	Py_XDECREF(v);
				2917	return -1;
				2918	}
				2919
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2920	int PyUnicode_Contains(PyObject *container,
				2921	PyObject *element)
				2922	{
				2923	PyUnicodeObject u = NULL, v = NULL;
				2924	int result;
				2925	register const Py_UNICODE p, e;
				2926	register Py_UNICODE ch;
				2927
				2928	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2929	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2930	if (v == NULL)
				2931	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2932	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2933	if (u == NULL) {
				2934	Py_DECREF(v);
				2935	goto onError;
				2936	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2937
				2938	/* Check v in u */
				2939	if (PyUnicode_GET_SIZE(v) != 1) {
				2940	PyErr_SetString(PyExc_TypeError,
				2941	"string member test needs char left operand");
				2942	goto onError;
				2943	}
				2944	ch = *PyUnicode_AS_UNICODE(v);
				2945	p = PyUnicode_AS_UNICODE(u);
				2946	e = p + PyUnicode_GET_SIZE(u);
				2947	result = 0;
				2948	while (p < e) {
				2949	if (*p++ == ch) {
				2950	result = 1;
				2951	break;
				2952	}
				2953	}
				2954
				2955	Py_DECREF(u);
				2956	Py_DECREF(v);
				2957	return result;
				2958
				2959	onError:
				2960	Py_XDECREF(u);
				2961	Py_XDECREF(v);
				2962	return -1;
				2963	}
				2964
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2965	/* Concat to string or Unicode object giving a new Unicode object. */
				2966
				2967	PyObject PyUnicode_Concat(PyObject left,
				2968	PyObject *right)
				2969	{
				2970	PyUnicodeObject u = NULL, v = NULL, *w;
				2971
				2972	/* Coerce the two arguments */
				2973	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2974	if (u == NULL)
				2975	goto onError;
				2976	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2977	if (v == NULL)
				2978	goto onError;
				2979
				2980	/* Shortcuts */
				2981	if (v == unicode_empty) {
				2982	Py_DECREF(v);
				2983	return (PyObject *)u;
				2984	}
				2985	if (u == unicode_empty) {
				2986	Py_DECREF(u);
				2987	return (PyObject *)v;
				2988	}
				2989
				2990	/* Concat the two Unicode strings */
				2991	w = _PyUnicode_New(u->length + v->length);
				2992	if (w == NULL)
				2993	goto onError;
				2994	Py_UNICODE_COPY(w->str, u->str, u->length);
				2995	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				2996
				2997	Py_DECREF(u);
				2998	Py_DECREF(v);
				2999	return (PyObject *)w;
				3000
				3001	onError:
				3002	Py_XDECREF(u);
				3003	Py_XDECREF(v);
				3004	return NULL;
				3005	}
				3006
				3007	static char count__doc__[] =
				3008	"S.count(sub[, start[, end]]) -> int\n\
				3009	\n\
				3010	Return the number of occurrences of substring sub in Unicode string\n\
				3011	S[start:end]. Optional arguments start and end are\n\
				3012	interpreted as in slice notation.";
				3013
				3014	static PyObject *
				3015	unicode_count(PyUnicodeObject self, PyObject args)
				3016	{
				3017	PyUnicodeObject *substring;
				3018	int start = 0;
				3019	int end = INT_MAX;
				3020	PyObject *result;
				3021
				3022	if (!PyArg_ParseTuple(args, "O\|ii:count", &substring, &start, &end))
				3023	return NULL;
				3024
				3025	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3026	(PyObject *)substring);
				3027	if (substring == NULL)
				3028	return NULL;
				3029
				3030	if (substring->length == 0) {
				3031	Py_DECREF(substring);
				3032	return PyInt_FromLong((long) 0);
				3033	}
				3034
				3035	if (start < 0)
				3036	start += self->length;
				3037	if (start < 0)
				3038	start = 0;
				3039	if (end > self->length)
				3040	end = self->length;
				3041	if (end < 0)
				3042	end += self->length;
				3043	if (end < 0)
				3044	end = 0;
				3045
				3046	result = PyInt_FromLong((long) count(self, start, end, substring));
				3047
				3048	Py_DECREF(substring);
				3049	return result;
				3050	}
				3051
				3052	static char encode__doc__[] =
				3053	"S.encode([encoding[,errors]]) -> string\n\
				3054	\n\
				3055	Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
				3056	errors may be given to set a different error handling scheme. Default\n\
				3057	is 'strict' meaning that encoding errors raise a ValueError. Other\n\
				3058	possible values are 'ignore' and 'replace'.";
				3059
				3060	static PyObject *
				3061	unicode_encode(PyUnicodeObject self, PyObject args)
				3062	{
				3063	char *encoding = NULL;
				3064	char *errors = NULL;
				3065	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3066	return NULL;
				3067	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3068	}
				3069
				3070	static char expandtabs__doc__[] =
				3071	"S.expandtabs([tabsize]) -> unicode\n\
				3072	\n\
				3073	Return a copy of S where all tab characters are expanded using spaces.\n\
				3074	If tabsize is not given, a tab size of 8 characters is assumed.";
				3075
				3076	static PyObject*
				3077	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3078	{
				3079	Py_UNICODE *e;
				3080	Py_UNICODE *p;
				3081	Py_UNICODE *q;
				3082	int i, j;
				3083	PyUnicodeObject *u;
				3084	int tabsize = 8;
				3085
				3086	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3087	return NULL;
				3088
				3089	/* First pass: determine size of ouput string */
				3090	i = j = 0;
				3091	e = self->str + self->length;
				3092	for (p = self->str; p < e; p++)
				3093	if (*p == '\t') {
				3094	if (tabsize > 0)
				3095	j += tabsize - (j % tabsize);
				3096	}
				3097	else {
				3098	j++;
				3099	if (p == '\n' \|\| p == '\r') {
				3100	i += j;
				3101	j = 0;
				3102	}
				3103	}
				3104
				3105	/* Second pass: create output string and fill it */
				3106	u = _PyUnicode_New(i + j);
				3107	if (!u)
				3108	return NULL;
				3109
				3110	j = 0;
				3111	q = u->str;
				3112
				3113	for (p = self->str; p < e; p++)
				3114	if (*p == '\t') {
				3115	if (tabsize > 0) {
				3116	i = tabsize - (j % tabsize);
				3117	j += i;
				3118	while (i--)
				3119	*q++ = ' ';
				3120	}
				3121	}
				3122	else {
				3123	j++;
				3124	q++ = p;
				3125	if (p == '\n' \|\| p == '\r')
				3126	j = 0;
				3127	}
				3128
				3129	return (PyObject*) u;
				3130	}
				3131
				3132	static char find__doc__[] =
				3133	"S.find(sub [,start [,end]]) -> int\n\
				3134	\n\
				3135	Return the lowest index in S where substring sub is found,\n\
				3136	such that sub is contained within s[start,end]. Optional\n\
				3137	arguments start and end are interpreted as in slice notation.\n\
				3138	\n\
				3139	Return -1 on failure.";
				3140
				3141	static PyObject *
				3142	unicode_find(PyUnicodeObject self, PyObject args)
				3143	{
				3144	PyUnicodeObject *substring;
				3145	int start = 0;
				3146	int end = INT_MAX;
				3147	PyObject *result;
				3148
				3149	if (!PyArg_ParseTuple(args, "O\|ii:find", &substring, &start, &end))
				3150	return NULL;
				3151	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3152	(PyObject *)substring);
				3153	if (substring == NULL)
				3154	return NULL;
				3155
				3156	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3157
				3158	Py_DECREF(substring);
				3159	return result;
				3160	}
				3161
				3162	static PyObject *
				3163	unicode_getitem(PyUnicodeObject *self, int index)
				3164	{
				3165	if (index < 0 \|\| index >= self->length) {
				3166	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3167	return NULL;
				3168	}
				3169
				3170	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3171	}
				3172
				3173	static long
				3174	unicode_hash(PyUnicodeObject *self)
				3175	{
				3176	long hash;
				3177	PyObject *utf8;
				3178
				3179	/* Since Unicode objects compare equal to their UTF-8 string
				3180	counterparts, they should also use the UTF-8 strings as basis
				3181	for their hash value. This is needed to assure that strings and
				3182	Unicode objects behave in the same way as dictionary
				3183	keys. Unfortunately, this costs some performance and also some
				3184	memory if the cached UTF-8 representation is not used later
				3185	on. */
				3186	if (self->hash != -1)
				3187	return self->hash;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	3188	utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3189	if (utf8 == NULL)
				3190	return -1;
				3191	hash = PyObject_Hash(utf8);
				3192	if (hash == -1)
				3193	return -1;
				3194	self->hash = hash;
				3195	return hash;
				3196	}
				3197
				3198	static char index__doc__[] =
				3199	"S.index(sub [,start [,end]]) -> int\n\
				3200	\n\
				3201	Like S.find() but raise ValueError when the substring is not found.";
				3202
				3203	static PyObject *
				3204	unicode_index(PyUnicodeObject self, PyObject args)
				3205	{
				3206	int result;
				3207	PyUnicodeObject *substring;
				3208	int start = 0;
				3209	int end = INT_MAX;
				3210
				3211	if (!PyArg_ParseTuple(args, "O\|ii:index", &substring, &start, &end))
				3212	return NULL;
				3213
				3214	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3215	(PyObject *)substring);
				3216	if (substring == NULL)
				3217	return NULL;
				3218
				3219	result = findstring(self, substring, start, end, 1);
				3220
				3221	Py_DECREF(substring);
				3222	if (result < 0) {
				3223	PyErr_SetString(PyExc_ValueError, "substring not found");
				3224	return NULL;
				3225	}
				3226	return PyInt_FromLong(result);
				3227	}
				3228
				3229	static char islower__doc__[] =
				3230	"S.islower() -> int\n\
				3231	\n\
				3232	Return 1 if all cased characters in S are lowercase and there is\n\
				3233	at least one cased character in S, 0 otherwise.";
				3234
				3235	static PyObject*
				3236	unicode_islower(PyUnicodeObject self, PyObject args)
				3237	{
				3238	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3239	register const Py_UNICODE *e;
				3240	int cased;
				3241
				3242	if (!PyArg_NoArgs(args))
				3243	return NULL;
				3244
				3245	/* Shortcut for single character strings */
				3246	if (PyUnicode_GET_SIZE(self) == 1)
				3247	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3248
				3249	e = p + PyUnicode_GET_SIZE(self);
				3250	cased = 0;
				3251	for (; p < e; p++) {
				3252	register const Py_UNICODE ch = *p;
				3253
				3254	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3255	return PyInt_FromLong(0);
				3256	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3257	cased = 1;
				3258	}
				3259	return PyInt_FromLong(cased);
				3260	}
				3261
				3262	static char isupper__doc__[] =
				3263	"S.isupper() -> int\n\
				3264	\n\
				3265	Return 1 if all cased characters in S are uppercase and there is\n\
				3266	at least one cased character in S, 0 otherwise.";
				3267
				3268	static PyObject*
				3269	unicode_isupper(PyUnicodeObject self, PyObject args)
				3270	{
				3271	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3272	register const Py_UNICODE *e;
				3273	int cased;
				3274
				3275	if (!PyArg_NoArgs(args))
				3276	return NULL;
				3277
				3278	/* Shortcut for single character strings */
				3279	if (PyUnicode_GET_SIZE(self) == 1)
				3280	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3281
				3282	e = p + PyUnicode_GET_SIZE(self);
				3283	cased = 0;
				3284	for (; p < e; p++) {
				3285	register const Py_UNICODE ch = *p;
				3286
				3287	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3288	return PyInt_FromLong(0);
				3289	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3290	cased = 1;
				3291	}
				3292	return PyInt_FromLong(cased);
				3293	}
				3294
				3295	static char istitle__doc__[] =
				3296	"S.istitle() -> int\n\
				3297	\n\
				3298	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3299	may only follow uncased characters and lowercase characters only cased\n\
				3300	ones. Return 0 otherwise.";
				3301
				3302	static PyObject*
				3303	unicode_istitle(PyUnicodeObject self, PyObject args)
				3304	{
				3305	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3306	register const Py_UNICODE *e;
				3307	int cased, previous_is_cased;
				3308
				3309	if (!PyArg_NoArgs(args))
				3310	return NULL;
				3311
				3312	/* Shortcut for single character strings */
				3313	if (PyUnicode_GET_SIZE(self) == 1)
				3314	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3315	(Py_UNICODE_ISUPPER(*p) != 0));
				3316
				3317	e = p + PyUnicode_GET_SIZE(self);
				3318	cased = 0;
				3319	previous_is_cased = 0;
				3320	for (; p < e; p++) {
				3321	register const Py_UNICODE ch = *p;
				3322
				3323	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3324	if (previous_is_cased)
				3325	return PyInt_FromLong(0);
				3326	previous_is_cased = 1;
				3327	cased = 1;
				3328	}
				3329	else if (Py_UNICODE_ISLOWER(ch)) {
				3330	if (!previous_is_cased)
				3331	return PyInt_FromLong(0);
				3332	previous_is_cased = 1;
				3333	cased = 1;
				3334	}
				3335	else
				3336	previous_is_cased = 0;
				3337	}
				3338	return PyInt_FromLong(cased);
				3339	}
				3340
				3341	static char isspace__doc__[] =
				3342	"S.isspace() -> int\n\
				3343	\n\
				3344	Return 1 if there are only whitespace characters in S,\n\
				3345	0 otherwise.";
				3346
				3347	static PyObject*
				3348	unicode_isspace(PyUnicodeObject self, PyObject args)
				3349	{
				3350	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3351	register const Py_UNICODE *e;
				3352
				3353	if (!PyArg_NoArgs(args))
				3354	return NULL;
				3355
				3356	/* Shortcut for single character strings */
				3357	if (PyUnicode_GET_SIZE(self) == 1 &&
				3358	Py_UNICODE_ISSPACE(*p))
				3359	return PyInt_FromLong(1);
				3360
				3361	e = p + PyUnicode_GET_SIZE(self);
				3362	for (; p < e; p++) {
				3363	if (!Py_UNICODE_ISSPACE(*p))
				3364	return PyInt_FromLong(0);
				3365	}
				3366	return PyInt_FromLong(1);
				3367	}
				3368
				3369	static char isdecimal__doc__[] =
				3370	"S.isdecimal() -> int\n\
				3371	\n\
				3372	Return 1 if there are only decimal characters in S,\n\
				3373	0 otherwise.";
				3374
				3375	static PyObject*
				3376	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3377	{
				3378	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3379	register const Py_UNICODE *e;
				3380
				3381	if (!PyArg_NoArgs(args))
				3382	return NULL;
				3383
				3384	/* Shortcut for single character strings */
				3385	if (PyUnicode_GET_SIZE(self) == 1 &&
				3386	Py_UNICODE_ISDECIMAL(*p))
				3387	return PyInt_FromLong(1);
				3388
				3389	e = p + PyUnicode_GET_SIZE(self);
				3390	for (; p < e; p++) {
				3391	if (!Py_UNICODE_ISDECIMAL(*p))
				3392	return PyInt_FromLong(0);
				3393	}
				3394	return PyInt_FromLong(1);
				3395	}
				3396
				3397	static char isdigit__doc__[] =
				3398	"S.isdigit() -> int\n\
				3399	\n\
				3400	Return 1 if there are only digit characters in S,\n\
				3401	0 otherwise.";
				3402
				3403	static PyObject*
				3404	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3405	{
				3406	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3407	register const Py_UNICODE *e;
				3408
				3409	if (!PyArg_NoArgs(args))
				3410	return NULL;
				3411
				3412	/* Shortcut for single character strings */
				3413	if (PyUnicode_GET_SIZE(self) == 1 &&
				3414	Py_UNICODE_ISDIGIT(*p))
				3415	return PyInt_FromLong(1);
				3416
				3417	e = p + PyUnicode_GET_SIZE(self);
				3418	for (; p < e; p++) {
				3419	if (!Py_UNICODE_ISDIGIT(*p))
				3420	return PyInt_FromLong(0);
				3421	}
				3422	return PyInt_FromLong(1);
				3423	}
				3424
				3425	static char isnumeric__doc__[] =
				3426	"S.isnumeric() -> int\n\
				3427	\n\
				3428	Return 1 if there are only numeric characters in S,\n\
				3429	0 otherwise.";
				3430
				3431	static PyObject*
				3432	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3433	{
				3434	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3435	register const Py_UNICODE *e;
				3436
				3437	if (!PyArg_NoArgs(args))
				3438	return NULL;
				3439
				3440	/* Shortcut for single character strings */
				3441	if (PyUnicode_GET_SIZE(self) == 1 &&
				3442	Py_UNICODE_ISNUMERIC(*p))
				3443	return PyInt_FromLong(1);
				3444
				3445	e = p + PyUnicode_GET_SIZE(self);
				3446	for (; p < e; p++) {
				3447	if (!Py_UNICODE_ISNUMERIC(*p))
				3448	return PyInt_FromLong(0);
				3449	}
				3450	return PyInt_FromLong(1);
				3451	}
				3452
				3453	static char join__doc__[] =
				3454	"S.join(sequence) -> unicode\n\
				3455	\n\
				3456	Return a string which is the concatenation of the strings in the\n\
				3457	sequence. The separator between elements is S.";
				3458
				3459	static PyObject*
				3460	unicode_join(PyUnicodeObject self, PyObject args)
				3461	{
				3462	PyObject *data;
				3463	if (!PyArg_ParseTuple(args, "O:join", &data))
				3464	return NULL;
				3465
				3466	return PyUnicode_Join((PyObject *)self, data);
				3467	}
				3468
				3469	static int
				3470	unicode_length(PyUnicodeObject *self)
				3471	{
				3472	return self->length;
				3473	}
				3474
				3475	static char ljust__doc__[] =
				3476	"S.ljust(width) -> unicode\n\
				3477	\n\
				3478	Return S left justified in a Unicode string of length width. Padding is\n\
				3479	done using spaces.";
				3480
				3481	static PyObject *
				3482	unicode_ljust(PyUnicodeObject self, PyObject args)
				3483	{
				3484	int width;
				3485	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3486	return NULL;
				3487
				3488	if (self->length >= width) {
				3489	Py_INCREF(self);
				3490	return (PyObject*) self;
				3491	}
				3492
				3493	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3494	}
				3495
				3496	static char lower__doc__[] =
				3497	"S.lower() -> unicode\n\
				3498	\n\
				3499	Return a copy of the string S converted to lowercase.";
				3500
				3501	static PyObject*
				3502	unicode_lower(PyUnicodeObject self, PyObject args)
				3503	{
				3504	if (!PyArg_NoArgs(args))
				3505	return NULL;
				3506	return fixup(self, fixlower);
				3507	}
				3508
				3509	static char lstrip__doc__[] =
				3510	"S.lstrip() -> unicode\n\
				3511	\n\
				3512	Return a copy of the string S with leading whitespace removed.";
				3513
				3514	static PyObject *
				3515	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3516	{
				3517	if (!PyArg_NoArgs(args))
				3518	return NULL;
				3519	return strip(self, 1, 0);
				3520	}
				3521
				3522	static PyObject*
				3523	unicode_repeat(PyUnicodeObject *str, int len)
				3524	{
				3525	PyUnicodeObject *u;
				3526	Py_UNICODE *p;
				3527
				3528	if (len < 0)
				3529	len = 0;
				3530
				3531	if (len == 1) {
				3532	/* no repeat, return original string */
				3533	Py_INCREF(str);
				3534	return (PyObject*) str;
				3535	}
				3536
				3537	u = _PyUnicode_New(len * str->length);
				3538	if (!u)
				3539	return NULL;
				3540
				3541	p = u->str;
				3542
				3543	while (len-- > 0) {
				3544	Py_UNICODE_COPY(p, str->str, str->length);
				3545	p += str->length;
				3546	}
				3547
				3548	return (PyObject*) u;
				3549	}
				3550
				3551	PyObject PyUnicode_Replace(PyObject obj,
				3552	PyObject *subobj,
				3553	PyObject *replobj,
				3554	int maxcount)
				3555	{
				3556	PyObject *self;
				3557	PyObject *str1;
				3558	PyObject *str2;
				3559	PyObject *result;
				3560
				3561	self = PyUnicode_FromObject(obj);
				3562	if (self == NULL)
				3563	return NULL;
				3564	str1 = PyUnicode_FromObject(subobj);
				3565	if (str1 == NULL) {
				3566	Py_DECREF(self);
				3567	return NULL;
				3568	}
				3569	str2 = PyUnicode_FromObject(replobj);
				3570	if (str2 == NULL) {
				3571	Py_DECREF(self);
				3572	Py_DECREF(str1);
				3573	return NULL;
				3574	}
				3575	result = replace((PyUnicodeObject *)self,
				3576	(PyUnicodeObject *)str1,
				3577	(PyUnicodeObject *)str2,
				3578	maxcount);
				3579	Py_DECREF(self);
				3580	Py_DECREF(str1);
				3581	Py_DECREF(str2);
				3582	return result;
				3583	}
				3584
				3585	static char replace__doc__[] =
				3586	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3587	\n\
				3588	Return a copy of S with all occurrences of substring\n\
				3589	old replaced by new. If the optional argument maxsplit is\n\
				3590	given, only the first maxsplit occurrences are replaced.";
				3591
				3592	static PyObject*
				3593	unicode_replace(PyUnicodeObject self, PyObject args)
				3594	{
				3595	PyUnicodeObject *str1;
				3596	PyUnicodeObject *str2;
				3597	int maxcount = -1;
				3598	PyObject *result;
				3599
				3600	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3601	return NULL;
				3602	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3603	if (str1 == NULL)
				3604	return NULL;
				3605	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3606	if (str2 == NULL)
				3607	return NULL;
				3608
				3609	result = replace(self, str1, str2, maxcount);
				3610
				3611	Py_DECREF(str1);
				3612	Py_DECREF(str2);
				3613	return result;
				3614	}
				3615
				3616	static
				3617	PyObject unicode_repr(PyObject unicode)
				3618	{
				3619	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3620	PyUnicode_GET_SIZE(unicode),
				3621	1);
				3622	}
				3623
				3624	static char rfind__doc__[] =
				3625	"S.rfind(sub [,start [,end]]) -> int\n\
				3626	\n\
				3627	Return the highest index in S where substring sub is found,\n\
				3628	such that sub is contained within s[start,end]. Optional\n\
				3629	arguments start and end are interpreted as in slice notation.\n\
				3630	\n\
				3631	Return -1 on failure.";
				3632
				3633	static PyObject *
				3634	unicode_rfind(PyUnicodeObject self, PyObject args)
				3635	{
				3636	PyUnicodeObject *substring;
				3637	int start = 0;
				3638	int end = INT_MAX;
				3639	PyObject *result;
				3640
				3641	if (!PyArg_ParseTuple(args, "O\|ii:rfind", &substring, &start, &end))
				3642	return NULL;
				3643	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3644	(PyObject *)substring);
				3645	if (substring == NULL)
				3646	return NULL;
				3647
				3648	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3649
				3650	Py_DECREF(substring);
				3651	return result;
				3652	}
				3653
				3654	static char rindex__doc__[] =
				3655	"S.rindex(sub [,start [,end]]) -> int\n\
				3656	\n\
				3657	Like S.rfind() but raise ValueError when the substring is not found.";
				3658
				3659	static PyObject *
				3660	unicode_rindex(PyUnicodeObject self, PyObject args)
				3661	{
				3662	int result;
				3663	PyUnicodeObject *substring;
				3664	int start = 0;
				3665	int end = INT_MAX;
				3666
				3667	if (!PyArg_ParseTuple(args, "O\|ii:rindex", &substring, &start, &end))
				3668	return NULL;
				3669	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3670	(PyObject *)substring);
				3671	if (substring == NULL)
				3672	return NULL;
				3673
				3674	result = findstring(self, substring, start, end, -1);
				3675
				3676	Py_DECREF(substring);
				3677	if (result < 0) {
				3678	PyErr_SetString(PyExc_ValueError, "substring not found");
				3679	return NULL;
				3680	}
				3681	return PyInt_FromLong(result);
				3682	}
				3683
				3684	static char rjust__doc__[] =
				3685	"S.rjust(width) -> unicode\n\
				3686	\n\
				3687	Return S right justified in a Unicode string of length width. Padding is\n\
				3688	done using spaces.";
				3689
				3690	static PyObject *
				3691	unicode_rjust(PyUnicodeObject self, PyObject args)
				3692	{
				3693	int width;
				3694	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3695	return NULL;
				3696
				3697	if (self->length >= width) {
				3698	Py_INCREF(self);
				3699	return (PyObject*) self;
				3700	}
				3701
				3702	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3703	}
				3704
				3705	static char rstrip__doc__[] =
				3706	"S.rstrip() -> unicode\n\
				3707	\n\
				3708	Return a copy of the string S with trailing whitespace removed.";
				3709
				3710	static PyObject *
				3711	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3712	{
				3713	if (!PyArg_NoArgs(args))
				3714	return NULL;
				3715	return strip(self, 0, 1);
				3716	}
				3717
				3718	static PyObject*
				3719	unicode_slice(PyUnicodeObject *self, int start, int end)
				3720	{
				3721	/* standard clamping */
				3722	if (start < 0)
				3723	start = 0;
				3724	if (end < 0)
				3725	end = 0;
				3726	if (end > self->length)
				3727	end = self->length;
				3728	if (start == 0 && end == self->length) {
				3729	/* full slice, return original string */
				3730	Py_INCREF(self);
				3731	return (PyObject*) self;
				3732	}
				3733	if (start > end)
				3734	start = end;
				3735	/* copy slice */
				3736	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3737	end - start);
				3738	}
				3739
				3740	PyObject PyUnicode_Split(PyObject s,
				3741	PyObject *sep,
				3742	int maxsplit)
				3743	{
				3744	PyObject *result;
				3745
				3746	s = PyUnicode_FromObject(s);
				3747	if (s == NULL)
				3748	return NULL;
				3749	if (sep != NULL) {
				3750	sep = PyUnicode_FromObject(sep);
				3751	if (sep == NULL) {
				3752	Py_DECREF(s);
				3753	return NULL;
				3754	}
				3755	}
				3756
				3757	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3758
				3759	Py_DECREF(s);
				3760	Py_XDECREF(sep);
				3761	return result;
				3762	}
				3763
				3764	static char split__doc__[] =
				3765	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3766	\n\
				3767	Return a list of the words in S, using sep as the\n\
				3768	delimiter string. If maxsplit is given, at most maxsplit\n\
				3769	splits are done. If sep is not specified, any whitespace string\n\
				3770	is a separator.";
				3771
				3772	static PyObject*
				3773	unicode_split(PyUnicodeObject self, PyObject args)
				3774	{
				3775	PyObject *substring = Py_None;
				3776	int maxcount = -1;
				3777
				3778	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3779	return NULL;
				3780
				3781	if (substring == Py_None)
				3782	return split(self, NULL, maxcount);
				3783	else if (PyUnicode_Check(substring))
				3784	return split(self, (PyUnicodeObject *)substring, maxcount);
				3785	else
				3786	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3787	}
				3788
				3789	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3790	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3791	\n\
				3792	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3793	Line breaks are not included in the resulting list unless keepends\n\
				3794	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3795
				3796	static PyObject*
				3797	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3798	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3799	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3800
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3801	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3802	return NULL;
				3803
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3804	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3805	}
				3806
				3807	static
				3808	PyObject unicode_str(PyUnicodeObject self)
				3809	{
				3810	return PyUnicode_AsUTF8String((PyObject *)self);
				3811	}
				3812
				3813	static char strip__doc__[] =
				3814	"S.strip() -> unicode\n\
				3815	\n\
				3816	Return a copy of S with leading and trailing whitespace removed.";
				3817
				3818	static PyObject *
				3819	unicode_strip(PyUnicodeObject self, PyObject args)
				3820	{
				3821	if (!PyArg_NoArgs(args))
				3822	return NULL;
				3823	return strip(self, 1, 1);
				3824	}
				3825
				3826	static char swapcase__doc__[] =
				3827	"S.swapcase() -> unicode\n\
				3828	\n\
				3829	Return a copy of S with uppercase characters converted to lowercase\n\
				3830	and vice versa.";
				3831
				3832	static PyObject*
				3833	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3834	{
				3835	if (!PyArg_NoArgs(args))
				3836	return NULL;
				3837	return fixup(self, fixswapcase);
				3838	}
				3839
				3840	static char translate__doc__[] =
				3841	"S.translate(table) -> unicode\n\
				3842	\n\
				3843	Return a copy of the string S, where all characters have been mapped\n\
				3844	through the given translation table, which must be a mapping of\n\
				3845	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3846	are left untouched. Characters mapped to None are deleted.";
				3847
				3848	static PyObject*
				3849	unicode_translate(PyUnicodeObject self, PyObject args)
				3850	{
				3851	PyObject *table;
				3852
				3853	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3854	return NULL;
				3855	return PyUnicode_TranslateCharmap(self->str,
				3856	self->length,
				3857	table,
				3858	"ignore");
				3859	}
				3860
				3861	static char upper__doc__[] =
				3862	"S.upper() -> unicode\n\
				3863	\n\
				3864	Return a copy of S converted to uppercase.";
				3865
				3866	static PyObject*
				3867	unicode_upper(PyUnicodeObject self, PyObject args)
				3868	{
				3869	if (!PyArg_NoArgs(args))
				3870	return NULL;
				3871	return fixup(self, fixupper);
				3872	}
				3873
				3874	#if 0
				3875	static char zfill__doc__[] =
				3876	"S.zfill(width) -> unicode\n\
				3877	\n\
				3878	Pad a numeric string x with zeros on the left, to fill a field\n\
				3879	of the specified width. The string x is never truncated.";
				3880
				3881	static PyObject *
				3882	unicode_zfill(PyUnicodeObject self, PyObject args)
				3883	{
				3884	int fill;
				3885	PyUnicodeObject *u;
				3886
				3887	int width;
				3888	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				3889	return NULL;
				3890
				3891	if (self->length >= width) {
				3892	Py_INCREF(self);
				3893	return (PyObject*) self;
				3894	}
				3895
				3896	fill = width - self->length;
				3897
				3898	u = pad(self, fill, 0, '0');
				3899
				3900	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				3901	/* move sign to beginning of string */
				3902	u->str[0] = u->str[fill];
				3903	u->str[fill] = '0';
				3904	}
				3905
				3906	return (PyObject*) u;
				3907	}
				3908	#endif
				3909
				3910	#if 0
				3911	static PyObject*
				3912	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				3913	{
				3914	if (!PyArg_NoArgs(args))
				3915	return NULL;
				3916	return PyInt_FromLong(unicode_freelist_size);
				3917	}
				3918	#endif
				3919
				3920	static char startswith__doc__[] =
				3921	"S.startswith(prefix[, start[, end]]) -> int\n\
				3922	\n\
				3923	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3924	optional start, test S beginning at that position. With optional end, stop\n\
				3925	comparing S at that position.";
				3926
				3927	static PyObject *
				3928	unicode_startswith(PyUnicodeObject *self,
				3929	PyObject *args)
				3930	{
				3931	PyUnicodeObject *substring;
				3932	int start = 0;
				3933	int end = INT_MAX;
				3934	PyObject *result;
				3935
				3936	if (!PyArg_ParseTuple(args, "O\|ii:startswith", &substring, &start, &end))
				3937	return NULL;
				3938	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3939	(PyObject *)substring);
				3940	if (substring == NULL)
				3941	return NULL;
				3942
				3943	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				3944
				3945	Py_DECREF(substring);
				3946	return result;
				3947	}
				3948
				3949
				3950	static char endswith__doc__[] =
				3951	"S.endswith(suffix[, start[, end]]) -> int\n\
				3952	\n\
				3953	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				3954	optional start, test S beginning at that position. With optional end, stop\n\
				3955	comparing S at that position.";
				3956
				3957	static PyObject *
				3958	unicode_endswith(PyUnicodeObject *self,
				3959	PyObject *args)
				3960	{
				3961	PyUnicodeObject *substring;
				3962	int start = 0;
				3963	int end = INT_MAX;
				3964	PyObject *result;
				3965
				3966	if (!PyArg_ParseTuple(args, "O\|ii:endswith", &substring, &start, &end))
				3967	return NULL;
				3968	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3969	(PyObject *)substring);
				3970	if (substring == NULL)
				3971	return NULL;
				3972
				3973	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				3974
				3975	Py_DECREF(substring);
				3976	return result;
				3977	}
				3978
				3979
				3980	static PyMethodDef unicode_methods[] = {
				3981
				3982	/* Order is according to common usage: often used methods should
				3983	appear first, since lookup is done sequentially. */
				3984
				3985	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				3986	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				3987	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				3988	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				3989	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				3990	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				3991	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				3992	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				3993	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				3994	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				3995	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				3996	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				3997	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				3998	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				3999	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4000	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4001	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4002	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4003	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4004	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4005	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4006	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4007	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4008	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4009	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4010	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4011	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4012	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4013	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4014	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4015	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4016	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4017	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				4018	#if 0
				4019	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4020	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4021	#endif
				4022
				4023	#if 0
				4024	/* This one is just used for debugging the implementation. */
				4025	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4026	#endif
				4027
				4028	{NULL, NULL}
				4029	};
				4030
				4031	static PyObject *
				4032	unicode_getattr(PyUnicodeObject self, char name)
				4033	{
				4034	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4035	}
				4036
				4037	static PySequenceMethods unicode_as_sequence = {
				4038	(inquiry) unicode_length, /* sq_length */
				4039	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4040	(intargfunc) unicode_repeat, /* sq_repeat */
				4041	(intargfunc) unicode_getitem, /* sq_item */
				4042	(intintargfunc) unicode_slice, /* sq_slice */
				4043	0, /* sq_ass_item */
				4044	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4045	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4046	};
				4047
				4048	static int
				4049	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4050	int index,
				4051	const void **ptr)
				4052	{
				4053	if (index != 0) {
				4054	PyErr_SetString(PyExc_SystemError,
				4055	"accessing non-existent unicode segment");
				4056	return -1;
				4057	}
				4058	ptr = (void ) self->str;
				4059	return PyUnicode_GET_DATA_SIZE(self);
				4060	}
				4061
				4062	static int
				4063	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4064	const void **ptr)
				4065	{
				4066	PyErr_SetString(PyExc_TypeError,
				4067	"cannot use unicode as modifyable buffer");
				4068	return -1;
				4069	}
				4070
				4071	static int
				4072	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4073	int *lenp)
				4074	{
				4075	if (lenp)
				4076	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4077	return 1;
				4078	}
				4079
				4080	static int
				4081	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4082	int index,
				4083	const void **ptr)
				4084	{
				4085	PyObject *str;
				4086
				4087	if (index != 0) {
				4088	PyErr_SetString(PyExc_SystemError,
				4089	"accessing non-existent unicode segment");
				4090	return -1;
				4091	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4092	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4093	if (str == NULL)
				4094	return -1;
				4095	ptr = (void ) PyString_AS_STRING(str);
				4096	return PyString_GET_SIZE(str);
				4097	}
				4098
				4099	/* Helpers for PyUnicode_Format() */
				4100
				4101	static PyObject *
				4102	getnextarg(args, arglen, p_argidx)
				4103	PyObject *args;
				4104	int arglen;
				4105	int *p_argidx;
				4106	{
				4107	int argidx = *p_argidx;
				4108	if (argidx < arglen) {
				4109	(*p_argidx)++;
				4110	if (arglen < 0)
				4111	return args;
				4112	else
				4113	return PyTuple_GetItem(args, argidx);
				4114	}
				4115	PyErr_SetString(PyExc_TypeError,
				4116	"not enough arguments for format string");
				4117	return NULL;
				4118	}
				4119
				4120	#define F_LJUST (1<<0)
				4121	#define F_SIGN (1<<1)
				4122	#define F_BLANK (1<<2)
				4123	#define F_ALT (1<<3)
				4124	#define F_ZERO (1<<4)
				4125
				4126	static
				4127	#ifdef HAVE_STDARG_PROTOTYPES
				4128	int usprintf(register Py_UNICODE buffer, char format, ...)
				4129	#else
				4130	int usprintf(va_alist) va_dcl
				4131	#endif
				4132	{
				4133	register int i;
				4134	int len;
				4135	va_list va;
				4136	char *charbuffer;
				4137	#ifdef HAVE_STDARG_PROTOTYPES
				4138	va_start(va, format);
				4139	#else
				4140	Py_UNICODE *args;
				4141	char *format;
				4142
				4143	va_start(va);
				4144	buffer = va_arg(va, Py_UNICODE *);
				4145	format = va_arg(va, char *);
				4146	#endif
				4147
				4148	/* First, format the string as char array, then expand to Py_UNICODE
				4149	array. */
				4150	charbuffer = (char *)buffer;
				4151	len = vsprintf(charbuffer, format, va);
				4152	for (i = len - 1; i >= 0; i--)
				4153	buffer[i] = (Py_UNICODE) charbuffer[i];
				4154
				4155	va_end(va);
				4156	return len;
				4157	}
				4158
				4159	static int
				4160	formatfloat(Py_UNICODE *buf,
				4161	int flags,
				4162	int prec,
				4163	int type,
				4164	PyObject *v)
				4165	{
				4166	char fmt[20];
				4167	double x;
				4168
				4169	x = PyFloat_AsDouble(v);
				4170	if (x == -1.0 && PyErr_Occurred())
				4171	return -1;
				4172	if (prec < 0)
				4173	prec = 6;
				4174	if (prec > 50)
				4175	prec = 50; /* Arbitrary limitation */
				4176	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4177	type = 'g';
				4178	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4179	return usprintf(buf, fmt, x);
				4180	}
				4181
				4182	static int
				4183	formatint(Py_UNICODE *buf,
				4184	int flags,
				4185	int prec,
				4186	int type,
				4187	PyObject *v)
				4188	{
				4189	char fmt[20];
				4190	long x;
				4191
				4192	x = PyInt_AsLong(v);
				4193	if (x == -1 && PyErr_Occurred())
				4194	return -1;
				4195	if (prec < 0)
				4196	prec = 1;
				4197	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4198	return usprintf(buf, fmt, x);
				4199	}
				4200
				4201	static int
				4202	formatchar(Py_UNICODE *buf,
				4203	PyObject *v)
				4204	{
				4205	if (PyUnicode_Check(v))
				4206	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4207
				4208	else if (PyString_Check(v))
				4209	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4210
				4211	else {
				4212	/* Integer input truncated to a character */
				4213	long x;
				4214	x = PyInt_AsLong(v);
				4215	if (x == -1 && PyErr_Occurred())
				4216	return -1;
				4217	buf[0] = (char) x;
				4218	}
				4219	buf[1] = '\0';
				4220	return 1;
				4221	}
				4222
				4223	PyObject PyUnicode_Format(PyObject format,
				4224	PyObject *args)
				4225	{
				4226	Py_UNICODE fmt, res;
				4227	int fmtcnt, rescnt, reslen, arglen, argidx;
				4228	int args_owned = 0;
				4229	PyUnicodeObject *result = NULL;
				4230	PyObject *dict = NULL;
				4231	PyObject *uformat;
				4232
				4233	if (format == NULL \|\| args == NULL) {
				4234	PyErr_BadInternalCall();
				4235	return NULL;
				4236	}
				4237	uformat = PyUnicode_FromObject(format);
				4238	fmt = PyUnicode_AS_UNICODE(uformat);
				4239	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4240
				4241	reslen = rescnt = fmtcnt + 100;
				4242	result = _PyUnicode_New(reslen);
				4243	if (result == NULL)
				4244	goto onError;
				4245	res = PyUnicode_AS_UNICODE(result);
				4246
				4247	if (PyTuple_Check(args)) {
				4248	arglen = PyTuple_Size(args);
				4249	argidx = 0;
				4250	}
				4251	else {
				4252	arglen = -1;
				4253	argidx = -2;
				4254	}
				4255	if (args->ob_type->tp_as_mapping)
				4256	dict = args;
				4257
				4258	while (--fmtcnt >= 0) {
				4259	if (*fmt != '%') {
				4260	if (--rescnt < 0) {
				4261	rescnt = fmtcnt + 100;
				4262	reslen += rescnt;
				4263	if (_PyUnicode_Resize(result, reslen) < 0)
				4264	return NULL;
				4265	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4266	--rescnt;
				4267	}
				4268	res++ = fmt++;
				4269	}
				4270	else {
				4271	/* Got a format specifier */
				4272	int flags = 0;
				4273	int width = -1;
				4274	int prec = -1;
				4275	int size = 0;
				4276	Py_UNICODE c = '\0';
				4277	Py_UNICODE fill;
				4278	PyObject *v = NULL;
				4279	PyObject *temp = NULL;
				4280	Py_UNICODE *buf;
				4281	Py_UNICODE sign;
				4282	int len;
				4283	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4284
				4285	fmt++;
				4286	if (*fmt == '(') {
				4287	Py_UNICODE *keystart;
				4288	int keylen;
				4289	PyObject *key;
				4290	int pcount = 1;
				4291
				4292	if (dict == NULL) {
				4293	PyErr_SetString(PyExc_TypeError,
				4294	"format requires a mapping");
				4295	goto onError;
				4296	}
				4297	++fmt;
				4298	--fmtcnt;
				4299	keystart = fmt;
				4300	/* Skip over balanced parentheses */
				4301	while (pcount > 0 && --fmtcnt >= 0) {
				4302	if (*fmt == ')')
				4303	--pcount;
				4304	else if (*fmt == '(')
				4305	++pcount;
				4306	fmt++;
				4307	}
				4308	keylen = fmt - keystart - 1;
				4309	if (fmtcnt < 0 \|\| pcount > 0) {
				4310	PyErr_SetString(PyExc_ValueError,
				4311	"incomplete format key");
				4312	goto onError;
				4313	}
				4314	/* keys are converted to strings (using UTF-8) and
				4315	then looked up since Python uses strings to hold
				4316	variables names etc. in its namespaces and we
				4317	wouldn't want to break common idioms. The
				4318	alternative would be using Unicode objects for the
				4319	lookup but u"abc" and "abc" have different hash
				4320	values (on purpose). */
				4321	key = PyUnicode_EncodeUTF8(keystart,
				4322	keylen,
				4323	NULL);
				4324	if (key == NULL)
				4325	goto onError;
				4326	if (args_owned) {
				4327	Py_DECREF(args);
				4328	args_owned = 0;
				4329	}
				4330	args = PyObject_GetItem(dict, key);
				4331	Py_DECREF(key);
				4332	if (args == NULL) {
				4333	goto onError;
				4334	}
				4335	args_owned = 1;
				4336	arglen = -1;
				4337	argidx = -2;
				4338	}
				4339	while (--fmtcnt >= 0) {
				4340	switch (c = *fmt++) {
				4341	case '-': flags \|= F_LJUST; continue;
				4342	case '+': flags \|= F_SIGN; continue;
				4343	case ' ': flags \|= F_BLANK; continue;
				4344	case '#': flags \|= F_ALT; continue;
				4345	case '0': flags \|= F_ZERO; continue;
				4346	}
				4347	break;
				4348	}
				4349	if (c == '*') {
				4350	v = getnextarg(args, arglen, &argidx);
				4351	if (v == NULL)
				4352	goto onError;
				4353	if (!PyInt_Check(v)) {
				4354	PyErr_SetString(PyExc_TypeError,
				4355	"* wants int");
				4356	goto onError;
				4357	}
				4358	width = PyInt_AsLong(v);
				4359	if (width < 0) {
				4360	flags \|= F_LJUST;
				4361	width = -width;
				4362	}
				4363	if (--fmtcnt >= 0)
				4364	c = *fmt++;
				4365	}
				4366	else if (c >= '0' && c <= '9') {
				4367	width = c - '0';
				4368	while (--fmtcnt >= 0) {
				4369	c = *fmt++;
				4370	if (c < '0' \|\| c > '9')
				4371	break;
				4372	if ((width*10) / 10 != width) {
				4373	PyErr_SetString(PyExc_ValueError,
				4374	"width too big");
				4375	goto onError;
				4376	}
				4377	width = width*10 + (c - '0');
				4378	}
				4379	}
				4380	if (c == '.') {
				4381	prec = 0;
				4382	if (--fmtcnt >= 0)
				4383	c = *fmt++;
				4384	if (c == '*') {
				4385	v = getnextarg(args, arglen, &argidx);
				4386	if (v == NULL)
				4387	goto onError;
				4388	if (!PyInt_Check(v)) {
				4389	PyErr_SetString(PyExc_TypeError,
				4390	"* wants int");
				4391	goto onError;
				4392	}
				4393	prec = PyInt_AsLong(v);
				4394	if (prec < 0)
				4395	prec = 0;
				4396	if (--fmtcnt >= 0)
				4397	c = *fmt++;
				4398	}
				4399	else if (c >= '0' && c <= '9') {
				4400	prec = c - '0';
				4401	while (--fmtcnt >= 0) {
				4402	c = Py_CHARMASK(*fmt++);
				4403	if (c < '0' \|\| c > '9')
				4404	break;
				4405	if ((prec*10) / 10 != prec) {
				4406	PyErr_SetString(PyExc_ValueError,
				4407	"prec too big");
				4408	goto onError;
				4409	}
				4410	prec = prec*10 + (c - '0');
				4411	}
				4412	}
				4413	} /* prec */
				4414	if (fmtcnt >= 0) {
				4415	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4416	size = c;
				4417	if (--fmtcnt >= 0)
				4418	c = *fmt++;
				4419	}
				4420	}
				4421	if (fmtcnt < 0) {
				4422	PyErr_SetString(PyExc_ValueError,
				4423	"incomplete format");
				4424	goto onError;
				4425	}
				4426	if (c != '%') {
				4427	v = getnextarg(args, arglen, &argidx);
				4428	if (v == NULL)
				4429	goto onError;
				4430	}
				4431	sign = 0;
				4432	fill = ' ';
				4433	switch (c) {
				4434
				4435	case '%':
				4436	buf = tmpbuf;
				4437	buf[0] = '%';
				4438	len = 1;
				4439	break;
				4440
				4441	case 's':
				4442	case 'r':
				4443	if (PyUnicode_Check(v) && c == 's') {
				4444	temp = v;
				4445	Py_INCREF(temp);
				4446	}
				4447	else {
				4448	PyObject *unicode;
				4449	if (c == 's')
				4450	temp = PyObject_Str(v);
				4451	else
				4452	temp = PyObject_Repr(v);
				4453	if (temp == NULL)
				4454	goto onError;
				4455	if (!PyString_Check(temp)) {
				4456	/* XXX Note: this should never happen, since
				4457	PyObject_Repr() and PyObject_Str() assure
				4458	this */
				4459	Py_DECREF(temp);
				4460	PyErr_SetString(PyExc_TypeError,
				4461	"%s argument has non-string str()");
				4462	goto onError;
				4463	}
				4464	unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
				4465	PyString_GET_SIZE(temp),
				4466	"strict");
				4467	Py_DECREF(temp);
				4468	temp = unicode;
				4469	if (temp == NULL)
				4470	goto onError;
				4471	}
				4472	buf = PyUnicode_AS_UNICODE(temp);
				4473	len = PyUnicode_GET_SIZE(temp);
				4474	if (prec >= 0 && len > prec)
				4475	len = prec;
				4476	break;
				4477
				4478	case 'i':
				4479	case 'd':
				4480	case 'u':
				4481	case 'o':
				4482	case 'x':
				4483	case 'X':
				4484	if (c == 'i')
				4485	c = 'd';
				4486	buf = tmpbuf;
				4487	len = formatint(buf, flags, prec, c, v);
				4488	if (len < 0)
				4489	goto onError;
				4490	sign = (c == 'd');
				4491	if (flags & F_ZERO) {
				4492	fill = '0';
				4493	if ((flags&F_ALT) &&
				4494	(c == 'x' \|\| c == 'X') &&
				4495	buf[0] == '0' && buf[1] == c) {
				4496	res++ = buf++;
				4497	res++ = buf++;
				4498	rescnt -= 2;
				4499	len -= 2;
				4500	width -= 2;
				4501	if (width < 0)
				4502	width = 0;
				4503	}
				4504	}
				4505	break;
				4506
				4507	case 'e':
				4508	case 'E':
				4509	case 'f':
				4510	case 'g':
				4511	case 'G':
				4512	buf = tmpbuf;
				4513	len = formatfloat(buf, flags, prec, c, v);
				4514	if (len < 0)
				4515	goto onError;
				4516	sign = 1;
				4517	if (flags&F_ZERO)
				4518	fill = '0';
				4519	break;
				4520
				4521	case 'c':
				4522	buf = tmpbuf;
				4523	len = formatchar(buf, v);
				4524	if (len < 0)
				4525	goto onError;
				4526	break;
				4527
				4528	default:
				4529	PyErr_Format(PyExc_ValueError,
				4530	"unsupported format character '%c' (0x%x)",
				4531	c, c);
				4532	goto onError;
				4533	}
				4534	if (sign) {
				4535	if (buf == '-' \|\| buf == '+') {
				4536	sign = *buf++;
				4537	len--;
				4538	}
				4539	else if (flags & F_SIGN)
				4540	sign = '+';
				4541	else if (flags & F_BLANK)
				4542	sign = ' ';
				4543	else
				4544	sign = 0;
				4545	}
				4546	if (width < len)
				4547	width = len;
				4548	if (rescnt < width + (sign != 0)) {
				4549	reslen -= rescnt;
				4550	rescnt = width + fmtcnt + 100;
				4551	reslen += rescnt;
				4552	if (_PyUnicode_Resize(result, reslen) < 0)
				4553	return NULL;
				4554	res = PyUnicode_AS_UNICODE(result)
				4555	+ reslen - rescnt;
				4556	}
				4557	if (sign) {
				4558	if (fill != ' ')
				4559	*res++ = sign;
				4560	rescnt--;
				4561	if (width > len)
				4562	width--;
				4563	}
				4564	if (width > len && !(flags & F_LJUST)) {
				4565	do {
				4566	--rescnt;
				4567	*res++ = fill;
				4568	} while (--width > len);
				4569	}
				4570	if (sign && fill == ' ')
				4571	*res++ = sign;
				4572	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4573	res += len;
				4574	rescnt -= len;
				4575	while (--width >= len) {
				4576	--rescnt;
				4577	*res++ = ' ';
				4578	}
				4579	if (dict && (argidx < arglen) && c != '%') {
				4580	PyErr_SetString(PyExc_TypeError,
				4581	"not all arguments converted");
				4582	goto onError;
				4583	}
				4584	Py_XDECREF(temp);
				4585	} /* '%' */
				4586	} /* until end */
				4587	if (argidx < arglen && !dict) {
				4588	PyErr_SetString(PyExc_TypeError,
				4589	"not all arguments converted");
				4590	goto onError;
				4591	}
				4592
				4593	if (args_owned) {
				4594	Py_DECREF(args);
				4595	}
				4596	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4597	if (_PyUnicode_Resize(result, reslen - rescnt))
				4598	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4599	return (PyObject *)result;
				4600
				4601	onError:
				4602	Py_XDECREF(result);
				4603	Py_DECREF(uformat);
				4604	if (args_owned) {
				4605	Py_DECREF(args);
				4606	}
				4607	return NULL;
				4608	}
				4609
				4610	static PyBufferProcs unicode_as_buffer = {
				4611	(getreadbufferproc) unicode_buffer_getreadbuf,
				4612	(getwritebufferproc) unicode_buffer_getwritebuf,
				4613	(getsegcountproc) unicode_buffer_getsegcount,
				4614	(getcharbufferproc) unicode_buffer_getcharbuf,
				4615	};
				4616
				4617	PyTypeObject PyUnicode_Type = {
				4618	PyObject_HEAD_INIT(&PyType_Type)
				4619	0, /* ob_size */
				4620	"unicode", /* tp_name */
				4621	sizeof(PyUnicodeObject), /* tp_size */
				4622	0, /* tp_itemsize */
				4623	/* Slots */
				4624	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4625	0, /* tp_print */
				4626	(getattrfunc)unicode_getattr, /* tp_getattr */
				4627	0, /* tp_setattr */
				4628	(cmpfunc) unicode_compare, /* tp_compare */
				4629	(reprfunc) unicode_repr, /* tp_repr */
				4630	0, /* tp_as_number */
				4631	&unicode_as_sequence, /* tp_as_sequence */
				4632	0, /* tp_as_mapping */
				4633	(hashfunc) unicode_hash, /* tp_hash*/
				4634	0, /* tp_call*/
				4635	(reprfunc) unicode_str, /* tp_str */
				4636	(getattrofunc) NULL, /* tp_getattro */
				4637	(setattrofunc) NULL, /* tp_setattro */
				4638	&unicode_as_buffer, /* tp_as_buffer */
				4639	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4640	};
				4641
				4642	/* Initialize the Unicode implementation */
				4643
				4644	void _PyUnicode_Init()
				4645	{
				4646	/* Doublecheck the configuration... */
				4647	if (sizeof(Py_UNICODE) != 2)
				4648	Py_FatalError("Unicode configuration error: "
				4649	"sizeof(Py_UNICODE) != 2 bytes");
				4650
				4651	unicode_empty = _PyUnicode_New(0);
				4652	}
				4653
				4654	/* Finalize the Unicode implementation */
				4655
				4656	void
				4657	_PyUnicode_Fini()
				4658	{
				4659	PyUnicodeObject *u = unicode_freelist;
				4660
				4661	while (u != NULL) {
				4662	PyUnicodeObject *v = u;
				4663	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4664	if (v->str)
				4665	free(v->str);
				4666	Py_XDECREF(v->utf8str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4667	free(v);
				4668	}
				4669	Py_XDECREF(unicode_empty);
				4670	}