Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 02d1b0d5d8ab535dbbe3609c614a2253d825189e [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	69	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	#if defined(HAVE_LIMITS_H)
				72	#include <limits.h>
				73	#else
				74	#define INT_MAX 2147483647
				75	#endif
				76
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	77	#ifdef MS_WIN32
				78	#include <windows.h>
				79	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	80
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	81	/* Limit for the Unicode object free list */
				82
				83	#define MAX_UNICODE_FREELIST_SIZE 1024
				84
				85	/* Limit for the Unicode object free list stay alive optimization.
				86
				87	The implementation will keep allocated Unicode memory intact for
				88	all objects on the free list having a size less than this
				89	limit. This reduces malloc() overhead for small Unicode objects.
				90
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	91	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	92	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	93	malloc()-overhead) bytes of unused garbage.
				94
				95	Setting the limit to 0 effectively turns the feature off.
				96
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	97	Note: This is an experimental feature ! If you get core dumps when
				98	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	99
				100	*/
				101
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	102	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103
				104	/* Endianness switches; defaults to little endian */
				105
				106	#ifdef WORDS_BIGENDIAN
				107	# define BYTEORDER_IS_BIG_ENDIAN
				108	#else
				109	# define BYTEORDER_IS_LITTLE_ENDIAN
				110	#endif
				111
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	112	/* --- Globals ------------------------------------------------------------
				113
				114	The globals are initialized by the _PyUnicode_Init() API and should
				115	not be used before calling that API.
				116
				117	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
				119	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	120	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	121
				122	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	123	static PyUnicodeObject *unicode_freelist;
				124	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	125
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	126	/* Default encoding to use and assume when NULL is passed as encoding
				127	parameter; it is initialized by _PyUnicode_Init().
				128
				129	Always use the PyUnicode_SetDefaultEncoding() and
				130	PyUnicode_GetDefaultEncoding() APIs to access this global.
				131
				132	*/
				133
				134	static char unicode_default_encoding[100];
				135
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	136	/* --- Unicode Object ----------------------------------------------------- */
				137
				138	static
				139	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				140	int length)
				141	{
				142	void *oldstr;
				143
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	144	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	145	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	146	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	147
				148	/* Resizing unicode_empty is not allowed. */
				149	if (unicode == unicode_empty) {
				150	PyErr_SetString(PyExc_SystemError,
				151	"can't resize empty unicode object");
				152	return -1;
				153	}
				154
				155	/* We allocate one more byte to make sure the string is
				156	Ux0000 terminated -- XXX is this needed ? */
				157	oldstr = unicode->str;
				158	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				159	if (!unicode->str) {
				160	unicode->str = oldstr;
				161	PyErr_NoMemory();
				162	return -1;
				163	}
				164	unicode->str[length] = 0;
				165	unicode->length = length;
				166
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	167	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	168	/* Reset the object caches */
				169	if (unicode->utf8str) {
				170	Py_DECREF(unicode->utf8str);
				171	unicode->utf8str = NULL;
				172	}
				173	unicode->hash = -1;
				174
				175	return 0;
				176	}
				177
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	178	int PyUnicode_Resize(PyObject **unicode,
				179	int length)
				180	{
				181	PyUnicodeObject *v;
				182
				183	if (unicode == NULL) {
				184	PyErr_BadInternalCall();
				185	return -1;
				186	}
				187	v = (PyUnicodeObject )unicode;
				188	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				189	PyErr_BadInternalCall();
				190	return -1;
				191	}
				192	return _PyUnicode_Resize(v, length);
				193	}
				194
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	195	/* We allocate one more byte to make sure the string is
				196	Ux0000 terminated -- XXX is this needed ?
				197
				198	XXX This allocator could further be enhanced by assuring that the
				199	free list never reduces its size below 1.
				200
				201	*/
				202
				203	static
				204	PyUnicodeObject *_PyUnicode_New(int length)
				205	{
				206	register PyUnicodeObject *unicode;
				207
				208	/* Optimization for empty strings */
				209	if (length == 0 && unicode_empty != NULL) {
				210	Py_INCREF(unicode_empty);
				211	return unicode_empty;
				212	}
				213
				214	/* Unicode freelist & memory allocation */
				215	if (unicode_freelist) {
				216	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	217	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	220	/* Keep-Alive optimization: we only upsize the buffer,
				221	never downsize it. */
				222	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	223	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	224	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	225	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	}
				227	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	228	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	230	}
				231	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	232	}
				233	else {
				234	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				235	if (unicode == NULL)
				236	return NULL;
				237	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				238	}
				239
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	240	if (!unicode->str) {
				241	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	243	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	unicode->str[length] = 0;
				245	unicode->length = length;
				246	unicode->hash = -1;
				247	unicode->utf8str = NULL;
				248	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	249
				250	onError:
				251	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	252	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	253	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	254	}
				255
				256	static
				257	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				258	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	259	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	260	/* Keep-Alive optimization */
				261	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	262	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	263	unicode->str = NULL;
				264	unicode->length = 0;
				265	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	266	if (unicode->utf8str) {
				267	Py_DECREF(unicode->utf8str);
				268	unicode->utf8str = NULL;
				269	}
				270	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	271	(PyUnicodeObject *)unicode = unicode_freelist;
				272	unicode_freelist = unicode;
				273	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	274	}
				275	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	276	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	277	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	278	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	279	}
				280	}
				281
				282	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				283	int size)
				284	{
				285	PyUnicodeObject *unicode;
				286
				287	unicode = _PyUnicode_New(size);
				288	if (!unicode)
				289	return NULL;
				290
				291	/* Copy the Unicode data into the new object */
				292	if (u != NULL)
				293	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				294
				295	return (PyObject *)unicode;
				296	}
				297
				298	#ifdef HAVE_WCHAR_H
				299
				300	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				301	int size)
				302	{
				303	PyUnicodeObject *unicode;
				304
				305	if (w == NULL) {
				306	PyErr_BadInternalCall();
				307	return NULL;
				308	}
				309
				310	unicode = _PyUnicode_New(size);
				311	if (!unicode)
				312	return NULL;
				313
				314	/* Copy the wchar_t data into the new object */
				315	#ifdef HAVE_USABLE_WCHAR_T
				316	memcpy(unicode->str, w, size * sizeof(wchar_t));
				317	#else
				318	{
				319	register Py_UNICODE *u;
				320	register int i;
				321	u = PyUnicode_AS_UNICODE(unicode);
				322	for (i = size; i >= 0; i--)
				323	u++ = w++;
				324	}
				325	#endif
				326
				327	return (PyObject *)unicode;
				328	}
				329
				330	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				331	register wchar_t *w,
				332	int size)
				333	{
				334	if (unicode == NULL) {
				335	PyErr_BadInternalCall();
				336	return -1;
				337	}
				338	if (size > PyUnicode_GET_SIZE(unicode))
				339	size = PyUnicode_GET_SIZE(unicode);
				340	#ifdef HAVE_USABLE_WCHAR_T
				341	memcpy(w, unicode->str, size * sizeof(wchar_t));
				342	#else
				343	{
				344	register Py_UNICODE *u;
				345	register int i;
				346	u = PyUnicode_AS_UNICODE(unicode);
				347	for (i = size; i >= 0; i--)
				348	w++ = u++;
				349	}
				350	#endif
				351
				352	return size;
				353	}
				354
				355	#endif
				356
				357	PyObject PyUnicode_FromObject(register PyObject obj)
				358	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	359	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				360	}
				361
				362	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				363	const char *encoding,
				364	const char *errors)
				365	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	366	const char *s;
				367	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368	int owned = 0;
				369	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	370
				371	if (obj == NULL) {
				372	PyErr_BadInternalCall();
				373	return NULL;
				374	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	375
				376	/* Coerce object */
				377	if (PyInstance_Check(obj)) {
				378	PyObject *func;
				379	func = PyObject_GetAttrString(obj, "__str__");
				380	if (func == NULL) {
				381	PyErr_SetString(PyExc_TypeError,
				382	"coercing to Unicode: instance doesn't define __str__");
				383	return NULL;
				384	}
				385	obj = PyEval_CallObject(func, NULL);
				386	Py_DECREF(func);
				387	if (obj == NULL)
				388	return NULL;
				389	owned = 1;
				390	}
				391	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	392	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	v = obj;
				394	if (encoding) {
				395	PyErr_SetString(PyExc_TypeError,
				396	"decoding Unicode is not supported");
				397	return NULL;
				398	}
				399	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	400	}
				401	else if (PyString_Check(obj)) {
				402	s = PyString_AS_STRING(obj);
				403	len = PyString_GET_SIZE(obj);
				404	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	405	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				406	/* Overwrite the error message with something more useful in
				407	case of a TypeError. */
				408	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	409	PyErr_Format(PyExc_TypeError,
				410	"coercing to Unicode: need string or buffer, "
				411	"%.80s found",
				412	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	413	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	414	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	415
				416	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	417	if (len == 0) {
				418	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	419	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	420	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	421	else
				422	v = PyUnicode_Decode(s, len, encoding, errors);
				423	done:
				424	if (owned)
				425	Py_DECREF(obj);
				426	return v;
				427
				428	onError:
				429	if (owned)
				430	Py_DECREF(obj);
				431	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	432	}
				433
				434	PyObject PyUnicode_Decode(const char s,
				435	int size,
				436	const char *encoding,
				437	const char *errors)
				438	{
				439	PyObject buffer = NULL, unicode;
				440
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	441	if (encoding == NULL)
				442	encoding = PyUnicode_GetDefaultEncoding();
				443
				444	/* Shortcuts for common default encodings */
				445	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	446	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	447	else if (strcmp(encoding, "latin-1") == 0)
				448	return PyUnicode_DecodeLatin1(s, size, errors);
				449	else if (strcmp(encoding, "ascii") == 0)
				450	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	451
				452	/* Decode via the codec registry */
				453	buffer = PyBuffer_FromMemory((void *)s, size);
				454	if (buffer == NULL)
				455	goto onError;
				456	unicode = PyCodec_Decode(buffer, encoding, errors);
				457	if (unicode == NULL)
				458	goto onError;
				459	if (!PyUnicode_Check(unicode)) {
				460	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	461	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	462	unicode->ob_type->tp_name);
				463	Py_DECREF(unicode);
				464	goto onError;
				465	}
				466	Py_DECREF(buffer);
				467	return unicode;
				468
				469	onError:
				470	Py_XDECREF(buffer);
				471	return NULL;
				472	}
				473
				474	PyObject PyUnicode_Encode(const Py_UNICODE s,
				475	int size,
				476	const char *encoding,
				477	const char *errors)
				478	{
				479	PyObject v, unicode;
				480
				481	unicode = PyUnicode_FromUnicode(s, size);
				482	if (unicode == NULL)
				483	return NULL;
				484	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				485	Py_DECREF(unicode);
				486	return v;
				487	}
				488
				489	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				490	const char *encoding,
				491	const char *errors)
				492	{
				493	PyObject *v;
				494
				495	if (!PyUnicode_Check(unicode)) {
				496	PyErr_BadArgument();
				497	goto onError;
				498	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	499
				500	if (encoding == NULL)
				501	encoding = PyUnicode_GetDefaultEncoding();
				502
				503	/* Shortcuts for common default encodings */
				504	if (errors == NULL) {
				505	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	506	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	507	else if (strcmp(encoding, "latin-1") == 0)
				508	return PyUnicode_AsLatin1String(unicode);
				509	else if (strcmp(encoding, "ascii") == 0)
				510	return PyUnicode_AsASCIIString(unicode);
				511	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	512
				513	/* Encode via the codec registry */
				514	v = PyCodec_Encode(unicode, encoding, errors);
				515	if (v == NULL)
				516	goto onError;
				517	/* XXX Should we really enforce this ? */
				518	if (!PyString_Check(v)) {
				519	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	520	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	521	v->ob_type->tp_name);
				522	Py_DECREF(v);
				523	goto onError;
				524	}
				525	return v;
				526
				527	onError:
				528	return NULL;
				529	}
				530
				531	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				532	{
				533	if (!PyUnicode_Check(unicode)) {
				534	PyErr_BadArgument();
				535	goto onError;
				536	}
				537	return PyUnicode_AS_UNICODE(unicode);
				538
				539	onError:
				540	return NULL;
				541	}
				542
				543	int PyUnicode_GetSize(PyObject *unicode)
				544	{
				545	if (!PyUnicode_Check(unicode)) {
				546	PyErr_BadArgument();
				547	goto onError;
				548	}
				549	return PyUnicode_GET_SIZE(unicode);
				550
				551	onError:
				552	return -1;
				553	}
				554
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	555	const char *PyUnicode_GetDefaultEncoding()
				556	{
				557	return unicode_default_encoding;
				558	}
				559
				560	int PyUnicode_SetDefaultEncoding(const char *encoding)
				561	{
				562	PyObject *v;
				563
				564	/* Make sure the encoding is valid. As side effect, this also
				565	loads the encoding into the codec registry cache. */
				566	v = _PyCodec_Lookup(encoding);
				567	if (v == NULL)
				568	goto onError;
				569	Py_DECREF(v);
				570	strncpy(unicode_default_encoding,
				571	encoding,
				572	sizeof(unicode_default_encoding));
				573	return 0;
				574
				575	onError:
				576	return -1;
				577	}
				578
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	579	/* --- UTF-8 Codec -------------------------------------------------------- */
				580
				581	static
				582	char utf8_code_length[256] = {
				583	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				584	illegal prefix. see RFC 2279 for details */
				585	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				586	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				591	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				592	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				593	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				594	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				595	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				596	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				597	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				598	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				599	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				600	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				601	};
				602
				603	static
				604	int utf8_decoding_error(const char **source,
				605	Py_UNICODE **dest,
				606	const char *errors,
				607	const char *details)
				608	{
				609	if ((errors == NULL) \|\|
				610	(strcmp(errors,"strict") == 0)) {
				611	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	612	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	613	details);
				614	return -1;
				615	}
				616	else if (strcmp(errors,"ignore") == 0) {
				617	(*source)++;
				618	return 0;
				619	}
				620	else if (strcmp(errors,"replace") == 0) {
				621	(*source)++;
				622	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				623	(*dest)++;
				624	return 0;
				625	}
				626	else {
				627	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	628	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	629	errors);
				630	return -1;
				631	}
				632	}
				633
				634	#define UTF8_ERROR(details) do { \
				635	if (utf8_decoding_error(&s, &p, errors, details)) \
				636	goto onError; \
				637	continue; \
				638	} while (0)
				639
				640	PyObject PyUnicode_DecodeUTF8(const char s,
				641	int size,
				642	const char *errors)
				643	{
				644	int n;
				645	const char *e;
				646	PyUnicodeObject *unicode;
				647	Py_UNICODE *p;
				648
				649	/* Note: size will always be longer than the resulting Unicode
				650	character count */
				651	unicode = _PyUnicode_New(size);
				652	if (!unicode)
				653	return NULL;
				654	if (size == 0)
				655	return (PyObject *)unicode;
				656
				657	/* Unpack UTF-8 encoded data */
				658	p = unicode->str;
				659	e = s + size;
				660
				661	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	662	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	663
				664	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	665	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	666	s++;
				667	continue;
				668	}
				669
				670	n = utf8_code_length[ch];
				671
				672	if (s + n > e)
				673	UTF8_ERROR("unexpected end of data");
				674
				675	switch (n) {
				676
				677	case 0:
				678	UTF8_ERROR("unexpected code byte");
				679	break;
				680
				681	case 1:
				682	UTF8_ERROR("internal error");
				683	break;
				684
				685	case 2:
				686	if ((s[1] & 0xc0) != 0x80)
				687	UTF8_ERROR("invalid data");
				688	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				689	if (ch < 0x80)
				690	UTF8_ERROR("illegal encoding");
				691	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	692	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	693	break;
				694
				695	case 3:
				696	if ((s[1] & 0xc0) != 0x80 \|\|
				697	(s[2] & 0xc0) != 0x80)
				698	UTF8_ERROR("invalid data");
				699	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				700	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				701	UTF8_ERROR("illegal encoding");
				702	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	703	*p++ = (Py_UNICODE)ch;
				704	break;
				705
				706	case 4:
				707	if ((s[1] & 0xc0) != 0x80 \|\|
				708	(s[2] & 0xc0) != 0x80 \|\|
				709	(s[3] & 0xc0) != 0x80)
				710	UTF8_ERROR("invalid data");
				711	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				712	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				713	/* validate and convert to UTF-16 */
				714	if ((ch < 0x10000) \|\| /* minimum value allowed for 4 byte encoding */
				715	(ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
				716	UTF8_ERROR("illegal encoding");
				717	/* compute and append the two surrogates: */
				718
				719	/* translate from 10000..10FFFF to 0..FFFF */
				720	ch -= 0x10000;
				721
				722	/* high surrogate = top 10 bits added to D800 */
				723	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				724
				725	/* low surrogate = bottom 10 bits added to DC00 */
				726	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	727	break;
				728
				729	default:
				730	/* Other sizes are only needed for UCS-4 */
				731	UTF8_ERROR("unsupported Unicode code range");
				732	}
				733	s += n;
				734	}
				735
				736	/* Adjust length */
				737	if (_PyUnicode_Resize(unicode, p - unicode->str))
				738	goto onError;
				739
				740	return (PyObject *)unicode;
				741
				742	onError:
				743	Py_DECREF(unicode);
				744	return NULL;
				745	}
				746
				747	#undef UTF8_ERROR
				748
				749	static
				750	int utf8_encoding_error(const Py_UNICODE **source,
				751	char **dest,
				752	const char *errors,
				753	const char *details)
				754	{
				755	if ((errors == NULL) \|\|
				756	(strcmp(errors,"strict") == 0)) {
				757	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	758	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	759	details);
				760	return -1;
				761	}
				762	else if (strcmp(errors,"ignore") == 0) {
				763	return 0;
				764	}
				765	else if (strcmp(errors,"replace") == 0) {
				766	**dest = '?';
				767	(*dest)++;
				768	return 0;
				769	}
				770	else {
				771	PyErr_Format(PyExc_ValueError,
				772	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	773	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	774	errors);
				775	return -1;
				776	}
				777	}
				778
				779	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				780	int size,
				781	const char *errors)
				782	{
				783	PyObject *v;
				784	char *p;
				785	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	786	Py_UCS4 ch2;
				787	unsigned int cbAllocated = 3 * size;
				788	unsigned int cbWritten = 0;
				789	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	790
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	791	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	792	if (v == NULL)
				793	return NULL;
				794	if (size == 0)
				795	goto done;
				796
				797	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	798	while (i < size) {
				799	Py_UCS4 ch = s[i++];
				800	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	801	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	802	cbWritten++;
				803	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	804	else if (ch < 0x0800) {
				805	*p++ = 0xc0 \| (ch >> 6);
				806	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	807	cbWritten += 2;
				808	}
				809	else {
				810	/* Check for high surrogate */
				811	if (0xD800 <= ch && ch <= 0xDBFF) {
				812	if (i != size) {
				813	ch2 = s[i];
				814	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				815
				816	if (cbWritten >= (cbAllocated - 4)) {
				817	/* Provide enough room for some more
				818	surrogates */
				819	cbAllocated += 4*10;
				820	if (_PyString_Resize(&v, cbAllocated))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	821	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	822	}
				823
				824	/* combine the two values */
				825	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				826
				827	*p++ = (char)((ch >> 18) \| 0xf0);
				828	*p++ = (char)(0x80 \| (ch >> 12) & 0x3f);
				829	i++;
				830	cbWritten += 4;
				831	}
				832	}
				833	}
				834	else {
				835	*p++ = (char)(0xe0 \| (ch >> 12));
				836	cbWritten += 3;
				837	}
				838	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				839	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	840	}
				841	}
				842	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	843	if (_PyString_Resize(&v, p - q))
				844	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	845
				846	done:
				847	return v;
				848
				849	onError:
				850	Py_DECREF(v);
				851	return NULL;
				852	}
				853
				854	/* Return a Python string holding the UTF-8 encoded value of the
				855	Unicode object.
				856
				857	The resulting string is cached in the Unicode object for subsequent
				858	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	859	the character buffer interface and will live (at least) as long as
				860	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	861
				862	The refcount of the string is not incremented.
				863
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	864	* Exported for internal use by the interpreter only !!! *
				865
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	866	*/
				867
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	868	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	869	const char *errors)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	870	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	871	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	872
				873	if (v)
				874	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	875	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				876	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	877	errors);
				878	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	879	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	880	return v;
				881	}
				882
				883	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				884	{
				885	PyObject *str;
				886
				887	if (!PyUnicode_Check(unicode)) {
				888	PyErr_BadArgument();
				889	return NULL;
				890	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	891	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	892	if (str == NULL)
				893	return NULL;
				894	Py_INCREF(str);
				895	return str;
				896	}
				897
				898	/* --- UTF-16 Codec ------------------------------------------------------- */
				899
				900	static
				901	int utf16_decoding_error(const Py_UNICODE **source,
				902	Py_UNICODE **dest,
				903	const char *errors,
				904	const char *details)
				905	{
				906	if ((errors == NULL) \|\|
				907	(strcmp(errors,"strict") == 0)) {
				908	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	909	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	910	details);
				911	return -1;
				912	}
				913	else if (strcmp(errors,"ignore") == 0) {
				914	return 0;
				915	}
				916	else if (strcmp(errors,"replace") == 0) {
				917	if (dest) {
				918	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				919	(*dest)++;
				920	}
				921	return 0;
				922	}
				923	else {
				924	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	925	"UTF-16 decoding error; "
				926	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	927	errors);
				928	return -1;
				929	}
				930	}
				931
				932	#define UTF16_ERROR(details) do { \
				933	if (utf16_decoding_error(&q, &p, errors, details)) \
				934	goto onError; \
				935	continue; \
				936	} while(0)
				937
				938	PyObject PyUnicode_DecodeUTF16(const char s,
				939	int size,
				940	const char *errors,
				941	int *byteorder)
				942	{
				943	PyUnicodeObject *unicode;
				944	Py_UNICODE *p;
				945	const Py_UNICODE q, e;
				946	int bo = 0;
				947
				948	/* size should be an even number */
				949	if (size % sizeof(Py_UNICODE) != 0) {
				950	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				951	return NULL;
				952	/* The remaining input chars are ignored if we fall through
				953	here... */
				954	}
				955
				956	/* Note: size will always be longer than the resulting Unicode
				957	character count */
				958	unicode = _PyUnicode_New(size);
				959	if (!unicode)
				960	return NULL;
				961	if (size == 0)
				962	return (PyObject *)unicode;
				963
				964	/* Unpack UTF-16 encoded data */
				965	p = unicode->str;
				966	q = (Py_UNICODE *)s;
				967	e = q + (size / sizeof(Py_UNICODE));
				968
				969	if (byteorder)
				970	bo = *byteorder;
				971
				972	while (q < e) {
				973	register Py_UNICODE ch = *q++;
				974
				975	/* Check for BOM marks (U+FEFF) in the input and adjust
				976	current byte order setting accordingly. Swap input
				977	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				978	!) */
				979	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				980	if (ch == 0xFEFF) {
				981	bo = -1;
				982	continue;
				983	} else if (ch == 0xFFFE) {
				984	bo = 1;
				985	continue;
				986	}
				987	if (bo == 1)
				988	ch = (ch >> 8) \| (ch << 8);
				989	#else
				990	if (ch == 0xFEFF) {
				991	bo = 1;
				992	continue;
				993	} else if (ch == 0xFFFE) {
				994	bo = -1;
				995	continue;
				996	}
				997	if (bo == -1)
				998	ch = (ch >> 8) \| (ch << 8);
				999	#endif
				1000	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1001	*p++ = ch;
				1002	continue;
				1003	}
				1004
				1005	/* UTF-16 code pair: */
				1006	if (q >= e)
				1007	UTF16_ERROR("unexpected end of data");
				1008	if (0xDC00 <= q && q <= 0xDFFF) {
				1009	q++;
				1010	if (0xD800 <= q && q <= 0xDBFF)
				1011	/* This is valid data (a UTF-16 surrogate pair), but
				1012	we are not able to store this information since our
				1013	Py_UNICODE type only has 16 bits... this might
				1014	change someday, even though it's unlikely. */
				1015	UTF16_ERROR("code pairs are not supported");
				1016	else
				1017	continue;
				1018	}
				1019	UTF16_ERROR("illegal encoding");
				1020	}
				1021
				1022	if (byteorder)
				1023	*byteorder = bo;
				1024
				1025	/* Adjust length */
				1026	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1027	goto onError;
				1028
				1029	return (PyObject *)unicode;
				1030
				1031	onError:
				1032	Py_DECREF(unicode);
				1033	return NULL;
				1034	}
				1035
				1036	#undef UTF16_ERROR
				1037
				1038	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1039	int size,
				1040	const char *errors,
				1041	int byteorder)
				1042	{
				1043	PyObject *v;
				1044	Py_UNICODE *p;
				1045	char *q;
				1046
				1047	/* We don't create UTF-16 pairs... */
				1048	v = PyString_FromStringAndSize(NULL,
				1049	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1050	if (v == NULL)
				1051	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1052
				1053	q = PyString_AS_STRING(v);
				1054	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1055	if (byteorder == 0)
				1056	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1057	if (size == 0)
				1058	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1059	if (byteorder == 0 \|\|
				1060	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1061	byteorder == -1
				1062	#else
				1063	byteorder == 1
				1064	#endif
				1065	)
				1066	memcpy(p, s, size * sizeof(Py_UNICODE));
				1067	else
				1068	while (size-- > 0) {
				1069	Py_UNICODE ch = *s++;
				1070	*p++ = (ch >> 8) \| (ch << 8);
				1071	}
				1072	done:
				1073	return v;
				1074	}
				1075
				1076	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1077	{
				1078	if (!PyUnicode_Check(unicode)) {
				1079	PyErr_BadArgument();
				1080	return NULL;
				1081	}
				1082	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1083	PyUnicode_GET_SIZE(unicode),
				1084	NULL,
				1085	0);
				1086	}
				1087
				1088	/* --- Unicode Escape Codec ----------------------------------------------- */
				1089
				1090	static
				1091	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1092	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1093	const char *errors,
				1094	const char *details)
				1095	{
				1096	if ((errors == NULL) \|\|
				1097	(strcmp(errors,"strict") == 0)) {
				1098	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1099	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1100	details);
				1101	return -1;
				1102	}
				1103	else if (strcmp(errors,"ignore") == 0) {
				1104	return 0;
				1105	}
				1106	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1107	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	return 0;
				1109	}
				1110	else {
				1111	PyErr_Format(PyExc_ValueError,
				1112	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1113	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1114	errors);
				1115	return -1;
				1116	}
				1117	}
				1118
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1119	static _Py_UCNHashAPI *pucnHash = NULL;
				1120
				1121	static
				1122	int mystrnicmp(const char s1, const char s2, size_t count)
				1123	{
				1124	char c1, c2;
				1125
				1126	if (count)
				1127	{
				1128	do
				1129	{
				1130	c1 = tolower(*(s1++));
				1131	c2 = tolower(*(s2++));
				1132	}
				1133	while(--count && c1 == c2);
				1134
				1135	return c1 - c2;
				1136	}
				1137
				1138	return 0;
				1139	}
				1140
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1141	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1142	int size,
				1143	const char *errors)
				1144	{
				1145	PyUnicodeObject *v;
				1146	Py_UNICODE p = NULL, buf = NULL;
				1147	const char *end;
				1148
				1149	/* Escaped strings will always be longer than the resulting
				1150	Unicode string, so we start with size here and then reduce the
				1151	length after conversion to the true value. */
				1152	v = _PyUnicode_New(size);
				1153	if (v == NULL)
				1154	goto onError;
				1155	if (size == 0)
				1156	return (PyObject *)v;
				1157	p = buf = PyUnicode_AS_UNICODE(v);
				1158	end = s + size;
				1159	while (s < end) {
				1160	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1161	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1162	int i;
				1163
				1164	/* Non-escape characters are interpreted as Unicode ordinals */
				1165	if (*s != '\\') {
				1166	p++ = (unsigned char)s++;
				1167	continue;
				1168	}
				1169
				1170	/* \ - Escapes */
				1171	s++;
				1172	switch (*s++) {
				1173
				1174	/* \x escapes */
				1175	case '\n': break;
				1176	case '\\': *p++ = '\\'; break;
				1177	case '\'': *p++ = '\''; break;
				1178	case '\"': *p++ = '\"'; break;
				1179	case 'b': *p++ = '\b'; break;
				1180	case 'f': p++ = '\014'; break; / FF */
				1181	case 't': *p++ = '\t'; break;
				1182	case 'n': *p++ = '\n'; break;
				1183	case 'r': *p++ = '\r'; break;
				1184	case 'v': p++ = '\013'; break; / VT */
				1185	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1186
				1187	/* \OOO (octal) escapes */
				1188	case '0': case '1': case '2': case '3':
				1189	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1190	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1191	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1192	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1193	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1194	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1195	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1196	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1197	break;
				1198
				1199	/* \xXXXX escape with 0-4 hex digits */
				1200	case 'x':
				1201	x = 0;
				1202	c = (unsigned char)*s;
				1203	if (isxdigit(c)) {
				1204	do {
				1205	x = (x<<4) & ~0xF;
				1206	if ('0' <= c && c <= '9')
				1207	x += c - '0';
				1208	else if ('a' <= c && c <= 'f')
				1209	x += 10 + c - 'a';
				1210	else
				1211	x += 10 + c - 'A';
				1212	c = (unsigned char)*++s;
				1213	} while (isxdigit(c));
				1214	*p++ = x;
				1215	} else {
				1216	*p++ = '\\';
				1217	*p++ = (unsigned char)s[-1];
				1218	}
				1219	break;
				1220
				1221	/* \uXXXX with 4 hex digits */
				1222	case 'u':
				1223	for (x = 0, i = 0; i < 4; i++) {
				1224	c = (unsigned char)s[i];
				1225	if (!isxdigit(c)) {
				1226	if (unicodeescape_decoding_error(&s, &x, errors,
				1227	"truncated \\uXXXX"))
				1228	goto onError;
				1229	i++;
				1230	break;
				1231	}
				1232	x = (x<<4) & ~0xF;
				1233	if (c >= '0' && c <= '9')
				1234	x += c - '0';
				1235	else if (c >= 'a' && c <= 'f')
				1236	x += 10 + c - 'a';
				1237	else
				1238	x += 10 + c - 'A';
				1239	}
				1240	s += i;
				1241	*p++ = x;
				1242	break;
				1243
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1244	case 'N':
				1245	/* Ok, we need to deal with Unicode Character Names now,
				1246	* make sure we've imported the hash table data...
				1247	*/
				1248	if (pucnHash == NULL)
				1249	{
				1250	PyObject mod = 0, v = 0;
				1251
				1252	mod = PyImport_ImportModule("ucnhash");
				1253	if (mod == NULL)
				1254	goto onError;
				1255	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1256	Py_DECREF(mod);
				1257	if (v == NULL)
				1258	{
				1259	goto onError;
				1260	}
				1261	pucnHash = PyCObject_AsVoidPtr(v);
				1262	Py_DECREF(v);
				1263	if (pucnHash == NULL)
				1264	{
				1265	goto onError;
				1266	}
				1267	}
				1268
				1269	if (*s == '{')
				1270	{
				1271	const char *start = s + 1;
				1272	const char *endBrace = start;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1273	Py_UCS4 value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1274	unsigned long j;
				1275
				1276	/* look for either the closing brace, or we
				1277	* exceed the maximum length of the unicode character names
				1278	*/
				1279	while (*endBrace != '}' &&
				1280	(unsigned int)(endBrace - start) <=
				1281	pucnHash->cchMax &&
				1282	endBrace < end)
				1283	{
				1284	endBrace++;
				1285	}
				1286	if (endBrace != end && *endBrace == '}')
				1287	{
				1288	j = pucnHash->hash(start, endBrace - start);
				1289	if (j > pucnHash->cKeys \|\|
				1290	mystrnicmp(
				1291	start,
				1292	((_Py_UnicodeCharacterName *)
				1293	(pucnHash->getValue(j)))->pszUCN,
				1294	(int)(endBrace - start)) != 0)
				1295	{
				1296	if (unicodeescape_decoding_error(
				1297	&s, &x, errors,
				1298	"Invalid Unicode Character Name"))
				1299	{
				1300	goto onError;
				1301	}
				1302	goto ucnFallthrough;
				1303	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1304	value = ((_Py_UnicodeCharacterName *)
				1305	(pucnHash->getValue(j)))->value;
				1306	if (value < 1<<16)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1307	{
				1308	/* In UCS-2 range, easy solution.. */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1309	*p++ = value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1310	}
				1311	else
				1312	{
				1313	/* Oops, its in UCS-4 space, */
				1314	/* compute and append the two surrogates: */
				1315	/* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1316	value -= 0x10000;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1317
				1318	/* high surrogate = top 10 bits added to D800 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1319	*p++ = 0xD800 + (value >> 10);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1320
				1321	/* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1322	*p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1323	}
				1324	s = endBrace + 1;
				1325	}
				1326	else
				1327	{
				1328	if (unicodeescape_decoding_error(
				1329	&s, &x, errors,
				1330	"Unicode name missing closing brace"))
				1331	goto onError;
				1332	goto ucnFallthrough;
				1333	}
				1334	break;
				1335	}
				1336	if (unicodeescape_decoding_error(
				1337	&s, &x, errors,
				1338	"Missing opening brace for Unicode Character Name escape"))
				1339	goto onError;
				1340	ucnFallthrough:
				1341	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1342	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1343	*p++ = '\\';
				1344	*p++ = (unsigned char)s[-1];
				1345	break;
				1346	}
				1347	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1348	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1349	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1350	return (PyObject *)v;
				1351
				1352	onError:
				1353	Py_XDECREF(v);
				1354	return NULL;
				1355	}
				1356
				1357	/* Return a Unicode-Escape string version of the Unicode object.
				1358
				1359	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1360	appropriate.
				1361
				1362	*/
				1363
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1364	static const Py_UNICODE findchar(const Py_UNICODE s,
				1365	int size,
				1366	Py_UNICODE ch);
				1367
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1368	static
				1369	PyObject unicodeescape_string(const Py_UNICODE s,
				1370	int size,
				1371	int quotes)
				1372	{
				1373	PyObject *repr;
				1374	char *p;
				1375	char *q;
				1376
				1377	static const char *hexdigit = "0123456789ABCDEF";
				1378
				1379	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1380	if (repr == NULL)
				1381	return NULL;
				1382
				1383	p = q = PyString_AS_STRING(repr);
				1384
				1385	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1386	*p++ = 'u';
				1387	*p++ = (findchar(s, size, '\'') &&
				1388	!findchar(s, size, '"')) ? '"' : '\'';
				1389	}
				1390	while (size-- > 0) {
				1391	Py_UNICODE ch = *s++;
				1392	/* Escape quotes */
				1393	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1394	*p++ = '\\';
				1395	*p++ = (char) ch;
				1396	}
				1397	/* Map 16-bit characters to '\uxxxx' */
				1398	else if (ch >= 256) {
				1399	*p++ = '\\';
				1400	*p++ = 'u';
				1401	*p++ = hexdigit[(ch >> 12) & 0xf];
				1402	*p++ = hexdigit[(ch >> 8) & 0xf];
				1403	*p++ = hexdigit[(ch >> 4) & 0xf];
				1404	*p++ = hexdigit[ch & 15];
				1405	}
				1406	/* Map non-printable US ASCII to '\ooo' */
				1407	else if (ch < ' ' \|\| ch >= 128) {
				1408	*p++ = '\\';
				1409	*p++ = hexdigit[(ch >> 6) & 7];
				1410	*p++ = hexdigit[(ch >> 3) & 7];
				1411	*p++ = hexdigit[ch & 7];
				1412	}
				1413	/* Copy everything else as-is */
				1414	else
				1415	*p++ = (char) ch;
				1416	}
				1417	if (quotes)
				1418	*p++ = q[1];
				1419
				1420	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1421	if (_PyString_Resize(&repr, p - q))
				1422	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1423
				1424	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1425
				1426	onError:
				1427	Py_DECREF(repr);
				1428	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1429	}
				1430
				1431	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1432	int size)
				1433	{
				1434	return unicodeescape_string(s, size, 0);
				1435	}
				1436
				1437	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1438	{
				1439	if (!PyUnicode_Check(unicode)) {
				1440	PyErr_BadArgument();
				1441	return NULL;
				1442	}
				1443	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1444	PyUnicode_GET_SIZE(unicode));
				1445	}
				1446
				1447	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1448
				1449	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1450	int size,
				1451	const char *errors)
				1452	{
				1453	PyUnicodeObject *v;
				1454	Py_UNICODE p, buf;
				1455	const char *end;
				1456	const char *bs;
				1457
				1458	/* Escaped strings will always be longer than the resulting
				1459	Unicode string, so we start with size here and then reduce the
				1460	length after conversion to the true value. */
				1461	v = _PyUnicode_New(size);
				1462	if (v == NULL)
				1463	goto onError;
				1464	if (size == 0)
				1465	return (PyObject *)v;
				1466	p = buf = PyUnicode_AS_UNICODE(v);
				1467	end = s + size;
				1468	while (s < end) {
				1469	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1470	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1471	int i;
				1472
				1473	/* Non-escape characters are interpreted as Unicode ordinals */
				1474	if (*s != '\\') {
				1475	p++ = (unsigned char)s++;
				1476	continue;
				1477	}
				1478
				1479	/* \u-escapes are only interpreted iff the number of leading
				1480	backslashes if odd */
				1481	bs = s;
				1482	for (;s < end;) {
				1483	if (*s != '\\')
				1484	break;
				1485	p++ = (unsigned char)s++;
				1486	}
				1487	if (((s - bs) & 1) == 0 \|\|
				1488	s >= end \|\|
				1489	*s != 'u') {
				1490	continue;
				1491	}
				1492	p--;
				1493	s++;
				1494
				1495	/* \uXXXX with 4 hex digits */
				1496	for (x = 0, i = 0; i < 4; i++) {
				1497	c = (unsigned char)s[i];
				1498	if (!isxdigit(c)) {
				1499	if (unicodeescape_decoding_error(&s, &x, errors,
				1500	"truncated \\uXXXX"))
				1501	goto onError;
				1502	i++;
				1503	break;
				1504	}
				1505	x = (x<<4) & ~0xF;
				1506	if (c >= '0' && c <= '9')
				1507	x += c - '0';
				1508	else if (c >= 'a' && c <= 'f')
				1509	x += 10 + c - 'a';
				1510	else
				1511	x += 10 + c - 'A';
				1512	}
				1513	s += i;
				1514	*p++ = x;
				1515	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1516	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1517	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1518	return (PyObject *)v;
				1519
				1520	onError:
				1521	Py_XDECREF(v);
				1522	return NULL;
				1523	}
				1524
				1525	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1526	int size)
				1527	{
				1528	PyObject *repr;
				1529	char *p;
				1530	char *q;
				1531
				1532	static const char *hexdigit = "0123456789ABCDEF";
				1533
				1534	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1535	if (repr == NULL)
				1536	return NULL;
				1537
				1538	p = q = PyString_AS_STRING(repr);
				1539	while (size-- > 0) {
				1540	Py_UNICODE ch = *s++;
				1541	/* Map 16-bit characters to '\uxxxx' */
				1542	if (ch >= 256) {
				1543	*p++ = '\\';
				1544	*p++ = 'u';
				1545	*p++ = hexdigit[(ch >> 12) & 0xf];
				1546	*p++ = hexdigit[(ch >> 8) & 0xf];
				1547	*p++ = hexdigit[(ch >> 4) & 0xf];
				1548	*p++ = hexdigit[ch & 15];
				1549	}
				1550	/* Copy everything else as-is */
				1551	else
				1552	*p++ = (char) ch;
				1553	}
				1554	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1555	if (_PyString_Resize(&repr, p - q))
				1556	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1557
				1558	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1559
				1560	onError:
				1561	Py_DECREF(repr);
				1562	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1563	}
				1564
				1565	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1566	{
				1567	if (!PyUnicode_Check(unicode)) {
				1568	PyErr_BadArgument();
				1569	return NULL;
				1570	}
				1571	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1572	PyUnicode_GET_SIZE(unicode));
				1573	}
				1574
				1575	/* --- Latin-1 Codec ------------------------------------------------------ */
				1576
				1577	PyObject PyUnicode_DecodeLatin1(const char s,
				1578	int size,
				1579	const char *errors)
				1580	{
				1581	PyUnicodeObject *v;
				1582	Py_UNICODE *p;
				1583
				1584	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1585	v = _PyUnicode_New(size);
				1586	if (v == NULL)
				1587	goto onError;
				1588	if (size == 0)
				1589	return (PyObject *)v;
				1590	p = PyUnicode_AS_UNICODE(v);
				1591	while (size-- > 0)
				1592	p++ = (unsigned char)s++;
				1593	return (PyObject *)v;
				1594
				1595	onError:
				1596	Py_XDECREF(v);
				1597	return NULL;
				1598	}
				1599
				1600	static
				1601	int latin1_encoding_error(const Py_UNICODE **source,
				1602	char **dest,
				1603	const char *errors,
				1604	const char *details)
				1605	{
				1606	if ((errors == NULL) \|\|
				1607	(strcmp(errors,"strict") == 0)) {
				1608	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1609	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1610	details);
				1611	return -1;
				1612	}
				1613	else if (strcmp(errors,"ignore") == 0) {
				1614	return 0;
				1615	}
				1616	else if (strcmp(errors,"replace") == 0) {
				1617	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1618	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1619	return 0;
				1620	}
				1621	else {
				1622	PyErr_Format(PyExc_ValueError,
				1623	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1624	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1625	errors);
				1626	return -1;
				1627	}
				1628	}
				1629
				1630	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1631	int size,
				1632	const char *errors)
				1633	{
				1634	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1635	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1636	repr = PyString_FromStringAndSize(NULL, size);
				1637	if (repr == NULL)
				1638	return NULL;
				1639
				1640	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1641	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1642	while (size-- > 0) {
				1643	Py_UNICODE ch = *p++;
				1644	if (ch >= 256) {
				1645	if (latin1_encoding_error(&p, &s, errors,
				1646	"ordinal not in range(256)"))
				1647	goto onError;
				1648	}
				1649	else
				1650	*s++ = (char)ch;
				1651	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1652	/* Resize if error handling skipped some characters */
				1653	if (s - start < PyString_GET_SIZE(repr))
				1654	if (_PyString_Resize(&repr, s - start))
				1655	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1656	return repr;
				1657
				1658	onError:
				1659	Py_DECREF(repr);
				1660	return NULL;
				1661	}
				1662
				1663	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1664	{
				1665	if (!PyUnicode_Check(unicode)) {
				1666	PyErr_BadArgument();
				1667	return NULL;
				1668	}
				1669	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1670	PyUnicode_GET_SIZE(unicode),
				1671	NULL);
				1672	}
				1673
				1674	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1675
				1676	static
				1677	int ascii_decoding_error(const char **source,
				1678	Py_UNICODE **dest,
				1679	const char *errors,
				1680	const char *details)
				1681	{
				1682	if ((errors == NULL) \|\|
				1683	(strcmp(errors,"strict") == 0)) {
				1684	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1685	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1686	details);
				1687	return -1;
				1688	}
				1689	else if (strcmp(errors,"ignore") == 0) {
				1690	return 0;
				1691	}
				1692	else if (strcmp(errors,"replace") == 0) {
				1693	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1694	(*dest)++;
				1695	return 0;
				1696	}
				1697	else {
				1698	PyErr_Format(PyExc_ValueError,
				1699	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1700	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1701	errors);
				1702	return -1;
				1703	}
				1704	}
				1705
				1706	PyObject PyUnicode_DecodeASCII(const char s,
				1707	int size,
				1708	const char *errors)
				1709	{
				1710	PyUnicodeObject *v;
				1711	Py_UNICODE *p;
				1712
				1713	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1714	v = _PyUnicode_New(size);
				1715	if (v == NULL)
				1716	goto onError;
				1717	if (size == 0)
				1718	return (PyObject *)v;
				1719	p = PyUnicode_AS_UNICODE(v);
				1720	while (size-- > 0) {
				1721	register unsigned char c;
				1722
				1723	c = (unsigned char)*s++;
				1724	if (c < 128)
				1725	*p++ = c;
				1726	else if (ascii_decoding_error(&s, &p, errors,
				1727	"ordinal not in range(128)"))
				1728	goto onError;
				1729	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1730	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1731	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1732	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1733	return (PyObject *)v;
				1734
				1735	onError:
				1736	Py_XDECREF(v);
				1737	return NULL;
				1738	}
				1739
				1740	static
				1741	int ascii_encoding_error(const Py_UNICODE **source,
				1742	char **dest,
				1743	const char *errors,
				1744	const char *details)
				1745	{
				1746	if ((errors == NULL) \|\|
				1747	(strcmp(errors,"strict") == 0)) {
				1748	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1749	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1750	details);
				1751	return -1;
				1752	}
				1753	else if (strcmp(errors,"ignore") == 0) {
				1754	return 0;
				1755	}
				1756	else if (strcmp(errors,"replace") == 0) {
				1757	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1758	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1759	return 0;
				1760	}
				1761	else {
				1762	PyErr_Format(PyExc_ValueError,
				1763	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1764	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1765	errors);
				1766	return -1;
				1767	}
				1768	}
				1769
				1770	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1771	int size,
				1772	const char *errors)
				1773	{
				1774	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1775	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1776	repr = PyString_FromStringAndSize(NULL, size);
				1777	if (repr == NULL)
				1778	return NULL;
				1779
				1780	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1781	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1782	while (size-- > 0) {
				1783	Py_UNICODE ch = *p++;
				1784	if (ch >= 128) {
				1785	if (ascii_encoding_error(&p, &s, errors,
				1786	"ordinal not in range(128)"))
				1787	goto onError;
				1788	}
				1789	else
				1790	*s++ = (char)ch;
				1791	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1792	/* Resize if error handling skipped some characters */
				1793	if (s - start < PyString_GET_SIZE(repr))
				1794	if (_PyString_Resize(&repr, s - start))
				1795	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1796	return repr;
				1797
				1798	onError:
				1799	Py_DECREF(repr);
				1800	return NULL;
				1801	}
				1802
				1803	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1804	{
				1805	if (!PyUnicode_Check(unicode)) {
				1806	PyErr_BadArgument();
				1807	return NULL;
				1808	}
				1809	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1810	PyUnicode_GET_SIZE(unicode),
				1811	NULL);
				1812	}
				1813
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1814	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1815
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1816	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1817
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1818	PyObject PyUnicode_DecodeMBCS(const char s,
				1819	int size,
				1820	const char *errors)
				1821	{
				1822	PyUnicodeObject *v;
				1823	Py_UNICODE *p;
				1824
				1825	/* First get the size of the result */
				1826	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1827	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1828	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1829
				1830	v = _PyUnicode_New(usize);
				1831	if (v == NULL)
				1832	return NULL;
				1833	if (usize == 0)
				1834	return (PyObject *)v;
				1835	p = PyUnicode_AS_UNICODE(v);
				1836	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1837	Py_DECREF(v);
				1838	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1839	}
				1840
				1841	return (PyObject *)v;
				1842	}
				1843
				1844	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1845	int size,
				1846	const char *errors)
				1847	{
				1848	PyObject *repr;
				1849	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1850	DWORD mbcssize;
				1851
				1852	/* If there are no characters, bail now! */
				1853	if (size==0)
				1854	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1855
				1856	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1857	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1858	if (mbcssize==0)
				1859	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1860
				1861	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1862	if (repr == NULL)
				1863	return NULL;
				1864	if (mbcssize==0)
				1865	return repr;
				1866
				1867	/* Do the conversion */
				1868	s = PyString_AS_STRING(repr);
				1869	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1870	Py_DECREF(repr);
				1871	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1872	}
				1873	return repr;
				1874	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1875
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1876	#endif /* MS_WIN32 */
				1877
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1878	/* --- Character Mapping Codec -------------------------------------------- */
				1879
				1880	static
				1881	int charmap_decoding_error(const char **source,
				1882	Py_UNICODE **dest,
				1883	const char *errors,
				1884	const char *details)
				1885	{
				1886	if ((errors == NULL) \|\|
				1887	(strcmp(errors,"strict") == 0)) {
				1888	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1889	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1890	details);
				1891	return -1;
				1892	}
				1893	else if (strcmp(errors,"ignore") == 0) {
				1894	return 0;
				1895	}
				1896	else if (strcmp(errors,"replace") == 0) {
				1897	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1898	(*dest)++;
				1899	return 0;
				1900	}
				1901	else {
				1902	PyErr_Format(PyExc_ValueError,
				1903	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1904	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1905	errors);
				1906	return -1;
				1907	}
				1908	}
				1909
				1910	PyObject PyUnicode_DecodeCharmap(const char s,
				1911	int size,
				1912	PyObject *mapping,
				1913	const char *errors)
				1914	{
				1915	PyUnicodeObject *v;
				1916	Py_UNICODE *p;
				1917
				1918	/* Default to Latin-1 */
				1919	if (mapping == NULL)
				1920	return PyUnicode_DecodeLatin1(s, size, errors);
				1921
				1922	v = _PyUnicode_New(size);
				1923	if (v == NULL)
				1924	goto onError;
				1925	if (size == 0)
				1926	return (PyObject *)v;
				1927	p = PyUnicode_AS_UNICODE(v);
				1928	while (size-- > 0) {
				1929	unsigned char ch = *s++;
				1930	PyObject w, x;
				1931
				1932	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1933	w = PyInt_FromLong((long)ch);
				1934	if (w == NULL)
				1935	goto onError;
				1936	x = PyObject_GetItem(mapping, w);
				1937	Py_DECREF(w);
				1938	if (x == NULL) {
				1939	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1940	/* No mapping found: default to Latin-1 mapping */
				1941	PyErr_Clear();
				1942	*p++ = (Py_UNICODE)ch;
				1943	continue;
				1944	}
				1945	goto onError;
				1946	}
				1947
				1948	/* Apply mapping */
				1949	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1950	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1951	if (value < 0 \|\| value > 65535) {
				1952	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1953	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1954	Py_DECREF(x);
				1955	goto onError;
				1956	}
				1957	*p++ = (Py_UNICODE)value;
				1958	}
				1959	else if (x == Py_None) {
				1960	/* undefined mapping */
				1961	if (charmap_decoding_error(&s, &p, errors,
				1962	"character maps to <undefined>")) {
				1963	Py_DECREF(x);
				1964	goto onError;
				1965	}
				1966	}
				1967	else if (PyUnicode_Check(x)) {
				1968	if (PyUnicode_GET_SIZE(x) != 1) {
				1969	/* 1-n mapping */
				1970	PyErr_SetString(PyExc_NotImplementedError,
				1971	"1-n mappings are currently not implemented");
				1972	Py_DECREF(x);
				1973	goto onError;
				1974	}
				1975	p++ = PyUnicode_AS_UNICODE(x);
				1976	}
				1977	else {
				1978	/* wrong return value */
				1979	PyErr_SetString(PyExc_TypeError,
				1980	"character mapping must return integer, None or unicode");
				1981	Py_DECREF(x);
				1982	goto onError;
				1983	}
				1984	Py_DECREF(x);
				1985	}
				1986	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1987	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1988	goto onError;
				1989	return (PyObject *)v;
				1990
				1991	onError:
				1992	Py_XDECREF(v);
				1993	return NULL;
				1994	}
				1995
				1996	static
				1997	int charmap_encoding_error(const Py_UNICODE **source,
				1998	char **dest,
				1999	const char *errors,
				2000	const char *details)
				2001	{
				2002	if ((errors == NULL) \|\|
				2003	(strcmp(errors,"strict") == 0)) {
				2004	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2005	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2006	details);
				2007	return -1;
				2008	}
				2009	else if (strcmp(errors,"ignore") == 0) {
				2010	return 0;
				2011	}
				2012	else if (strcmp(errors,"replace") == 0) {
				2013	**dest = '?';
				2014	(*dest)++;
				2015	return 0;
				2016	}
				2017	else {
				2018	PyErr_Format(PyExc_ValueError,
				2019	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2020	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2021	errors);
				2022	return -1;
				2023	}
				2024	}
				2025
				2026	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2027	int size,
				2028	PyObject *mapping,
				2029	const char *errors)
				2030	{
				2031	PyObject *v;
				2032	char *s;
				2033
				2034	/* Default to Latin-1 */
				2035	if (mapping == NULL)
				2036	return PyUnicode_EncodeLatin1(p, size, errors);
				2037
				2038	v = PyString_FromStringAndSize(NULL, size);
				2039	if (v == NULL)
				2040	return NULL;
				2041	s = PyString_AS_STRING(v);
				2042	while (size-- > 0) {
				2043	Py_UNICODE ch = *p++;
				2044	PyObject w, x;
				2045
				2046	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2047	w = PyInt_FromLong((long)ch);
				2048	if (w == NULL)
				2049	goto onError;
				2050	x = PyObject_GetItem(mapping, w);
				2051	Py_DECREF(w);
				2052	if (x == NULL) {
				2053	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2054	/* No mapping found: default to Latin-1 mapping if possible */
				2055	PyErr_Clear();
				2056	if (ch < 256) {
				2057	*s++ = (char)ch;
				2058	continue;
				2059	}
				2060	else if (!charmap_encoding_error(&p, &s, errors,
				2061	"missing character mapping"))
				2062	continue;
				2063	}
				2064	goto onError;
				2065	}
				2066
				2067	/* Apply mapping */
				2068	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2069	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2070	if (value < 0 \|\| value > 255) {
				2071	PyErr_SetString(PyExc_TypeError,
				2072	"character mapping must be in range(256)");
				2073	Py_DECREF(x);
				2074	goto onError;
				2075	}
				2076	*s++ = (char)value;
				2077	}
				2078	else if (x == Py_None) {
				2079	/* undefined mapping */
				2080	if (charmap_encoding_error(&p, &s, errors,
				2081	"character maps to <undefined>")) {
				2082	Py_DECREF(x);
				2083	goto onError;
				2084	}
				2085	}
				2086	else if (PyString_Check(x)) {
				2087	if (PyString_GET_SIZE(x) != 1) {
				2088	/* 1-n mapping */
				2089	PyErr_SetString(PyExc_NotImplementedError,
				2090	"1-n mappings are currently not implemented");
				2091	Py_DECREF(x);
				2092	goto onError;
				2093	}
				2094	s++ = PyString_AS_STRING(x);
				2095	}
				2096	else {
				2097	/* wrong return value */
				2098	PyErr_SetString(PyExc_TypeError,
				2099	"character mapping must return integer, None or unicode");
				2100	Py_DECREF(x);
				2101	goto onError;
				2102	}
				2103	Py_DECREF(x);
				2104	}
				2105	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2106	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2107	goto onError;
				2108	return v;
				2109
				2110	onError:
				2111	Py_DECREF(v);
				2112	return NULL;
				2113	}
				2114
				2115	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2116	PyObject *mapping)
				2117	{
				2118	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2119	PyErr_BadArgument();
				2120	return NULL;
				2121	}
				2122	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2123	PyUnicode_GET_SIZE(unicode),
				2124	mapping,
				2125	NULL);
				2126	}
				2127
				2128	static
				2129	int translate_error(const Py_UNICODE **source,
				2130	Py_UNICODE **dest,
				2131	const char *errors,
				2132	const char *details)
				2133	{
				2134	if ((errors == NULL) \|\|
				2135	(strcmp(errors,"strict") == 0)) {
				2136	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2137	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2138	details);
				2139	return -1;
				2140	}
				2141	else if (strcmp(errors,"ignore") == 0) {
				2142	return 0;
				2143	}
				2144	else if (strcmp(errors,"replace") == 0) {
				2145	**dest = '?';
				2146	(*dest)++;
				2147	return 0;
				2148	}
				2149	else {
				2150	PyErr_Format(PyExc_ValueError,
				2151	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2152	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2153	errors);
				2154	return -1;
				2155	}
				2156	}
				2157
				2158	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2159	int size,
				2160	PyObject *mapping,
				2161	const char *errors)
				2162	{
				2163	PyUnicodeObject *v;
				2164	Py_UNICODE *p;
				2165
				2166	if (mapping == NULL) {
				2167	PyErr_BadArgument();
				2168	return NULL;
				2169	}
				2170
				2171	/* Output will never be longer than input */
				2172	v = _PyUnicode_New(size);
				2173	if (v == NULL)
				2174	goto onError;
				2175	if (size == 0)
				2176	goto done;
				2177	p = PyUnicode_AS_UNICODE(v);
				2178	while (size-- > 0) {
				2179	Py_UNICODE ch = *s++;
				2180	PyObject w, x;
				2181
				2182	/* Get mapping */
				2183	w = PyInt_FromLong(ch);
				2184	if (w == NULL)
				2185	goto onError;
				2186	x = PyObject_GetItem(mapping, w);
				2187	Py_DECREF(w);
				2188	if (x == NULL) {
				2189	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2190	/* No mapping found: default to 1-1 mapping */
				2191	PyErr_Clear();
				2192	*p++ = ch;
				2193	continue;
				2194	}
				2195	goto onError;
				2196	}
				2197
				2198	/* Apply mapping */
				2199	if (PyInt_Check(x))
				2200	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2201	else if (x == Py_None) {
				2202	/* undefined mapping */
				2203	if (translate_error(&s, &p, errors,
				2204	"character maps to <undefined>")) {
				2205	Py_DECREF(x);
				2206	goto onError;
				2207	}
				2208	}
				2209	else if (PyUnicode_Check(x)) {
				2210	if (PyUnicode_GET_SIZE(x) != 1) {
				2211	/* 1-n mapping */
				2212	PyErr_SetString(PyExc_NotImplementedError,
				2213	"1-n mappings are currently not implemented");
				2214	Py_DECREF(x);
				2215	goto onError;
				2216	}
				2217	p++ = PyUnicode_AS_UNICODE(x);
				2218	}
				2219	else {
				2220	/* wrong return value */
				2221	PyErr_SetString(PyExc_TypeError,
				2222	"translate mapping must return integer, None or unicode");
				2223	Py_DECREF(x);
				2224	goto onError;
				2225	}
				2226	Py_DECREF(x);
				2227	}
				2228	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2229	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2230	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2231
				2232	done:
				2233	return (PyObject *)v;
				2234
				2235	onError:
				2236	Py_XDECREF(v);
				2237	return NULL;
				2238	}
				2239
				2240	PyObject PyUnicode_Translate(PyObject str,
				2241	PyObject *mapping,
				2242	const char *errors)
				2243	{
				2244	PyObject *result;
				2245
				2246	str = PyUnicode_FromObject(str);
				2247	if (str == NULL)
				2248	goto onError;
				2249	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2250	PyUnicode_GET_SIZE(str),
				2251	mapping,
				2252	errors);
				2253	Py_DECREF(str);
				2254	return result;
				2255
				2256	onError:
				2257	Py_XDECREF(str);
				2258	return NULL;
				2259	}
				2260
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2261	/* --- Decimal Encoder ---------------------------------------------------- */
				2262
				2263	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2264	int length,
				2265	char *output,
				2266	const char *errors)
				2267	{
				2268	Py_UNICODE p, end;
				2269
				2270	if (output == NULL) {
				2271	PyErr_BadArgument();
				2272	return -1;
				2273	}
				2274
				2275	p = s;
				2276	end = s + length;
				2277	while (p < end) {
				2278	register Py_UNICODE ch = *p++;
				2279	int decimal;
				2280
				2281	if (Py_UNICODE_ISSPACE(ch)) {
				2282	*output++ = ' ';
				2283	continue;
				2284	}
				2285	decimal = Py_UNICODE_TODECIMAL(ch);
				2286	if (decimal >= 0) {
				2287	*output++ = '0' + decimal;
				2288	continue;
				2289	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2290	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2291	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2292	continue;
				2293	}
				2294	/* All other characters are considered invalid */
				2295	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2296	PyErr_SetString(PyExc_ValueError,
				2297	"invalid decimal Unicode string");
				2298	goto onError;
				2299	}
				2300	else if (strcmp(errors, "ignore") == 0)
				2301	continue;
				2302	else if (strcmp(errors, "replace") == 0) {
				2303	*output++ = '?';
				2304	continue;
				2305	}
				2306	}
				2307	/* 0-terminate the output string */
				2308	*output++ = '\0';
				2309	return 0;
				2310
				2311	onError:
				2312	return -1;
				2313	}
				2314
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2315	/* --- Helpers ------------------------------------------------------------ */
				2316
				2317	static
				2318	int count(PyUnicodeObject *self,
				2319	int start,
				2320	int end,
				2321	PyUnicodeObject *substring)
				2322	{
				2323	int count = 0;
				2324
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2325	if (substring->length == 0)
				2326	return (end - start + 1);
				2327
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2328	end -= substring->length;
				2329
				2330	while (start <= end)
				2331	if (Py_UNICODE_MATCH(self, start, substring)) {
				2332	count++;
				2333	start += substring->length;
				2334	} else
				2335	start++;
				2336
				2337	return count;
				2338	}
				2339
				2340	int PyUnicode_Count(PyObject *str,
				2341	PyObject *substr,
				2342	int start,
				2343	int end)
				2344	{
				2345	int result;
				2346
				2347	str = PyUnicode_FromObject(str);
				2348	if (str == NULL)
				2349	return -1;
				2350	substr = PyUnicode_FromObject(substr);
				2351	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2352	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2353	return -1;
				2354	}
				2355
				2356	result = count((PyUnicodeObject *)str,
				2357	start, end,
				2358	(PyUnicodeObject *)substr);
				2359
				2360	Py_DECREF(str);
				2361	Py_DECREF(substr);
				2362	return result;
				2363	}
				2364
				2365	static
				2366	int findstring(PyUnicodeObject *self,
				2367	PyUnicodeObject *substring,
				2368	int start,
				2369	int end,
				2370	int direction)
				2371	{
				2372	if (start < 0)
				2373	start += self->length;
				2374	if (start < 0)
				2375	start = 0;
				2376
				2377	if (substring->length == 0)
				2378	return start;
				2379
				2380	if (end > self->length)
				2381	end = self->length;
				2382	if (end < 0)
				2383	end += self->length;
				2384	if (end < 0)
				2385	end = 0;
				2386
				2387	end -= substring->length;
				2388
				2389	if (direction < 0) {
				2390	for (; end >= start; end--)
				2391	if (Py_UNICODE_MATCH(self, end, substring))
				2392	return end;
				2393	} else {
				2394	for (; start <= end; start++)
				2395	if (Py_UNICODE_MATCH(self, start, substring))
				2396	return start;
				2397	}
				2398
				2399	return -1;
				2400	}
				2401
				2402	int PyUnicode_Find(PyObject *str,
				2403	PyObject *substr,
				2404	int start,
				2405	int end,
				2406	int direction)
				2407	{
				2408	int result;
				2409
				2410	str = PyUnicode_FromObject(str);
				2411	if (str == NULL)
				2412	return -1;
				2413	substr = PyUnicode_FromObject(substr);
				2414	if (substr == NULL) {
				2415	Py_DECREF(substr);
				2416	return -1;
				2417	}
				2418
				2419	result = findstring((PyUnicodeObject *)str,
				2420	(PyUnicodeObject *)substr,
				2421	start, end, direction);
				2422	Py_DECREF(str);
				2423	Py_DECREF(substr);
				2424	return result;
				2425	}
				2426
				2427	static
				2428	int tailmatch(PyUnicodeObject *self,
				2429	PyUnicodeObject *substring,
				2430	int start,
				2431	int end,
				2432	int direction)
				2433	{
				2434	if (start < 0)
				2435	start += self->length;
				2436	if (start < 0)
				2437	start = 0;
				2438
				2439	if (substring->length == 0)
				2440	return 1;
				2441
				2442	if (end > self->length)
				2443	end = self->length;
				2444	if (end < 0)
				2445	end += self->length;
				2446	if (end < 0)
				2447	end = 0;
				2448
				2449	end -= substring->length;
				2450	if (end < start)
				2451	return 0;
				2452
				2453	if (direction > 0) {
				2454	if (Py_UNICODE_MATCH(self, end, substring))
				2455	return 1;
				2456	} else {
				2457	if (Py_UNICODE_MATCH(self, start, substring))
				2458	return 1;
				2459	}
				2460
				2461	return 0;
				2462	}
				2463
				2464	int PyUnicode_Tailmatch(PyObject *str,
				2465	PyObject *substr,
				2466	int start,
				2467	int end,
				2468	int direction)
				2469	{
				2470	int result;
				2471
				2472	str = PyUnicode_FromObject(str);
				2473	if (str == NULL)
				2474	return -1;
				2475	substr = PyUnicode_FromObject(substr);
				2476	if (substr == NULL) {
				2477	Py_DECREF(substr);
				2478	return -1;
				2479	}
				2480
				2481	result = tailmatch((PyUnicodeObject *)str,
				2482	(PyUnicodeObject *)substr,
				2483	start, end, direction);
				2484	Py_DECREF(str);
				2485	Py_DECREF(substr);
				2486	return result;
				2487	}
				2488
				2489	static
				2490	const Py_UNICODE findchar(const Py_UNICODE s,
				2491	int size,
				2492	Py_UNICODE ch)
				2493	{
				2494	/* like wcschr, but doesn't stop at NULL characters */
				2495
				2496	while (size-- > 0) {
				2497	if (*s == ch)
				2498	return s;
				2499	s++;
				2500	}
				2501
				2502	return NULL;
				2503	}
				2504
				2505	/* Apply fixfct filter to the Unicode object self and return a
				2506	reference to the modified object */
				2507
				2508	static
				2509	PyObject fixup(PyUnicodeObject self,
				2510	int (fixfct)(PyUnicodeObject s))
				2511	{
				2512
				2513	PyUnicodeObject *u;
				2514
				2515	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2516	self->length);
				2517	if (u == NULL)
				2518	return NULL;
				2519	if (!fixfct(u)) {
				2520	/* fixfct should return TRUE if it modified the buffer. If
				2521	FALSE, return a reference to the original buffer instead
				2522	(to save space, not time) */
				2523	Py_INCREF(self);
				2524	Py_DECREF(u);
				2525	return (PyObject*) self;
				2526	}
				2527	return (PyObject*) u;
				2528	}
				2529
				2530	static
				2531	int fixupper(PyUnicodeObject *self)
				2532	{
				2533	int len = self->length;
				2534	Py_UNICODE *s = self->str;
				2535	int status = 0;
				2536
				2537	while (len-- > 0) {
				2538	register Py_UNICODE ch;
				2539
				2540	ch = Py_UNICODE_TOUPPER(*s);
				2541	if (ch != *s) {
				2542	status = 1;
				2543	*s = ch;
				2544	}
				2545	s++;
				2546	}
				2547
				2548	return status;
				2549	}
				2550
				2551	static
				2552	int fixlower(PyUnicodeObject *self)
				2553	{
				2554	int len = self->length;
				2555	Py_UNICODE *s = self->str;
				2556	int status = 0;
				2557
				2558	while (len-- > 0) {
				2559	register Py_UNICODE ch;
				2560
				2561	ch = Py_UNICODE_TOLOWER(*s);
				2562	if (ch != *s) {
				2563	status = 1;
				2564	*s = ch;
				2565	}
				2566	s++;
				2567	}
				2568
				2569	return status;
				2570	}
				2571
				2572	static
				2573	int fixswapcase(PyUnicodeObject *self)
				2574	{
				2575	int len = self->length;
				2576	Py_UNICODE *s = self->str;
				2577	int status = 0;
				2578
				2579	while (len-- > 0) {
				2580	if (Py_UNICODE_ISUPPER(*s)) {
				2581	s = Py_UNICODE_TOLOWER(s);
				2582	status = 1;
				2583	} else if (Py_UNICODE_ISLOWER(*s)) {
				2584	s = Py_UNICODE_TOUPPER(s);
				2585	status = 1;
				2586	}
				2587	s++;
				2588	}
				2589
				2590	return status;
				2591	}
				2592
				2593	static
				2594	int fixcapitalize(PyUnicodeObject *self)
				2595	{
				2596	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2597	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2598	return 1;
				2599	}
				2600	return 0;
				2601	}
				2602
				2603	static
				2604	int fixtitle(PyUnicodeObject *self)
				2605	{
				2606	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2607	register Py_UNICODE *e;
				2608	int previous_is_cased;
				2609
				2610	/* Shortcut for single character strings */
				2611	if (PyUnicode_GET_SIZE(self) == 1) {
				2612	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2613	if (*p != ch) {
				2614	*p = ch;
				2615	return 1;
				2616	}
				2617	else
				2618	return 0;
				2619	}
				2620
				2621	e = p + PyUnicode_GET_SIZE(self);
				2622	previous_is_cased = 0;
				2623	for (; p < e; p++) {
				2624	register const Py_UNICODE ch = *p;
				2625
				2626	if (previous_is_cased)
				2627	*p = Py_UNICODE_TOLOWER(ch);
				2628	else
				2629	*p = Py_UNICODE_TOTITLE(ch);
				2630
				2631	if (Py_UNICODE_ISLOWER(ch) \|\|
				2632	Py_UNICODE_ISUPPER(ch) \|\|
				2633	Py_UNICODE_ISTITLE(ch))
				2634	previous_is_cased = 1;
				2635	else
				2636	previous_is_cased = 0;
				2637	}
				2638	return 1;
				2639	}
				2640
				2641	PyObject PyUnicode_Join(PyObject separator,
				2642	PyObject *seq)
				2643	{
				2644	Py_UNICODE *sep;
				2645	int seplen;
				2646	PyUnicodeObject *res = NULL;
				2647	int reslen = 0;
				2648	Py_UNICODE *p;
				2649	int seqlen = 0;
				2650	int sz = 100;
				2651	int i;
				2652
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame^]	2653	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2654	if (seqlen < 0 && PyErr_Occurred())
				2655	return NULL;
				2656
				2657	if (separator == NULL) {
				2658	Py_UNICODE blank = ' ';
				2659	sep = &blank;
				2660	seplen = 1;
				2661	}
				2662	else {
				2663	separator = PyUnicode_FromObject(separator);
				2664	if (separator == NULL)
				2665	return NULL;
				2666	sep = PyUnicode_AS_UNICODE(separator);
				2667	seplen = PyUnicode_GET_SIZE(separator);
				2668	}
				2669
				2670	res = _PyUnicode_New(sz);
				2671	if (res == NULL)
				2672	goto onError;
				2673	p = PyUnicode_AS_UNICODE(res);
				2674	reslen = 0;
				2675
				2676	for (i = 0; i < seqlen; i++) {
				2677	int itemlen;
				2678	PyObject *item;
				2679
				2680	item = PySequence_GetItem(seq, i);
				2681	if (item == NULL)
				2682	goto onError;
				2683	if (!PyUnicode_Check(item)) {
				2684	PyObject *v;
				2685	v = PyUnicode_FromObject(item);
				2686	Py_DECREF(item);
				2687	item = v;
				2688	if (item == NULL)
				2689	goto onError;
				2690	}
				2691	itemlen = PyUnicode_GET_SIZE(item);
				2692	while (reslen + itemlen + seplen >= sz) {
				2693	if (_PyUnicode_Resize(res, sz*2))
				2694	goto onError;
				2695	sz *= 2;
				2696	p = PyUnicode_AS_UNICODE(res) + reslen;
				2697	}
				2698	if (i > 0) {
				2699	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2700	p += seplen;
				2701	reslen += seplen;
				2702	}
				2703	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2704	p += itemlen;
				2705	reslen += itemlen;
				2706	Py_DECREF(item);
				2707	}
				2708	if (_PyUnicode_Resize(res, reslen))
				2709	goto onError;
				2710
				2711	Py_XDECREF(separator);
				2712	return (PyObject *)res;
				2713
				2714	onError:
				2715	Py_XDECREF(separator);
				2716	Py_DECREF(res);
				2717	return NULL;
				2718	}
				2719
				2720	static
				2721	PyUnicodeObject pad(PyUnicodeObject self,
				2722	int left,
				2723	int right,
				2724	Py_UNICODE fill)
				2725	{
				2726	PyUnicodeObject *u;
				2727
				2728	if (left < 0)
				2729	left = 0;
				2730	if (right < 0)
				2731	right = 0;
				2732
				2733	if (left == 0 && right == 0) {
				2734	Py_INCREF(self);
				2735	return self;
				2736	}
				2737
				2738	u = _PyUnicode_New(left + self->length + right);
				2739	if (u) {
				2740	if (left)
				2741	Py_UNICODE_FILL(u->str, fill, left);
				2742	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2743	if (right)
				2744	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2745	}
				2746
				2747	return u;
				2748	}
				2749
				2750	#define SPLIT_APPEND(data, left, right) \
				2751	str = PyUnicode_FromUnicode(data + left, right - left); \
				2752	if (!str) \
				2753	goto onError; \
				2754	if (PyList_Append(list, str)) { \
				2755	Py_DECREF(str); \
				2756	goto onError; \
				2757	} \
				2758	else \
				2759	Py_DECREF(str);
				2760
				2761	static
				2762	PyObject split_whitespace(PyUnicodeObject self,
				2763	PyObject *list,
				2764	int maxcount)
				2765	{
				2766	register int i;
				2767	register int j;
				2768	int len = self->length;
				2769	PyObject *str;
				2770
				2771	for (i = j = 0; i < len; ) {
				2772	/* find a token */
				2773	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2774	i++;
				2775	j = i;
				2776	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2777	i++;
				2778	if (j < i) {
				2779	if (maxcount-- <= 0)
				2780	break;
				2781	SPLIT_APPEND(self->str, j, i);
				2782	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2783	i++;
				2784	j = i;
				2785	}
				2786	}
				2787	if (j < len) {
				2788	SPLIT_APPEND(self->str, j, len);
				2789	}
				2790	return list;
				2791
				2792	onError:
				2793	Py_DECREF(list);
				2794	return NULL;
				2795	}
				2796
				2797	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2798	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2799	{
				2800	register int i;
				2801	register int j;
				2802	int len;
				2803	PyObject *list;
				2804	PyObject *str;
				2805	Py_UNICODE *data;
				2806
				2807	string = PyUnicode_FromObject(string);
				2808	if (string == NULL)
				2809	return NULL;
				2810	data = PyUnicode_AS_UNICODE(string);
				2811	len = PyUnicode_GET_SIZE(string);
				2812
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2813	list = PyList_New(0);
				2814	if (!list)
				2815	goto onError;
				2816
				2817	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2818	int eol;
				2819
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2820	/* Find a line and append it */
				2821	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2822	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2823
				2824	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2825	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2826	if (i < len) {
				2827	if (data[i] == '\r' && i + 1 < len &&
				2828	data[i+1] == '\n')
				2829	i += 2;
				2830	else
				2831	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2832	if (keepends)
				2833	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2834	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2835	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2836	j = i;
				2837	}
				2838	if (j < len) {
				2839	SPLIT_APPEND(data, j, len);
				2840	}
				2841
				2842	Py_DECREF(string);
				2843	return list;
				2844
				2845	onError:
				2846	Py_DECREF(list);
				2847	Py_DECREF(string);
				2848	return NULL;
				2849	}
				2850
				2851	static
				2852	PyObject split_char(PyUnicodeObject self,
				2853	PyObject *list,
				2854	Py_UNICODE ch,
				2855	int maxcount)
				2856	{
				2857	register int i;
				2858	register int j;
				2859	int len = self->length;
				2860	PyObject *str;
				2861
				2862	for (i = j = 0; i < len; ) {
				2863	if (self->str[i] == ch) {
				2864	if (maxcount-- <= 0)
				2865	break;
				2866	SPLIT_APPEND(self->str, j, i);
				2867	i = j = i + 1;
				2868	} else
				2869	i++;
				2870	}
				2871	if (j <= len) {
				2872	SPLIT_APPEND(self->str, j, len);
				2873	}
				2874	return list;
				2875
				2876	onError:
				2877	Py_DECREF(list);
				2878	return NULL;
				2879	}
				2880
				2881	static
				2882	PyObject split_substring(PyUnicodeObject self,
				2883	PyObject *list,
				2884	PyUnicodeObject *substring,
				2885	int maxcount)
				2886	{
				2887	register int i;
				2888	register int j;
				2889	int len = self->length;
				2890	int sublen = substring->length;
				2891	PyObject *str;
				2892
				2893	for (i = j = 0; i < len - sublen; ) {
				2894	if (Py_UNICODE_MATCH(self, i, substring)) {
				2895	if (maxcount-- <= 0)
				2896	break;
				2897	SPLIT_APPEND(self->str, j, i);
				2898	i = j = i + sublen;
				2899	} else
				2900	i++;
				2901	}
				2902	if (j <= len) {
				2903	SPLIT_APPEND(self->str, j, len);
				2904	}
				2905	return list;
				2906
				2907	onError:
				2908	Py_DECREF(list);
				2909	return NULL;
				2910	}
				2911
				2912	#undef SPLIT_APPEND
				2913
				2914	static
				2915	PyObject split(PyUnicodeObject self,
				2916	PyUnicodeObject *substring,
				2917	int maxcount)
				2918	{
				2919	PyObject *list;
				2920
				2921	if (maxcount < 0)
				2922	maxcount = INT_MAX;
				2923
				2924	list = PyList_New(0);
				2925	if (!list)
				2926	return NULL;
				2927
				2928	if (substring == NULL)
				2929	return split_whitespace(self,list,maxcount);
				2930
				2931	else if (substring->length == 1)
				2932	return split_char(self,list,substring->str[0],maxcount);
				2933
				2934	else if (substring->length == 0) {
				2935	Py_DECREF(list);
				2936	PyErr_SetString(PyExc_ValueError, "empty separator");
				2937	return NULL;
				2938	}
				2939	else
				2940	return split_substring(self,list,substring,maxcount);
				2941	}
				2942
				2943	static
				2944	PyObject strip(PyUnicodeObject self,
				2945	int left,
				2946	int right)
				2947	{
				2948	Py_UNICODE *p = self->str;
				2949	int start = 0;
				2950	int end = self->length;
				2951
				2952	if (left)
				2953	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2954	start++;
				2955
				2956	if (right)
				2957	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2958	end--;
				2959
				2960	if (start == 0 && end == self->length) {
				2961	/* couldn't strip anything off, return original string */
				2962	Py_INCREF(self);
				2963	return (PyObject*) self;
				2964	}
				2965
				2966	return (PyObject*) PyUnicode_FromUnicode(
				2967	self->str + start,
				2968	end - start
				2969	);
				2970	}
				2971
				2972	static
				2973	PyObject replace(PyUnicodeObject self,
				2974	PyUnicodeObject *str1,
				2975	PyUnicodeObject *str2,
				2976	int maxcount)
				2977	{
				2978	PyUnicodeObject *u;
				2979
				2980	if (maxcount < 0)
				2981	maxcount = INT_MAX;
				2982
				2983	if (str1->length == 1 && str2->length == 1) {
				2984	int i;
				2985
				2986	/* replace characters */
				2987	if (!findchar(self->str, self->length, str1->str[0])) {
				2988	/* nothing to replace, return original string */
				2989	Py_INCREF(self);
				2990	u = self;
				2991	} else {
				2992	Py_UNICODE u1 = str1->str[0];
				2993	Py_UNICODE u2 = str2->str[0];
				2994
				2995	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2996	self->str,
				2997	self->length
				2998	);
				2999	if (u)
				3000	for (i = 0; i < u->length; i++)
				3001	if (u->str[i] == u1) {
				3002	if (--maxcount < 0)
				3003	break;
				3004	u->str[i] = u2;
				3005	}
				3006	}
				3007
				3008	} else {
				3009	int n, i;
				3010	Py_UNICODE *p;
				3011
				3012	/* replace strings */
				3013	n = count(self, 0, self->length, str1);
				3014	if (n > maxcount)
				3015	n = maxcount;
				3016	if (n == 0) {
				3017	/* nothing to replace, return original string */
				3018	Py_INCREF(self);
				3019	u = self;
				3020	} else {
				3021	u = _PyUnicode_New(
				3022	self->length + n * (str2->length - str1->length));
				3023	if (u) {
				3024	i = 0;
				3025	p = u->str;
				3026	while (i <= self->length - str1->length)
				3027	if (Py_UNICODE_MATCH(self, i, str1)) {
				3028	/* replace string segment */
				3029	Py_UNICODE_COPY(p, str2->str, str2->length);
				3030	p += str2->length;
				3031	i += str1->length;
				3032	if (--n <= 0) {
				3033	/* copy remaining part */
				3034	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3035	break;
				3036	}
				3037	} else
				3038	*p++ = self->str[i++];
				3039	}
				3040	}
				3041	}
				3042
				3043	return (PyObject *) u;
				3044	}
				3045
				3046	/* --- Unicode Object Methods --------------------------------------------- */
				3047
				3048	static char title__doc__[] =
				3049	"S.title() -> unicode\n\
				3050	\n\
				3051	Return a titlecased version of S, i.e. words start with title case\n\
				3052	characters, all remaining cased characters have lower case.";
				3053
				3054	static PyObject*
				3055	unicode_title(PyUnicodeObject self, PyObject args)
				3056	{
				3057	if (!PyArg_NoArgs(args))
				3058	return NULL;
				3059	return fixup(self, fixtitle);
				3060	}
				3061
				3062	static char capitalize__doc__[] =
				3063	"S.capitalize() -> unicode\n\
				3064	\n\
				3065	Return a capitalized version of S, i.e. make the first character\n\
				3066	have upper case.";
				3067
				3068	static PyObject*
				3069	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3070	{
				3071	if (!PyArg_NoArgs(args))
				3072	return NULL;
				3073	return fixup(self, fixcapitalize);
				3074	}
				3075
				3076	#if 0
				3077	static char capwords__doc__[] =
				3078	"S.capwords() -> unicode\n\
				3079	\n\
				3080	Apply .capitalize() to all words in S and return the result with\n\
				3081	normalized whitespace (all whitespace strings are replaced by ' ').";
				3082
				3083	static PyObject*
				3084	unicode_capwords(PyUnicodeObject self, PyObject args)
				3085	{
				3086	PyObject *list;
				3087	PyObject *item;
				3088	int i;
				3089
				3090	if (!PyArg_NoArgs(args))
				3091	return NULL;
				3092
				3093	/* Split into words */
				3094	list = split(self, NULL, -1);
				3095	if (!list)
				3096	return NULL;
				3097
				3098	/* Capitalize each word */
				3099	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3100	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3101	fixcapitalize);
				3102	if (item == NULL)
				3103	goto onError;
				3104	Py_DECREF(PyList_GET_ITEM(list, i));
				3105	PyList_SET_ITEM(list, i, item);
				3106	}
				3107
				3108	/* Join the words to form a new string */
				3109	item = PyUnicode_Join(NULL, list);
				3110
				3111	onError:
				3112	Py_DECREF(list);
				3113	return (PyObject *)item;
				3114	}
				3115	#endif
				3116
				3117	static char center__doc__[] =
				3118	"S.center(width) -> unicode\n\
				3119	\n\
				3120	Return S centered in a Unicode string of length width. Padding is done\n\
				3121	using spaces.";
				3122
				3123	static PyObject *
				3124	unicode_center(PyUnicodeObject self, PyObject args)
				3125	{
				3126	int marg, left;
				3127	int width;
				3128
				3129	if (!PyArg_ParseTuple(args, "i:center", &width))
				3130	return NULL;
				3131
				3132	if (self->length >= width) {
				3133	Py_INCREF(self);
				3134	return (PyObject*) self;
				3135	}
				3136
				3137	marg = width - self->length;
				3138	left = marg / 2 + (marg & width & 1);
				3139
				3140	return (PyObject*) pad(self, left, marg - left, ' ');
				3141	}
				3142
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3143	/* speedy UTF-16 code point order comparison */
				3144	/* gleaned from: */
				3145	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3146
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3147	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3148	{
				3149	0, 0, 0, 0, 0, 0, 0, 0,
				3150	0, 0, 0, 0, 0, 0, 0, 0,
				3151	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3152	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3153	};
				3154
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3155	static int
				3156	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3157	{
				3158	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3159
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3160	Py_UNICODE *s1 = str1->str;
				3161	Py_UNICODE *s2 = str2->str;
				3162
				3163	len1 = str1->length;
				3164	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3165
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3166	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3167	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3168	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3169
				3170	c1 = *s1++;
				3171	c2 = *s2++;
				3172	if (c1 > (1<<11) * 26)
				3173	c1 += utf16Fixup[c1>>11];
				3174	if (c2 > (1<<11) * 26)
				3175	c2 += utf16Fixup[c2>>11];
				3176
				3177	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3178	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3179	if (diff)
				3180	return (diff < 0) ? -1 : (diff != 0);
				3181	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3182	}
				3183
				3184	return (len1 < len2) ? -1 : (len1 != len2);
				3185	}
				3186
				3187	int PyUnicode_Compare(PyObject *left,
				3188	PyObject *right)
				3189	{
				3190	PyUnicodeObject u = NULL, v = NULL;
				3191	int result;
				3192
				3193	/* Coerce the two arguments */
				3194	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3195	if (u == NULL)
				3196	goto onError;
				3197	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3198	if (v == NULL)
				3199	goto onError;
				3200
				3201	/* Shortcut for emtpy or interned objects */
				3202	if (v == u) {
				3203	Py_DECREF(u);
				3204	Py_DECREF(v);
				3205	return 0;
				3206	}
				3207
				3208	result = unicode_compare(u, v);
				3209
				3210	Py_DECREF(u);
				3211	Py_DECREF(v);
				3212	return result;
				3213
				3214	onError:
				3215	Py_XDECREF(u);
				3216	Py_XDECREF(v);
				3217	return -1;
				3218	}
				3219
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3220	int PyUnicode_Contains(PyObject *container,
				3221	PyObject *element)
				3222	{
				3223	PyUnicodeObject u = NULL, v = NULL;
				3224	int result;
				3225	register const Py_UNICODE p, e;
				3226	register Py_UNICODE ch;
				3227
				3228	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3229	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3230	if (v == NULL) {
				3231	PyErr_SetString(PyExc_TypeError,
				3232	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3233	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3234	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3235	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3236	if (u == NULL) {
				3237	Py_DECREF(v);
				3238	goto onError;
				3239	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3240
				3241	/* Check v in u */
				3242	if (PyUnicode_GET_SIZE(v) != 1) {
				3243	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3244	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3245	goto onError;
				3246	}
				3247	ch = *PyUnicode_AS_UNICODE(v);
				3248	p = PyUnicode_AS_UNICODE(u);
				3249	e = p + PyUnicode_GET_SIZE(u);
				3250	result = 0;
				3251	while (p < e) {
				3252	if (*p++ == ch) {
				3253	result = 1;
				3254	break;
				3255	}
				3256	}
				3257
				3258	Py_DECREF(u);
				3259	Py_DECREF(v);
				3260	return result;
				3261
				3262	onError:
				3263	Py_XDECREF(u);
				3264	Py_XDECREF(v);
				3265	return -1;
				3266	}
				3267
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3268	/* Concat to string or Unicode object giving a new Unicode object. */
				3269
				3270	PyObject PyUnicode_Concat(PyObject left,
				3271	PyObject *right)
				3272	{
				3273	PyUnicodeObject u = NULL, v = NULL, *w;
				3274
				3275	/* Coerce the two arguments */
				3276	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3277	if (u == NULL)
				3278	goto onError;
				3279	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3280	if (v == NULL)
				3281	goto onError;
				3282
				3283	/* Shortcuts */
				3284	if (v == unicode_empty) {
				3285	Py_DECREF(v);
				3286	return (PyObject *)u;
				3287	}
				3288	if (u == unicode_empty) {
				3289	Py_DECREF(u);
				3290	return (PyObject *)v;
				3291	}
				3292
				3293	/* Concat the two Unicode strings */
				3294	w = _PyUnicode_New(u->length + v->length);
				3295	if (w == NULL)
				3296	goto onError;
				3297	Py_UNICODE_COPY(w->str, u->str, u->length);
				3298	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3299
				3300	Py_DECREF(u);
				3301	Py_DECREF(v);
				3302	return (PyObject *)w;
				3303
				3304	onError:
				3305	Py_XDECREF(u);
				3306	Py_XDECREF(v);
				3307	return NULL;
				3308	}
				3309
				3310	static char count__doc__[] =
				3311	"S.count(sub[, start[, end]]) -> int\n\
				3312	\n\
				3313	Return the number of occurrences of substring sub in Unicode string\n\
				3314	S[start:end]. Optional arguments start and end are\n\
				3315	interpreted as in slice notation.";
				3316
				3317	static PyObject *
				3318	unicode_count(PyUnicodeObject self, PyObject args)
				3319	{
				3320	PyUnicodeObject *substring;
				3321	int start = 0;
				3322	int end = INT_MAX;
				3323	PyObject *result;
				3324
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3325	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3326	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3327	return NULL;
				3328
				3329	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3330	(PyObject *)substring);
				3331	if (substring == NULL)
				3332	return NULL;
				3333
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3334	if (start < 0)
				3335	start += self->length;
				3336	if (start < 0)
				3337	start = 0;
				3338	if (end > self->length)
				3339	end = self->length;
				3340	if (end < 0)
				3341	end += self->length;
				3342	if (end < 0)
				3343	end = 0;
				3344
				3345	result = PyInt_FromLong((long) count(self, start, end, substring));
				3346
				3347	Py_DECREF(substring);
				3348	return result;
				3349	}
				3350
				3351	static char encode__doc__[] =
				3352	"S.encode([encoding[,errors]]) -> string\n\
				3353	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3354	Return an encoded string version of S. Default encoding is the current\n\
				3355	default string encoding. errors may be given to set a different error\n\
				3356	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3357	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3358
				3359	static PyObject *
				3360	unicode_encode(PyUnicodeObject self, PyObject args)
				3361	{
				3362	char *encoding = NULL;
				3363	char *errors = NULL;
				3364	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3365	return NULL;
				3366	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3367	}
				3368
				3369	static char expandtabs__doc__[] =
				3370	"S.expandtabs([tabsize]) -> unicode\n\
				3371	\n\
				3372	Return a copy of S where all tab characters are expanded using spaces.\n\
				3373	If tabsize is not given, a tab size of 8 characters is assumed.";
				3374
				3375	static PyObject*
				3376	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3377	{
				3378	Py_UNICODE *e;
				3379	Py_UNICODE *p;
				3380	Py_UNICODE *q;
				3381	int i, j;
				3382	PyUnicodeObject *u;
				3383	int tabsize = 8;
				3384
				3385	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3386	return NULL;
				3387
				3388	/* First pass: determine size of ouput string */
				3389	i = j = 0;
				3390	e = self->str + self->length;
				3391	for (p = self->str; p < e; p++)
				3392	if (*p == '\t') {
				3393	if (tabsize > 0)
				3394	j += tabsize - (j % tabsize);
				3395	}
				3396	else {
				3397	j++;
				3398	if (p == '\n' \|\| p == '\r') {
				3399	i += j;
				3400	j = 0;
				3401	}
				3402	}
				3403
				3404	/* Second pass: create output string and fill it */
				3405	u = _PyUnicode_New(i + j);
				3406	if (!u)
				3407	return NULL;
				3408
				3409	j = 0;
				3410	q = u->str;
				3411
				3412	for (p = self->str; p < e; p++)
				3413	if (*p == '\t') {
				3414	if (tabsize > 0) {
				3415	i = tabsize - (j % tabsize);
				3416	j += i;
				3417	while (i--)
				3418	*q++ = ' ';
				3419	}
				3420	}
				3421	else {
				3422	j++;
				3423	q++ = p;
				3424	if (p == '\n' \|\| p == '\r')
				3425	j = 0;
				3426	}
				3427
				3428	return (PyObject*) u;
				3429	}
				3430
				3431	static char find__doc__[] =
				3432	"S.find(sub [,start [,end]]) -> int\n\
				3433	\n\
				3434	Return the lowest index in S where substring sub is found,\n\
				3435	such that sub is contained within s[start,end]. Optional\n\
				3436	arguments start and end are interpreted as in slice notation.\n\
				3437	\n\
				3438	Return -1 on failure.";
				3439
				3440	static PyObject *
				3441	unicode_find(PyUnicodeObject self, PyObject args)
				3442	{
				3443	PyUnicodeObject *substring;
				3444	int start = 0;
				3445	int end = INT_MAX;
				3446	PyObject *result;
				3447
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3448	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3449	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3450	return NULL;
				3451	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3452	(PyObject *)substring);
				3453	if (substring == NULL)
				3454	return NULL;
				3455
				3456	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3457
				3458	Py_DECREF(substring);
				3459	return result;
				3460	}
				3461
				3462	static PyObject *
				3463	unicode_getitem(PyUnicodeObject *self, int index)
				3464	{
				3465	if (index < 0 \|\| index >= self->length) {
				3466	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3467	return NULL;
				3468	}
				3469
				3470	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3471	}
				3472
				3473	static long
				3474	unicode_hash(PyUnicodeObject *self)
				3475	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3476	/* Since Unicode objects compare equal to their ASCII string
				3477	counterparts, they should use the individual character values
				3478	as basis for their hash value. This is needed to assure that
				3479	strings and Unicode objects behave in the same way as
				3480	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3481
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3482	register int len;
				3483	register Py_UNICODE *p;
				3484	register long x;
				3485
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3486	if (self->hash != -1)
				3487	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3488	len = PyUnicode_GET_SIZE(self);
				3489	p = PyUnicode_AS_UNICODE(self);
				3490	x = *p << 7;
				3491	while (--len >= 0)
				3492	x = (1000003x) ^ p++;
				3493	x ^= PyUnicode_GET_SIZE(self);
				3494	if (x == -1)
				3495	x = -2;
				3496	self->hash = x;
				3497	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3498	}
				3499
				3500	static char index__doc__[] =
				3501	"S.index(sub [,start [,end]]) -> int\n\
				3502	\n\
				3503	Like S.find() but raise ValueError when the substring is not found.";
				3504
				3505	static PyObject *
				3506	unicode_index(PyUnicodeObject self, PyObject args)
				3507	{
				3508	int result;
				3509	PyUnicodeObject *substring;
				3510	int start = 0;
				3511	int end = INT_MAX;
				3512
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3513	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3514	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3515	return NULL;
				3516
				3517	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3518	(PyObject *)substring);
				3519	if (substring == NULL)
				3520	return NULL;
				3521
				3522	result = findstring(self, substring, start, end, 1);
				3523
				3524	Py_DECREF(substring);
				3525	if (result < 0) {
				3526	PyErr_SetString(PyExc_ValueError, "substring not found");
				3527	return NULL;
				3528	}
				3529	return PyInt_FromLong(result);
				3530	}
				3531
				3532	static char islower__doc__[] =
				3533	"S.islower() -> int\n\
				3534	\n\
				3535	Return 1 if all cased characters in S are lowercase and there is\n\
				3536	at least one cased character in S, 0 otherwise.";
				3537
				3538	static PyObject*
				3539	unicode_islower(PyUnicodeObject self, PyObject args)
				3540	{
				3541	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3542	register const Py_UNICODE *e;
				3543	int cased;
				3544
				3545	if (!PyArg_NoArgs(args))
				3546	return NULL;
				3547
				3548	/* Shortcut for single character strings */
				3549	if (PyUnicode_GET_SIZE(self) == 1)
				3550	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3551
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3552	/* Special case for empty strings */
				3553	if (PyString_GET_SIZE(self) == 0)
				3554	return PyInt_FromLong(0);
				3555
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3556	e = p + PyUnicode_GET_SIZE(self);
				3557	cased = 0;
				3558	for (; p < e; p++) {
				3559	register const Py_UNICODE ch = *p;
				3560
				3561	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3562	return PyInt_FromLong(0);
				3563	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3564	cased = 1;
				3565	}
				3566	return PyInt_FromLong(cased);
				3567	}
				3568
				3569	static char isupper__doc__[] =
				3570	"S.isupper() -> int\n\
				3571	\n\
				3572	Return 1 if all cased characters in S are uppercase and there is\n\
				3573	at least one cased character in S, 0 otherwise.";
				3574
				3575	static PyObject*
				3576	unicode_isupper(PyUnicodeObject self, PyObject args)
				3577	{
				3578	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3579	register const Py_UNICODE *e;
				3580	int cased;
				3581
				3582	if (!PyArg_NoArgs(args))
				3583	return NULL;
				3584
				3585	/* Shortcut for single character strings */
				3586	if (PyUnicode_GET_SIZE(self) == 1)
				3587	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3588
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3589	/* Special case for empty strings */
				3590	if (PyString_GET_SIZE(self) == 0)
				3591	return PyInt_FromLong(0);
				3592
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3593	e = p + PyUnicode_GET_SIZE(self);
				3594	cased = 0;
				3595	for (; p < e; p++) {
				3596	register const Py_UNICODE ch = *p;
				3597
				3598	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3599	return PyInt_FromLong(0);
				3600	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3601	cased = 1;
				3602	}
				3603	return PyInt_FromLong(cased);
				3604	}
				3605
				3606	static char istitle__doc__[] =
				3607	"S.istitle() -> int\n\
				3608	\n\
				3609	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3610	may only follow uncased characters and lowercase characters only cased\n\
				3611	ones. Return 0 otherwise.";
				3612
				3613	static PyObject*
				3614	unicode_istitle(PyUnicodeObject self, PyObject args)
				3615	{
				3616	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3617	register const Py_UNICODE *e;
				3618	int cased, previous_is_cased;
				3619
				3620	if (!PyArg_NoArgs(args))
				3621	return NULL;
				3622
				3623	/* Shortcut for single character strings */
				3624	if (PyUnicode_GET_SIZE(self) == 1)
				3625	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3626	(Py_UNICODE_ISUPPER(*p) != 0));
				3627
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3628	/* Special case for empty strings */
				3629	if (PyString_GET_SIZE(self) == 0)
				3630	return PyInt_FromLong(0);
				3631
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3632	e = p + PyUnicode_GET_SIZE(self);
				3633	cased = 0;
				3634	previous_is_cased = 0;
				3635	for (; p < e; p++) {
				3636	register const Py_UNICODE ch = *p;
				3637
				3638	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3639	if (previous_is_cased)
				3640	return PyInt_FromLong(0);
				3641	previous_is_cased = 1;
				3642	cased = 1;
				3643	}
				3644	else if (Py_UNICODE_ISLOWER(ch)) {
				3645	if (!previous_is_cased)
				3646	return PyInt_FromLong(0);
				3647	previous_is_cased = 1;
				3648	cased = 1;
				3649	}
				3650	else
				3651	previous_is_cased = 0;
				3652	}
				3653	return PyInt_FromLong(cased);
				3654	}
				3655
				3656	static char isspace__doc__[] =
				3657	"S.isspace() -> int\n\
				3658	\n\
				3659	Return 1 if there are only whitespace characters in S,\n\
				3660	0 otherwise.";
				3661
				3662	static PyObject*
				3663	unicode_isspace(PyUnicodeObject self, PyObject args)
				3664	{
				3665	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3666	register const Py_UNICODE *e;
				3667
				3668	if (!PyArg_NoArgs(args))
				3669	return NULL;
				3670
				3671	/* Shortcut for single character strings */
				3672	if (PyUnicode_GET_SIZE(self) == 1 &&
				3673	Py_UNICODE_ISSPACE(*p))
				3674	return PyInt_FromLong(1);
				3675
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3676	/* Special case for empty strings */
				3677	if (PyString_GET_SIZE(self) == 0)
				3678	return PyInt_FromLong(0);
				3679
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3680	e = p + PyUnicode_GET_SIZE(self);
				3681	for (; p < e; p++) {
				3682	if (!Py_UNICODE_ISSPACE(*p))
				3683	return PyInt_FromLong(0);
				3684	}
				3685	return PyInt_FromLong(1);
				3686	}
				3687
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3688	static char isalpha__doc__[] =
				3689	"S.isalpha() -> int\n\
				3690	\n\
				3691	Return 1 if all characters in S are alphabetic\n\
				3692	and there is at least one character in S, 0 otherwise.";
				3693
				3694	static PyObject*
				3695	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3696	{
				3697	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3698	register const Py_UNICODE *e;
				3699
				3700	if (!PyArg_NoArgs(args))
				3701	return NULL;
				3702
				3703	/* Shortcut for single character strings */
				3704	if (PyUnicode_GET_SIZE(self) == 1 &&
				3705	Py_UNICODE_ISALPHA(*p))
				3706	return PyInt_FromLong(1);
				3707
				3708	/* Special case for empty strings */
				3709	if (PyString_GET_SIZE(self) == 0)
				3710	return PyInt_FromLong(0);
				3711
				3712	e = p + PyUnicode_GET_SIZE(self);
				3713	for (; p < e; p++) {
				3714	if (!Py_UNICODE_ISALPHA(*p))
				3715	return PyInt_FromLong(0);
				3716	}
				3717	return PyInt_FromLong(1);
				3718	}
				3719
				3720	static char isalnum__doc__[] =
				3721	"S.isalnum() -> int\n\
				3722	\n\
				3723	Return 1 if all characters in S are alphanumeric\n\
				3724	and there is at least one character in S, 0 otherwise.";
				3725
				3726	static PyObject*
				3727	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3728	{
				3729	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3730	register const Py_UNICODE *e;
				3731
				3732	if (!PyArg_NoArgs(args))
				3733	return NULL;
				3734
				3735	/* Shortcut for single character strings */
				3736	if (PyUnicode_GET_SIZE(self) == 1 &&
				3737	Py_UNICODE_ISALNUM(*p))
				3738	return PyInt_FromLong(1);
				3739
				3740	/* Special case for empty strings */
				3741	if (PyString_GET_SIZE(self) == 0)
				3742	return PyInt_FromLong(0);
				3743
				3744	e = p + PyUnicode_GET_SIZE(self);
				3745	for (; p < e; p++) {
				3746	if (!Py_UNICODE_ISALNUM(*p))
				3747	return PyInt_FromLong(0);
				3748	}
				3749	return PyInt_FromLong(1);
				3750	}
				3751
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3752	static char isdecimal__doc__[] =
				3753	"S.isdecimal() -> int\n\
				3754	\n\
				3755	Return 1 if there are only decimal characters in S,\n\
				3756	0 otherwise.";
				3757
				3758	static PyObject*
				3759	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3760	{
				3761	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3762	register const Py_UNICODE *e;
				3763
				3764	if (!PyArg_NoArgs(args))
				3765	return NULL;
				3766
				3767	/* Shortcut for single character strings */
				3768	if (PyUnicode_GET_SIZE(self) == 1 &&
				3769	Py_UNICODE_ISDECIMAL(*p))
				3770	return PyInt_FromLong(1);
				3771
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3772	/* Special case for empty strings */
				3773	if (PyString_GET_SIZE(self) == 0)
				3774	return PyInt_FromLong(0);
				3775
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3776	e = p + PyUnicode_GET_SIZE(self);
				3777	for (; p < e; p++) {
				3778	if (!Py_UNICODE_ISDECIMAL(*p))
				3779	return PyInt_FromLong(0);
				3780	}
				3781	return PyInt_FromLong(1);
				3782	}
				3783
				3784	static char isdigit__doc__[] =
				3785	"S.isdigit() -> int\n\
				3786	\n\
				3787	Return 1 if there are only digit characters in S,\n\
				3788	0 otherwise.";
				3789
				3790	static PyObject*
				3791	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3792	{
				3793	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3794	register const Py_UNICODE *e;
				3795
				3796	if (!PyArg_NoArgs(args))
				3797	return NULL;
				3798
				3799	/* Shortcut for single character strings */
				3800	if (PyUnicode_GET_SIZE(self) == 1 &&
				3801	Py_UNICODE_ISDIGIT(*p))
				3802	return PyInt_FromLong(1);
				3803
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3804	/* Special case for empty strings */
				3805	if (PyString_GET_SIZE(self) == 0)
				3806	return PyInt_FromLong(0);
				3807
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3808	e = p + PyUnicode_GET_SIZE(self);
				3809	for (; p < e; p++) {
				3810	if (!Py_UNICODE_ISDIGIT(*p))
				3811	return PyInt_FromLong(0);
				3812	}
				3813	return PyInt_FromLong(1);
				3814	}
				3815
				3816	static char isnumeric__doc__[] =
				3817	"S.isnumeric() -> int\n\
				3818	\n\
				3819	Return 1 if there are only numeric characters in S,\n\
				3820	0 otherwise.";
				3821
				3822	static PyObject*
				3823	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3824	{
				3825	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3826	register const Py_UNICODE *e;
				3827
				3828	if (!PyArg_NoArgs(args))
				3829	return NULL;
				3830
				3831	/* Shortcut for single character strings */
				3832	if (PyUnicode_GET_SIZE(self) == 1 &&
				3833	Py_UNICODE_ISNUMERIC(*p))
				3834	return PyInt_FromLong(1);
				3835
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3836	/* Special case for empty strings */
				3837	if (PyString_GET_SIZE(self) == 0)
				3838	return PyInt_FromLong(0);
				3839
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3840	e = p + PyUnicode_GET_SIZE(self);
				3841	for (; p < e; p++) {
				3842	if (!Py_UNICODE_ISNUMERIC(*p))
				3843	return PyInt_FromLong(0);
				3844	}
				3845	return PyInt_FromLong(1);
				3846	}
				3847
				3848	static char join__doc__[] =
				3849	"S.join(sequence) -> unicode\n\
				3850	\n\
				3851	Return a string which is the concatenation of the strings in the\n\
				3852	sequence. The separator between elements is S.";
				3853
				3854	static PyObject*
				3855	unicode_join(PyUnicodeObject self, PyObject args)
				3856	{
				3857	PyObject *data;
				3858	if (!PyArg_ParseTuple(args, "O:join", &data))
				3859	return NULL;
				3860
				3861	return PyUnicode_Join((PyObject *)self, data);
				3862	}
				3863
				3864	static int
				3865	unicode_length(PyUnicodeObject *self)
				3866	{
				3867	return self->length;
				3868	}
				3869
				3870	static char ljust__doc__[] =
				3871	"S.ljust(width) -> unicode\n\
				3872	\n\
				3873	Return S left justified in a Unicode string of length width. Padding is\n\
				3874	done using spaces.";
				3875
				3876	static PyObject *
				3877	unicode_ljust(PyUnicodeObject self, PyObject args)
				3878	{
				3879	int width;
				3880	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3881	return NULL;
				3882
				3883	if (self->length >= width) {
				3884	Py_INCREF(self);
				3885	return (PyObject*) self;
				3886	}
				3887
				3888	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3889	}
				3890
				3891	static char lower__doc__[] =
				3892	"S.lower() -> unicode\n\
				3893	\n\
				3894	Return a copy of the string S converted to lowercase.";
				3895
				3896	static PyObject*
				3897	unicode_lower(PyUnicodeObject self, PyObject args)
				3898	{
				3899	if (!PyArg_NoArgs(args))
				3900	return NULL;
				3901	return fixup(self, fixlower);
				3902	}
				3903
				3904	static char lstrip__doc__[] =
				3905	"S.lstrip() -> unicode\n\
				3906	\n\
				3907	Return a copy of the string S with leading whitespace removed.";
				3908
				3909	static PyObject *
				3910	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3911	{
				3912	if (!PyArg_NoArgs(args))
				3913	return NULL;
				3914	return strip(self, 1, 0);
				3915	}
				3916
				3917	static PyObject*
				3918	unicode_repeat(PyUnicodeObject *str, int len)
				3919	{
				3920	PyUnicodeObject *u;
				3921	Py_UNICODE *p;
				3922
				3923	if (len < 0)
				3924	len = 0;
				3925
				3926	if (len == 1) {
				3927	/* no repeat, return original string */
				3928	Py_INCREF(str);
				3929	return (PyObject*) str;
				3930	}
				3931
				3932	u = _PyUnicode_New(len * str->length);
				3933	if (!u)
				3934	return NULL;
				3935
				3936	p = u->str;
				3937
				3938	while (len-- > 0) {
				3939	Py_UNICODE_COPY(p, str->str, str->length);
				3940	p += str->length;
				3941	}
				3942
				3943	return (PyObject*) u;
				3944	}
				3945
				3946	PyObject PyUnicode_Replace(PyObject obj,
				3947	PyObject *subobj,
				3948	PyObject *replobj,
				3949	int maxcount)
				3950	{
				3951	PyObject *self;
				3952	PyObject *str1;
				3953	PyObject *str2;
				3954	PyObject *result;
				3955
				3956	self = PyUnicode_FromObject(obj);
				3957	if (self == NULL)
				3958	return NULL;
				3959	str1 = PyUnicode_FromObject(subobj);
				3960	if (str1 == NULL) {
				3961	Py_DECREF(self);
				3962	return NULL;
				3963	}
				3964	str2 = PyUnicode_FromObject(replobj);
				3965	if (str2 == NULL) {
				3966	Py_DECREF(self);
				3967	Py_DECREF(str1);
				3968	return NULL;
				3969	}
				3970	result = replace((PyUnicodeObject *)self,
				3971	(PyUnicodeObject *)str1,
				3972	(PyUnicodeObject *)str2,
				3973	maxcount);
				3974	Py_DECREF(self);
				3975	Py_DECREF(str1);
				3976	Py_DECREF(str2);
				3977	return result;
				3978	}
				3979
				3980	static char replace__doc__[] =
				3981	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3982	\n\
				3983	Return a copy of S with all occurrences of substring\n\
				3984	old replaced by new. If the optional argument maxsplit is\n\
				3985	given, only the first maxsplit occurrences are replaced.";
				3986
				3987	static PyObject*
				3988	unicode_replace(PyUnicodeObject self, PyObject args)
				3989	{
				3990	PyUnicodeObject *str1;
				3991	PyUnicodeObject *str2;
				3992	int maxcount = -1;
				3993	PyObject *result;
				3994
				3995	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3996	return NULL;
				3997	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3998	if (str1 == NULL)
				3999	return NULL;
				4000	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4001	if (str2 == NULL)
				4002	return NULL;
				4003
				4004	result = replace(self, str1, str2, maxcount);
				4005
				4006	Py_DECREF(str1);
				4007	Py_DECREF(str2);
				4008	return result;
				4009	}
				4010
				4011	static
				4012	PyObject unicode_repr(PyObject unicode)
				4013	{
				4014	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4015	PyUnicode_GET_SIZE(unicode),
				4016	1);
				4017	}
				4018
				4019	static char rfind__doc__[] =
				4020	"S.rfind(sub [,start [,end]]) -> int\n\
				4021	\n\
				4022	Return the highest index in S where substring sub is found,\n\
				4023	such that sub is contained within s[start,end]. Optional\n\
				4024	arguments start and end are interpreted as in slice notation.\n\
				4025	\n\
				4026	Return -1 on failure.";
				4027
				4028	static PyObject *
				4029	unicode_rfind(PyUnicodeObject self, PyObject args)
				4030	{
				4031	PyUnicodeObject *substring;
				4032	int start = 0;
				4033	int end = INT_MAX;
				4034	PyObject *result;
				4035
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4036	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4037	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4038	return NULL;
				4039	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4040	(PyObject *)substring);
				4041	if (substring == NULL)
				4042	return NULL;
				4043
				4044	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4045
				4046	Py_DECREF(substring);
				4047	return result;
				4048	}
				4049
				4050	static char rindex__doc__[] =
				4051	"S.rindex(sub [,start [,end]]) -> int\n\
				4052	\n\
				4053	Like S.rfind() but raise ValueError when the substring is not found.";
				4054
				4055	static PyObject *
				4056	unicode_rindex(PyUnicodeObject self, PyObject args)
				4057	{
				4058	int result;
				4059	PyUnicodeObject *substring;
				4060	int start = 0;
				4061	int end = INT_MAX;
				4062
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4063	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4064	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4065	return NULL;
				4066	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4067	(PyObject *)substring);
				4068	if (substring == NULL)
				4069	return NULL;
				4070
				4071	result = findstring(self, substring, start, end, -1);
				4072
				4073	Py_DECREF(substring);
				4074	if (result < 0) {
				4075	PyErr_SetString(PyExc_ValueError, "substring not found");
				4076	return NULL;
				4077	}
				4078	return PyInt_FromLong(result);
				4079	}
				4080
				4081	static char rjust__doc__[] =
				4082	"S.rjust(width) -> unicode\n\
				4083	\n\
				4084	Return S right justified in a Unicode string of length width. Padding is\n\
				4085	done using spaces.";
				4086
				4087	static PyObject *
				4088	unicode_rjust(PyUnicodeObject self, PyObject args)
				4089	{
				4090	int width;
				4091	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4092	return NULL;
				4093
				4094	if (self->length >= width) {
				4095	Py_INCREF(self);
				4096	return (PyObject*) self;
				4097	}
				4098
				4099	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4100	}
				4101
				4102	static char rstrip__doc__[] =
				4103	"S.rstrip() -> unicode\n\
				4104	\n\
				4105	Return a copy of the string S with trailing whitespace removed.";
				4106
				4107	static PyObject *
				4108	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4109	{
				4110	if (!PyArg_NoArgs(args))
				4111	return NULL;
				4112	return strip(self, 0, 1);
				4113	}
				4114
				4115	static PyObject*
				4116	unicode_slice(PyUnicodeObject *self, int start, int end)
				4117	{
				4118	/* standard clamping */
				4119	if (start < 0)
				4120	start = 0;
				4121	if (end < 0)
				4122	end = 0;
				4123	if (end > self->length)
				4124	end = self->length;
				4125	if (start == 0 && end == self->length) {
				4126	/* full slice, return original string */
				4127	Py_INCREF(self);
				4128	return (PyObject*) self;
				4129	}
				4130	if (start > end)
				4131	start = end;
				4132	/* copy slice */
				4133	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4134	end - start);
				4135	}
				4136
				4137	PyObject PyUnicode_Split(PyObject s,
				4138	PyObject *sep,
				4139	int maxsplit)
				4140	{
				4141	PyObject *result;
				4142
				4143	s = PyUnicode_FromObject(s);
				4144	if (s == NULL)
				4145	return NULL;
				4146	if (sep != NULL) {
				4147	sep = PyUnicode_FromObject(sep);
				4148	if (sep == NULL) {
				4149	Py_DECREF(s);
				4150	return NULL;
				4151	}
				4152	}
				4153
				4154	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4155
				4156	Py_DECREF(s);
				4157	Py_XDECREF(sep);
				4158	return result;
				4159	}
				4160
				4161	static char split__doc__[] =
				4162	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4163	\n\
				4164	Return a list of the words in S, using sep as the\n\
				4165	delimiter string. If maxsplit is given, at most maxsplit\n\
				4166	splits are done. If sep is not specified, any whitespace string\n\
				4167	is a separator.";
				4168
				4169	static PyObject*
				4170	unicode_split(PyUnicodeObject self, PyObject args)
				4171	{
				4172	PyObject *substring = Py_None;
				4173	int maxcount = -1;
				4174
				4175	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4176	return NULL;
				4177
				4178	if (substring == Py_None)
				4179	return split(self, NULL, maxcount);
				4180	else if (PyUnicode_Check(substring))
				4181	return split(self, (PyUnicodeObject *)substring, maxcount);
				4182	else
				4183	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4184	}
				4185
				4186	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4187	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4188	\n\
				4189	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4190	Line breaks are not included in the resulting list unless keepends\n\
				4191	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4192
				4193	static PyObject*
				4194	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4195	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4196	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4197
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4198	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4199	return NULL;
				4200
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4201	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4202	}
				4203
				4204	static
				4205	PyObject unicode_str(PyUnicodeObject self)
				4206	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4207	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4208	}
				4209
				4210	static char strip__doc__[] =
				4211	"S.strip() -> unicode\n\
				4212	\n\
				4213	Return a copy of S with leading and trailing whitespace removed.";
				4214
				4215	static PyObject *
				4216	unicode_strip(PyUnicodeObject self, PyObject args)
				4217	{
				4218	if (!PyArg_NoArgs(args))
				4219	return NULL;
				4220	return strip(self, 1, 1);
				4221	}
				4222
				4223	static char swapcase__doc__[] =
				4224	"S.swapcase() -> unicode\n\
				4225	\n\
				4226	Return a copy of S with uppercase characters converted to lowercase\n\
				4227	and vice versa.";
				4228
				4229	static PyObject*
				4230	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4231	{
				4232	if (!PyArg_NoArgs(args))
				4233	return NULL;
				4234	return fixup(self, fixswapcase);
				4235	}
				4236
				4237	static char translate__doc__[] =
				4238	"S.translate(table) -> unicode\n\
				4239	\n\
				4240	Return a copy of the string S, where all characters have been mapped\n\
				4241	through the given translation table, which must be a mapping of\n\
				4242	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4243	are left untouched. Characters mapped to None are deleted.";
				4244
				4245	static PyObject*
				4246	unicode_translate(PyUnicodeObject self, PyObject args)
				4247	{
				4248	PyObject *table;
				4249
				4250	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4251	return NULL;
				4252	return PyUnicode_TranslateCharmap(self->str,
				4253	self->length,
				4254	table,
				4255	"ignore");
				4256	}
				4257
				4258	static char upper__doc__[] =
				4259	"S.upper() -> unicode\n\
				4260	\n\
				4261	Return a copy of S converted to uppercase.";
				4262
				4263	static PyObject*
				4264	unicode_upper(PyUnicodeObject self, PyObject args)
				4265	{
				4266	if (!PyArg_NoArgs(args))
				4267	return NULL;
				4268	return fixup(self, fixupper);
				4269	}
				4270
				4271	#if 0
				4272	static char zfill__doc__[] =
				4273	"S.zfill(width) -> unicode\n\
				4274	\n\
				4275	Pad a numeric string x with zeros on the left, to fill a field\n\
				4276	of the specified width. The string x is never truncated.";
				4277
				4278	static PyObject *
				4279	unicode_zfill(PyUnicodeObject self, PyObject args)
				4280	{
				4281	int fill;
				4282	PyUnicodeObject *u;
				4283
				4284	int width;
				4285	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4286	return NULL;
				4287
				4288	if (self->length >= width) {
				4289	Py_INCREF(self);
				4290	return (PyObject*) self;
				4291	}
				4292
				4293	fill = width - self->length;
				4294
				4295	u = pad(self, fill, 0, '0');
				4296
				4297	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4298	/* move sign to beginning of string */
				4299	u->str[0] = u->str[fill];
				4300	u->str[fill] = '0';
				4301	}
				4302
				4303	return (PyObject*) u;
				4304	}
				4305	#endif
				4306
				4307	#if 0
				4308	static PyObject*
				4309	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4310	{
				4311	if (!PyArg_NoArgs(args))
				4312	return NULL;
				4313	return PyInt_FromLong(unicode_freelist_size);
				4314	}
				4315	#endif
				4316
				4317	static char startswith__doc__[] =
				4318	"S.startswith(prefix[, start[, end]]) -> int\n\
				4319	\n\
				4320	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4321	optional start, test S beginning at that position. With optional end, stop\n\
				4322	comparing S at that position.";
				4323
				4324	static PyObject *
				4325	unicode_startswith(PyUnicodeObject *self,
				4326	PyObject *args)
				4327	{
				4328	PyUnicodeObject *substring;
				4329	int start = 0;
				4330	int end = INT_MAX;
				4331	PyObject *result;
				4332
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4333	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4334	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4335	return NULL;
				4336	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4337	(PyObject *)substring);
				4338	if (substring == NULL)
				4339	return NULL;
				4340
				4341	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4342
				4343	Py_DECREF(substring);
				4344	return result;
				4345	}
				4346
				4347
				4348	static char endswith__doc__[] =
				4349	"S.endswith(suffix[, start[, end]]) -> int\n\
				4350	\n\
				4351	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4352	optional start, test S beginning at that position. With optional end, stop\n\
				4353	comparing S at that position.";
				4354
				4355	static PyObject *
				4356	unicode_endswith(PyUnicodeObject *self,
				4357	PyObject *args)
				4358	{
				4359	PyUnicodeObject *substring;
				4360	int start = 0;
				4361	int end = INT_MAX;
				4362	PyObject *result;
				4363
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4364	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4365	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4366	return NULL;
				4367	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4368	(PyObject *)substring);
				4369	if (substring == NULL)
				4370	return NULL;
				4371
				4372	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4373
				4374	Py_DECREF(substring);
				4375	return result;
				4376	}
				4377
				4378
				4379	static PyMethodDef unicode_methods[] = {
				4380
				4381	/* Order is according to common usage: often used methods should
				4382	appear first, since lookup is done sequentially. */
				4383
				4384	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4385	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4386	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4387	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4388	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4389	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4390	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4391	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4392	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4393	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4394	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4395	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4396	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4397	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4398	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4399	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4400	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4401	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4402	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4403	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4404	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4405	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4406	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4407	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4408	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4409	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4410	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4411	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4412	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4413	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4414	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4415	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4416	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4417	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4418	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4419	#if 0
				4420	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4421	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4422	#endif
				4423
				4424	#if 0
				4425	/* This one is just used for debugging the implementation. */
				4426	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4427	#endif
				4428
				4429	{NULL, NULL}
				4430	};
				4431
				4432	static PyObject *
				4433	unicode_getattr(PyUnicodeObject self, char name)
				4434	{
				4435	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4436	}
				4437
				4438	static PySequenceMethods unicode_as_sequence = {
				4439	(inquiry) unicode_length, /* sq_length */
				4440	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4441	(intargfunc) unicode_repeat, /* sq_repeat */
				4442	(intargfunc) unicode_getitem, /* sq_item */
				4443	(intintargfunc) unicode_slice, /* sq_slice */
				4444	0, /* sq_ass_item */
				4445	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4446	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4447	};
				4448
				4449	static int
				4450	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4451	int index,
				4452	const void **ptr)
				4453	{
				4454	if (index != 0) {
				4455	PyErr_SetString(PyExc_SystemError,
				4456	"accessing non-existent unicode segment");
				4457	return -1;
				4458	}
				4459	ptr = (void ) self->str;
				4460	return PyUnicode_GET_DATA_SIZE(self);
				4461	}
				4462
				4463	static int
				4464	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4465	const void **ptr)
				4466	{
				4467	PyErr_SetString(PyExc_TypeError,
				4468	"cannot use unicode as modifyable buffer");
				4469	return -1;
				4470	}
				4471
				4472	static int
				4473	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4474	int *lenp)
				4475	{
				4476	if (lenp)
				4477	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4478	return 1;
				4479	}
				4480
				4481	static int
				4482	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4483	int index,
				4484	const void **ptr)
				4485	{
				4486	PyObject *str;
				4487
				4488	if (index != 0) {
				4489	PyErr_SetString(PyExc_SystemError,
				4490	"accessing non-existent unicode segment");
				4491	return -1;
				4492	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4493	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4494	if (str == NULL)
				4495	return -1;
				4496	ptr = (void ) PyString_AS_STRING(str);
				4497	return PyString_GET_SIZE(str);
				4498	}
				4499
				4500	/* Helpers for PyUnicode_Format() */
				4501
				4502	static PyObject *
				4503	getnextarg(args, arglen, p_argidx)
				4504	PyObject *args;
				4505	int arglen;
				4506	int *p_argidx;
				4507	{
				4508	int argidx = *p_argidx;
				4509	if (argidx < arglen) {
				4510	(*p_argidx)++;
				4511	if (arglen < 0)
				4512	return args;
				4513	else
				4514	return PyTuple_GetItem(args, argidx);
				4515	}
				4516	PyErr_SetString(PyExc_TypeError,
				4517	"not enough arguments for format string");
				4518	return NULL;
				4519	}
				4520
				4521	#define F_LJUST (1<<0)
				4522	#define F_SIGN (1<<1)
				4523	#define F_BLANK (1<<2)
				4524	#define F_ALT (1<<3)
				4525	#define F_ZERO (1<<4)
				4526
				4527	static
				4528	#ifdef HAVE_STDARG_PROTOTYPES
				4529	int usprintf(register Py_UNICODE buffer, char format, ...)
				4530	#else
				4531	int usprintf(va_alist) va_dcl
				4532	#endif
				4533	{
				4534	register int i;
				4535	int len;
				4536	va_list va;
				4537	char *charbuffer;
				4538	#ifdef HAVE_STDARG_PROTOTYPES
				4539	va_start(va, format);
				4540	#else
				4541	Py_UNICODE *args;
				4542	char *format;
				4543
				4544	va_start(va);
				4545	buffer = va_arg(va, Py_UNICODE *);
				4546	format = va_arg(va, char *);
				4547	#endif
				4548
				4549	/* First, format the string as char array, then expand to Py_UNICODE
				4550	array. */
				4551	charbuffer = (char *)buffer;
				4552	len = vsprintf(charbuffer, format, va);
				4553	for (i = len - 1; i >= 0; i--)
				4554	buffer[i] = (Py_UNICODE) charbuffer[i];
				4555
				4556	va_end(va);
				4557	return len;
				4558	}
				4559
				4560	static int
				4561	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4562	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4563	int flags,
				4564	int prec,
				4565	int type,
				4566	PyObject *v)
				4567	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4568	/* fmt = '%#.' + `prec` + `type`
				4569	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4570	char fmt[20];
				4571	double x;
				4572
				4573	x = PyFloat_AsDouble(v);
				4574	if (x == -1.0 && PyErr_Occurred())
				4575	return -1;
				4576	if (prec < 0)
				4577	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4578	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4579	type = 'g';
				4580	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4581	/* worst case length calc to ensure no buffer overrun:
				4582	fmt = %#.<prec>g
				4583	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4584	for any double rep.)
				4585	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4586	If prec=0 the effective precision is 1 (the leading digit is
				4587	always given), therefore increase by one to 10+prec. */
				4588	if (buflen <= (size_t)10 + (size_t)prec) {
				4589	PyErr_SetString(PyExc_OverflowError,
				4590	"formatted float is too long (precision too long?)");
				4591	return -1;
				4592	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4593	return usprintf(buf, fmt, x);
				4594	}
				4595
				4596	static int
				4597	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4598	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4599	int flags,
				4600	int prec,
				4601	int type,
				4602	PyObject *v)
				4603	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4604	/* fmt = '%#.' + `prec` + 'l' + `type`
				4605	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4606	char fmt[20];
				4607	long x;
				4608
				4609	x = PyInt_AsLong(v);
				4610	if (x == -1 && PyErr_Occurred())
				4611	return -1;
				4612	if (prec < 0)
				4613	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4614	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4615	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4616	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4617	PyErr_SetString(PyExc_OverflowError,
				4618	"formatted integer is too long (precision too long?)");
				4619	return -1;
				4620	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4621	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4622	return usprintf(buf, fmt, x);
				4623	}
				4624
				4625	static int
				4626	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4627	size_t buflen,
				4628	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4629	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4630	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4631	if (PyUnicode_Check(v)) {
				4632	if (PyUnicode_GET_SIZE(v) != 1)
				4633	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4634	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4635	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4636
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4637	else if (PyString_Check(v)) {
				4638	if (PyString_GET_SIZE(v) != 1)
				4639	goto onError;
				4640	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4641	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4642
				4643	else {
				4644	/* Integer input truncated to a character */
				4645	long x;
				4646	x = PyInt_AsLong(v);
				4647	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4648	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4649	buf[0] = (char) x;
				4650	}
				4651	buf[1] = '\0';
				4652	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4653
				4654	onError:
				4655	PyErr_SetString(PyExc_TypeError,
				4656	"%c requires int or char");
				4657	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4658	}
				4659
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in characters, of the scratch buffer in
   which floats, ints and chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is parenthesized so that the macro behaves as a single
   primary expression wherever it is used (e.g. as a sizeof operand).
*/
#define FORMATBUFLEN ((size_t)120)
				4669
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4670	PyObject PyUnicode_Format(PyObject format,
				4671	PyObject *args)
				4672	{
				4673	Py_UNICODE fmt, res;
				4674	int fmtcnt, rescnt, reslen, arglen, argidx;
				4675	int args_owned = 0;
				4676	PyUnicodeObject *result = NULL;
				4677	PyObject *dict = NULL;
				4678	PyObject *uformat;
				4679
				4680	if (format == NULL \|\| args == NULL) {
				4681	PyErr_BadInternalCall();
				4682	return NULL;
				4683	}
				4684	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4685	if (uformat == NULL)
				4686	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4687	fmt = PyUnicode_AS_UNICODE(uformat);
				4688	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4689
				4690	reslen = rescnt = fmtcnt + 100;
				4691	result = _PyUnicode_New(reslen);
				4692	if (result == NULL)
				4693	goto onError;
				4694	res = PyUnicode_AS_UNICODE(result);
				4695
				4696	if (PyTuple_Check(args)) {
				4697	arglen = PyTuple_Size(args);
				4698	argidx = 0;
				4699	}
				4700	else {
				4701	arglen = -1;
				4702	argidx = -2;
				4703	}
				4704	if (args->ob_type->tp_as_mapping)
				4705	dict = args;
				4706
				4707	while (--fmtcnt >= 0) {
				4708	if (*fmt != '%') {
				4709	if (--rescnt < 0) {
				4710	rescnt = fmtcnt + 100;
				4711	reslen += rescnt;
				4712	if (_PyUnicode_Resize(result, reslen) < 0)
				4713	return NULL;
				4714	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4715	--rescnt;
				4716	}
				4717	res++ = fmt++;
				4718	}
				4719	else {
				4720	/* Got a format specifier */
				4721	int flags = 0;
				4722	int width = -1;
				4723	int prec = -1;
				4724	int size = 0;
				4725	Py_UNICODE c = '\0';
				4726	Py_UNICODE fill;
				4727	PyObject *v = NULL;
				4728	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4729	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4730	Py_UNICODE sign;
				4731	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4732	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733
				4734	fmt++;
				4735	if (*fmt == '(') {
				4736	Py_UNICODE *keystart;
				4737	int keylen;
				4738	PyObject *key;
				4739	int pcount = 1;
				4740
				4741	if (dict == NULL) {
				4742	PyErr_SetString(PyExc_TypeError,
				4743	"format requires a mapping");
				4744	goto onError;
				4745	}
				4746	++fmt;
				4747	--fmtcnt;
				4748	keystart = fmt;
				4749	/* Skip over balanced parentheses */
				4750	while (pcount > 0 && --fmtcnt >= 0) {
				4751	if (*fmt == ')')
				4752	--pcount;
				4753	else if (*fmt == '(')
				4754	++pcount;
				4755	fmt++;
				4756	}
				4757	keylen = fmt - keystart - 1;
				4758	if (fmtcnt < 0 \|\| pcount > 0) {
				4759	PyErr_SetString(PyExc_ValueError,
				4760	"incomplete format key");
				4761	goto onError;
				4762	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4763	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4764	then looked up since Python uses strings to hold
				4765	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4766	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4767	key = PyUnicode_EncodeUTF8(keystart,
				4768	keylen,
				4769	NULL);
				4770	if (key == NULL)
				4771	goto onError;
				4772	if (args_owned) {
				4773	Py_DECREF(args);
				4774	args_owned = 0;
				4775	}
				4776	args = PyObject_GetItem(dict, key);
				4777	Py_DECREF(key);
				4778	if (args == NULL) {
				4779	goto onError;
				4780	}
				4781	args_owned = 1;
				4782	arglen = -1;
				4783	argidx = -2;
				4784	}
				4785	while (--fmtcnt >= 0) {
				4786	switch (c = *fmt++) {
				4787	case '-': flags \|= F_LJUST; continue;
				4788	case '+': flags \|= F_SIGN; continue;
				4789	case ' ': flags \|= F_BLANK; continue;
				4790	case '#': flags \|= F_ALT; continue;
				4791	case '0': flags \|= F_ZERO; continue;
				4792	}
				4793	break;
				4794	}
				4795	if (c == '*') {
				4796	v = getnextarg(args, arglen, &argidx);
				4797	if (v == NULL)
				4798	goto onError;
				4799	if (!PyInt_Check(v)) {
				4800	PyErr_SetString(PyExc_TypeError,
				4801	"* wants int");
				4802	goto onError;
				4803	}
				4804	width = PyInt_AsLong(v);
				4805	if (width < 0) {
				4806	flags \|= F_LJUST;
				4807	width = -width;
				4808	}
				4809	if (--fmtcnt >= 0)
				4810	c = *fmt++;
				4811	}
				4812	else if (c >= '0' && c <= '9') {
				4813	width = c - '0';
				4814	while (--fmtcnt >= 0) {
				4815	c = *fmt++;
				4816	if (c < '0' \|\| c > '9')
				4817	break;
				4818	if ((width*10) / 10 != width) {
				4819	PyErr_SetString(PyExc_ValueError,
				4820	"width too big");
				4821	goto onError;
				4822	}
				4823	width = width*10 + (c - '0');
				4824	}
				4825	}
				4826	if (c == '.') {
				4827	prec = 0;
				4828	if (--fmtcnt >= 0)
				4829	c = *fmt++;
				4830	if (c == '*') {
				4831	v = getnextarg(args, arglen, &argidx);
				4832	if (v == NULL)
				4833	goto onError;
				4834	if (!PyInt_Check(v)) {
				4835	PyErr_SetString(PyExc_TypeError,
				4836	"* wants int");
				4837	goto onError;
				4838	}
				4839	prec = PyInt_AsLong(v);
				4840	if (prec < 0)
				4841	prec = 0;
				4842	if (--fmtcnt >= 0)
				4843	c = *fmt++;
				4844	}
				4845	else if (c >= '0' && c <= '9') {
				4846	prec = c - '0';
				4847	while (--fmtcnt >= 0) {
				4848	c = Py_CHARMASK(*fmt++);
				4849	if (c < '0' \|\| c > '9')
				4850	break;
				4851	if ((prec*10) / 10 != prec) {
				4852	PyErr_SetString(PyExc_ValueError,
				4853	"prec too big");
				4854	goto onError;
				4855	}
				4856	prec = prec*10 + (c - '0');
				4857	}
				4858	}
				4859	} /* prec */
				4860	if (fmtcnt >= 0) {
				4861	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4862	size = c;
				4863	if (--fmtcnt >= 0)
				4864	c = *fmt++;
				4865	}
				4866	}
				4867	if (fmtcnt < 0) {
				4868	PyErr_SetString(PyExc_ValueError,
				4869	"incomplete format");
				4870	goto onError;
				4871	}
				4872	if (c != '%') {
				4873	v = getnextarg(args, arglen, &argidx);
				4874	if (v == NULL)
				4875	goto onError;
				4876	}
				4877	sign = 0;
				4878	fill = ' ';
				4879	switch (c) {
				4880
				4881	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4882	pbuf = formatbuf;
				4883	/* presume that buffer length is at least 1 */
				4884	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4885	len = 1;
				4886	break;
				4887
				4888	case 's':
				4889	case 'r':
				4890	if (PyUnicode_Check(v) && c == 's') {
				4891	temp = v;
				4892	Py_INCREF(temp);
				4893	}
				4894	else {
				4895	PyObject *unicode;
				4896	if (c == 's')
				4897	temp = PyObject_Str(v);
				4898	else
				4899	temp = PyObject_Repr(v);
				4900	if (temp == NULL)
				4901	goto onError;
				4902	if (!PyString_Check(temp)) {
				4903	/* XXX Note: this should never happen, since
				4904	PyObject_Repr() and PyObject_Str() assure
				4905	this */
				4906	Py_DECREF(temp);
				4907	PyErr_SetString(PyExc_TypeError,
				4908	"%s argument has non-string str()");
				4909	goto onError;
				4910	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4911	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4912	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4913	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4914	"strict");
				4915	Py_DECREF(temp);
				4916	temp = unicode;
				4917	if (temp == NULL)
				4918	goto onError;
				4919	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4920	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4921	len = PyUnicode_GET_SIZE(temp);
				4922	if (prec >= 0 && len > prec)
				4923	len = prec;
				4924	break;
				4925
				4926	case 'i':
				4927	case 'd':
				4928	case 'u':
				4929	case 'o':
				4930	case 'x':
				4931	case 'X':
				4932	if (c == 'i')
				4933	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4934	pbuf = formatbuf;
				4935	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4936	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4937	if (len < 0)
				4938	goto onError;
				4939	sign = (c == 'd');
				4940	if (flags & F_ZERO) {
				4941	fill = '0';
				4942	if ((flags&F_ALT) &&
				4943	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4944	pbuf[0] == '0' && pbuf[1] == c) {
				4945	res++ = pbuf++;
				4946	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4947	rescnt -= 2;
				4948	len -= 2;
				4949	width -= 2;
				4950	if (width < 0)
				4951	width = 0;
				4952	}
				4953	}
				4954	break;
				4955
				4956	case 'e':
				4957	case 'E':
				4958	case 'f':
				4959	case 'g':
				4960	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4961	pbuf = formatbuf;
				4962	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4963	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4964	if (len < 0)
				4965	goto onError;
				4966	sign = 1;
				4967	if (flags&F_ZERO)
				4968	fill = '0';
				4969	break;
				4970
				4971	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4972	pbuf = formatbuf;
				4973	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4974	if (len < 0)
				4975	goto onError;
				4976	break;
				4977
				4978	default:
				4979	PyErr_Format(PyExc_ValueError,
				4980	"unsupported format character '%c' (0x%x)",
				4981	c, c);
				4982	goto onError;
				4983	}
				4984	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4985	if (pbuf == '-' \|\| pbuf == '+') {
				4986	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4987	len--;
				4988	}
				4989	else if (flags & F_SIGN)
				4990	sign = '+';
				4991	else if (flags & F_BLANK)
				4992	sign = ' ';
				4993	else
				4994	sign = 0;
				4995	}
				4996	if (width < len)
				4997	width = len;
				4998	if (rescnt < width + (sign != 0)) {
				4999	reslen -= rescnt;
				5000	rescnt = width + fmtcnt + 100;
				5001	reslen += rescnt;
				5002	if (_PyUnicode_Resize(result, reslen) < 0)
				5003	return NULL;
				5004	res = PyUnicode_AS_UNICODE(result)
				5005	+ reslen - rescnt;
				5006	}
				5007	if (sign) {
				5008	if (fill != ' ')
				5009	*res++ = sign;
				5010	rescnt--;
				5011	if (width > len)
				5012	width--;
				5013	}
				5014	if (width > len && !(flags & F_LJUST)) {
				5015	do {
				5016	--rescnt;
				5017	*res++ = fill;
				5018	} while (--width > len);
				5019	}
				5020	if (sign && fill == ' ')
				5021	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5022	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5023	res += len;
				5024	rescnt -= len;
				5025	while (--width >= len) {
				5026	--rescnt;
				5027	*res++ = ' ';
				5028	}
				5029	if (dict && (argidx < arglen) && c != '%') {
				5030	PyErr_SetString(PyExc_TypeError,
				5031	"not all arguments converted");
				5032	goto onError;
				5033	}
				5034	Py_XDECREF(temp);
				5035	} /* '%' */
				5036	} /* until end */
				5037	if (argidx < arglen && !dict) {
				5038	PyErr_SetString(PyExc_TypeError,
				5039	"not all arguments converted");
				5040	goto onError;
				5041	}
				5042
				5043	if (args_owned) {
				5044	Py_DECREF(args);
				5045	}
				5046	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5047	if (_PyUnicode_Resize(result, reslen - rescnt))
				5048	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5049	return (PyObject *)result;
				5050
				5051	onError:
				5052	Py_XDECREF(result);
				5053	Py_DECREF(uformat);
				5054	if (args_owned) {
				5055	Py_DECREF(args);
				5056	}
				5057	return NULL;
				5058	}
				5059
				5060	static PyBufferProcs unicode_as_buffer = {
				5061	(getreadbufferproc) unicode_buffer_getreadbuf,
				5062	(getwritebufferproc) unicode_buffer_getwritebuf,
				5063	(getsegcountproc) unicode_buffer_getsegcount,
				5064	(getcharbufferproc) unicode_buffer_getcharbuf,
				5065	};
				5066
				5067	PyTypeObject PyUnicode_Type = {
				5068	PyObject_HEAD_INIT(&PyType_Type)
				5069	0, /* ob_size */
				5070	"unicode", /* tp_name */
				5071	sizeof(PyUnicodeObject), /* tp_size */
				5072	0, /* tp_itemsize */
				5073	/* Slots */
				5074	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5075	0, /* tp_print */
				5076	(getattrfunc)unicode_getattr, /* tp_getattr */
				5077	0, /* tp_setattr */
				5078	(cmpfunc) unicode_compare, /* tp_compare */
				5079	(reprfunc) unicode_repr, /* tp_repr */
				5080	0, /* tp_as_number */
				5081	&unicode_as_sequence, /* tp_as_sequence */
				5082	0, /* tp_as_mapping */
				5083	(hashfunc) unicode_hash, /* tp_hash*/
				5084	0, /* tp_call*/
				5085	(reprfunc) unicode_str, /* tp_str */
				5086	(getattrofunc) NULL, /* tp_getattro */
				5087	(setattrofunc) NULL, /* tp_setattro */
				5088	&unicode_as_buffer, /* tp_as_buffer */
				5089	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5090	};
				5091
				5092	/* Initialize the Unicode implementation */
				5093
				5094	void _PyUnicode_Init()
				5095	{
				5096	/* Doublecheck the configuration... */
				5097	if (sizeof(Py_UNICODE) != 2)
				5098	Py_FatalError("Unicode configuration error: "
				5099	"sizeof(Py_UNICODE) != 2 bytes");
				5100
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5101	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5102	unicode_freelist = NULL;
				5103	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5104	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5105	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5106	}
				5107
				5108	/* Finalize the Unicode implementation */
				5109
				5110	void
				5111	_PyUnicode_Fini()
				5112	{
				5113	PyUnicodeObject *u = unicode_freelist;
				5114
				5115	while (u != NULL) {
				5116	PyUnicodeObject *v = u;
				5117	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5118	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5119	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5120	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5121	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5122	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5123	unicode_freelist = NULL;
				5124	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5125	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5126	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5127	}