Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 7737057614dcf2c913edf21d88382f2a030ea4d9 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	69	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	#if defined(HAVE_LIMITS_H)
				72	#include <limits.h>
				73	#else
				74	#define INT_MAX 2147483647
				75	#endif
				76
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	77	#ifdef MS_WIN32
				78	#include <windows.h>
				79	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	80
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	81	/* Limit for the Unicode object free list */
				82
				83	#define MAX_UNICODE_FREELIST_SIZE 1024
				84
				85	/* Limit for the Unicode object free list stay alive optimization.
				86
				87	The implementation will keep allocated Unicode memory intact for
				88	all objects on the free list having a size less than this
				89	limit. This reduces malloc() overhead for small Unicode objects.
				90
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	91	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	92	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	93	malloc()-overhead) bytes of unused garbage.
				94
				95	Setting the limit to 0 effectively turns the feature off.
				96
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	97	Note: This is an experimental feature ! If you get core dumps when
				98	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	99
				100	*/
				101
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	102	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103
				104	/* Endianness switches; defaults to little endian */
				105
				106	#ifdef WORDS_BIGENDIAN
				107	# define BYTEORDER_IS_BIG_ENDIAN
				108	#else
				109	# define BYTEORDER_IS_LITTLE_ENDIAN
				110	#endif
				111
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	112	/* --- Globals ------------------------------------------------------------
				113
				114	The globals are initialized by the _PyUnicode_Init() API and should
				115	not be used before calling that API.
				116
				117	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
				119	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	120	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	121
				122	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	123	static PyUnicodeObject *unicode_freelist;
				124	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	125
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	126	/* Default encoding to use and assume when NULL is passed as encoding
				127	parameter; it is initialized by _PyUnicode_Init().
				128
				129	Always use the PyUnicode_SetDefaultEncoding() and
				130	PyUnicode_GetDefaultEncoding() APIs to access this global.
				131
				132	*/
				133
				134	static char unicode_default_encoding[100];
				135
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	136	/* --- Unicode Object ----------------------------------------------------- */
				137
				138	static
				139	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				140	int length)
				141	{
				142	void *oldstr;
				143
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	144	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	145	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	146	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	147
				148	/* Resizing unicode_empty is not allowed. */
				149	if (unicode == unicode_empty) {
				150	PyErr_SetString(PyExc_SystemError,
				151	"can't resize empty unicode object");
				152	return -1;
				153	}
				154
				155	/* We allocate one more byte to make sure the string is
				156	Ux0000 terminated -- XXX is this needed ? */
				157	oldstr = unicode->str;
				158	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				159	if (!unicode->str) {
				160	unicode->str = oldstr;
				161	PyErr_NoMemory();
				162	return -1;
				163	}
				164	unicode->str[length] = 0;
				165	unicode->length = length;
				166
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	167	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	168	/* Reset the object caches */
				169	if (unicode->utf8str) {
				170	Py_DECREF(unicode->utf8str);
				171	unicode->utf8str = NULL;
				172	}
				173	unicode->hash = -1;
				174
				175	return 0;
				176	}
				177
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	178	int PyUnicode_Resize(PyObject **unicode,
				179	int length)
				180	{
				181	PyUnicodeObject *v;
				182
				183	if (unicode == NULL) {
				184	PyErr_BadInternalCall();
				185	return -1;
				186	}
				187	v = (PyUnicodeObject )unicode;
				188	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				189	PyErr_BadInternalCall();
				190	return -1;
				191	}
				192	return _PyUnicode_Resize(v, length);
				193	}
				194
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	195	/* We allocate one more byte to make sure the string is
				196	Ux0000 terminated -- XXX is this needed ?
				197
				198	XXX This allocator could further be enhanced by assuring that the
				199	free list never reduces its size below 1.
				200
				201	*/
				202
				203	static
				204	PyUnicodeObject *_PyUnicode_New(int length)
				205	{
				206	register PyUnicodeObject *unicode;
				207
				208	/* Optimization for empty strings */
				209	if (length == 0 && unicode_empty != NULL) {
				210	Py_INCREF(unicode_empty);
				211	return unicode_empty;
				212	}
				213
				214	/* Unicode freelist & memory allocation */
				215	if (unicode_freelist) {
				216	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	217	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	220	/* Keep-Alive optimization: we only upsize the buffer,
				221	never downsize it. */
				222	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	223	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	224	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	225	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	}
				227	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	228	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	230	}
				231	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	232	}
				233	else {
				234	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				235	if (unicode == NULL)
				236	return NULL;
				237	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				238	}
				239
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	240	if (!unicode->str) {
				241	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	243	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	unicode->str[length] = 0;
				245	unicode->length = length;
				246	unicode->hash = -1;
				247	unicode->utf8str = NULL;
				248	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	249
				250	onError:
				251	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	252	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	253	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	254	}
				255
				256	static
				257	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				258	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	259	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	260	/* Keep-Alive optimization */
				261	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	262	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	263	unicode->str = NULL;
				264	unicode->length = 0;
				265	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	266	if (unicode->utf8str) {
				267	Py_DECREF(unicode->utf8str);
				268	unicode->utf8str = NULL;
				269	}
				270	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	271	(PyUnicodeObject *)unicode = unicode_freelist;
				272	unicode_freelist = unicode;
				273	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	274	}
				275	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	276	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	277	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	278	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	279	}
				280	}
				281
				282	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				283	int size)
				284	{
				285	PyUnicodeObject *unicode;
				286
				287	unicode = _PyUnicode_New(size);
				288	if (!unicode)
				289	return NULL;
				290
				291	/* Copy the Unicode data into the new object */
				292	if (u != NULL)
				293	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				294
				295	return (PyObject *)unicode;
				296	}
				297
				298	#ifdef HAVE_WCHAR_H
				299
				300	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				301	int size)
				302	{
				303	PyUnicodeObject *unicode;
				304
				305	if (w == NULL) {
				306	PyErr_BadInternalCall();
				307	return NULL;
				308	}
				309
				310	unicode = _PyUnicode_New(size);
				311	if (!unicode)
				312	return NULL;
				313
				314	/* Copy the wchar_t data into the new object */
				315	#ifdef HAVE_USABLE_WCHAR_T
				316	memcpy(unicode->str, w, size * sizeof(wchar_t));
				317	#else
				318	{
				319	register Py_UNICODE *u;
				320	register int i;
				321	u = PyUnicode_AS_UNICODE(unicode);
				322	for (i = size; i >= 0; i--)
				323	u++ = w++;
				324	}
				325	#endif
				326
				327	return (PyObject *)unicode;
				328	}
				329
				330	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				331	register wchar_t *w,
				332	int size)
				333	{
				334	if (unicode == NULL) {
				335	PyErr_BadInternalCall();
				336	return -1;
				337	}
				338	if (size > PyUnicode_GET_SIZE(unicode))
				339	size = PyUnicode_GET_SIZE(unicode);
				340	#ifdef HAVE_USABLE_WCHAR_T
				341	memcpy(w, unicode->str, size * sizeof(wchar_t));
				342	#else
				343	{
				344	register Py_UNICODE *u;
				345	register int i;
				346	u = PyUnicode_AS_UNICODE(unicode);
				347	for (i = size; i >= 0; i--)
				348	w++ = u++;
				349	}
				350	#endif
				351
				352	return size;
				353	}
				354
				355	#endif
				356
				357	PyObject PyUnicode_FromObject(register PyObject obj)
				358	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	359	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				360	}
				361
				362	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				363	const char *encoding,
				364	const char *errors)
				365	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	366	const char *s;
				367	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368	int owned = 0;
				369	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	370
				371	if (obj == NULL) {
				372	PyErr_BadInternalCall();
				373	return NULL;
				374	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	375
				376	/* Coerce object */
				377	if (PyInstance_Check(obj)) {
				378	PyObject *func;
				379	func = PyObject_GetAttrString(obj, "__str__");
				380	if (func == NULL) {
				381	PyErr_SetString(PyExc_TypeError,
				382	"coercing to Unicode: instance doesn't define __str__");
				383	return NULL;
				384	}
				385	obj = PyEval_CallObject(func, NULL);
				386	Py_DECREF(func);
				387	if (obj == NULL)
				388	return NULL;
				389	owned = 1;
				390	}
				391	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	392	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	v = obj;
				394	if (encoding) {
				395	PyErr_SetString(PyExc_TypeError,
				396	"decoding Unicode is not supported");
				397	return NULL;
				398	}
				399	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	400	}
				401	else if (PyString_Check(obj)) {
				402	s = PyString_AS_STRING(obj);
				403	len = PyString_GET_SIZE(obj);
				404	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	405	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				406	/* Overwrite the error message with something more useful in
				407	case of a TypeError. */
				408	if (PyErr_ExceptionMatches(PyExc_TypeError))
				409	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410	"coercing to Unicode: need string or buffer");
				411	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	412	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	413
				414	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	415	if (len == 0) {
				416	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	417	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	418	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	419	else
				420	v = PyUnicode_Decode(s, len, encoding, errors);
				421	done:
				422	if (owned)
				423	Py_DECREF(obj);
				424	return v;
				425
				426	onError:
				427	if (owned)
				428	Py_DECREF(obj);
				429	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	430	}
				431
				432	PyObject PyUnicode_Decode(const char s,
				433	int size,
				434	const char *encoding,
				435	const char *errors)
				436	{
				437	PyObject buffer = NULL, unicode;
				438
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	439	if (encoding == NULL)
				440	encoding = PyUnicode_GetDefaultEncoding();
				441
				442	/* Shortcuts for common default encodings */
				443	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	444	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	445	else if (strcmp(encoding, "latin-1") == 0)
				446	return PyUnicode_DecodeLatin1(s, size, errors);
				447	else if (strcmp(encoding, "ascii") == 0)
				448	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	449
				450	/* Decode via the codec registry */
				451	buffer = PyBuffer_FromMemory((void *)s, size);
				452	if (buffer == NULL)
				453	goto onError;
				454	unicode = PyCodec_Decode(buffer, encoding, errors);
				455	if (unicode == NULL)
				456	goto onError;
				457	if (!PyUnicode_Check(unicode)) {
				458	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	459	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	460	unicode->ob_type->tp_name);
				461	Py_DECREF(unicode);
				462	goto onError;
				463	}
				464	Py_DECREF(buffer);
				465	return unicode;
				466
				467	onError:
				468	Py_XDECREF(buffer);
				469	return NULL;
				470	}
				471
				472	PyObject PyUnicode_Encode(const Py_UNICODE s,
				473	int size,
				474	const char *encoding,
				475	const char *errors)
				476	{
				477	PyObject v, unicode;
				478
				479	unicode = PyUnicode_FromUnicode(s, size);
				480	if (unicode == NULL)
				481	return NULL;
				482	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				483	Py_DECREF(unicode);
				484	return v;
				485	}
				486
				487	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				488	const char *encoding,
				489	const char *errors)
				490	{
				491	PyObject *v;
				492
				493	if (!PyUnicode_Check(unicode)) {
				494	PyErr_BadArgument();
				495	goto onError;
				496	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	497
				498	if (encoding == NULL)
				499	encoding = PyUnicode_GetDefaultEncoding();
				500
				501	/* Shortcuts for common default encodings */
				502	if (errors == NULL) {
				503	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	504	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	505	else if (strcmp(encoding, "latin-1") == 0)
				506	return PyUnicode_AsLatin1String(unicode);
				507	else if (strcmp(encoding, "ascii") == 0)
				508	return PyUnicode_AsASCIIString(unicode);
				509	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	510
				511	/* Encode via the codec registry */
				512	v = PyCodec_Encode(unicode, encoding, errors);
				513	if (v == NULL)
				514	goto onError;
				515	/* XXX Should we really enforce this ? */
				516	if (!PyString_Check(v)) {
				517	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	518	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	519	v->ob_type->tp_name);
				520	Py_DECREF(v);
				521	goto onError;
				522	}
				523	return v;
				524
				525	onError:
				526	return NULL;
				527	}
				528
				529	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				530	{
				531	if (!PyUnicode_Check(unicode)) {
				532	PyErr_BadArgument();
				533	goto onError;
				534	}
				535	return PyUnicode_AS_UNICODE(unicode);
				536
				537	onError:
				538	return NULL;
				539	}
				540
				541	int PyUnicode_GetSize(PyObject *unicode)
				542	{
				543	if (!PyUnicode_Check(unicode)) {
				544	PyErr_BadArgument();
				545	goto onError;
				546	}
				547	return PyUnicode_GET_SIZE(unicode);
				548
				549	onError:
				550	return -1;
				551	}
				552
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	const char *PyUnicode_GetDefaultEncoding()
				554	{
				555	return unicode_default_encoding;
				556	}
				557
				558	int PyUnicode_SetDefaultEncoding(const char *encoding)
				559	{
				560	PyObject *v;
				561
				562	/* Make sure the encoding is valid. As side effect, this also
				563	loads the encoding into the codec registry cache. */
				564	v = _PyCodec_Lookup(encoding);
				565	if (v == NULL)
				566	goto onError;
				567	Py_DECREF(v);
				568	strncpy(unicode_default_encoding,
				569	encoding,
				570	sizeof(unicode_default_encoding));
				571	return 0;
				572
				573	onError:
				574	return -1;
				575	}
				576
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	577	/* --- UTF-8 Codec -------------------------------------------------------- */
				578
				579	static
				580	char utf8_code_length[256] = {
				581	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				582	illegal prefix. see RFC 2279 for details */
				583	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				584	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				585	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				586	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				591	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				592	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				593	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				594	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				595	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				596	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				597	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				598	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				599	};
				600
				601	static
				602	int utf8_decoding_error(const char **source,
				603	Py_UNICODE **dest,
				604	const char *errors,
				605	const char *details)
				606	{
				607	if ((errors == NULL) \|\|
				608	(strcmp(errors,"strict") == 0)) {
				609	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	610	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	611	details);
				612	return -1;
				613	}
				614	else if (strcmp(errors,"ignore") == 0) {
				615	(*source)++;
				616	return 0;
				617	}
				618	else if (strcmp(errors,"replace") == 0) {
				619	(*source)++;
				620	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				621	(*dest)++;
				622	return 0;
				623	}
				624	else {
				625	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	626	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	627	errors);
				628	return -1;
				629	}
				630	}
				631
				632	#define UTF8_ERROR(details) do { \
				633	if (utf8_decoding_error(&s, &p, errors, details)) \
				634	goto onError; \
				635	continue; \
				636	} while (0)
				637
				638	PyObject PyUnicode_DecodeUTF8(const char s,
				639	int size,
				640	const char *errors)
				641	{
				642	int n;
				643	const char *e;
				644	PyUnicodeObject *unicode;
				645	Py_UNICODE *p;
				646
				647	/* Note: size will always be longer than the resulting Unicode
				648	character count */
				649	unicode = _PyUnicode_New(size);
				650	if (!unicode)
				651	return NULL;
				652	if (size == 0)
				653	return (PyObject *)unicode;
				654
				655	/* Unpack UTF-8 encoded data */
				656	p = unicode->str;
				657	e = s + size;
				658
				659	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	660	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	661
				662	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	663	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	664	s++;
				665	continue;
				666	}
				667
				668	n = utf8_code_length[ch];
				669
				670	if (s + n > e)
				671	UTF8_ERROR("unexpected end of data");
				672
				673	switch (n) {
				674
				675	case 0:
				676	UTF8_ERROR("unexpected code byte");
				677	break;
				678
				679	case 1:
				680	UTF8_ERROR("internal error");
				681	break;
				682
				683	case 2:
				684	if ((s[1] & 0xc0) != 0x80)
				685	UTF8_ERROR("invalid data");
				686	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				687	if (ch < 0x80)
				688	UTF8_ERROR("illegal encoding");
				689	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	690	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	break;
				692
				693	case 3:
				694	if ((s[1] & 0xc0) != 0x80 \|\|
				695	(s[2] & 0xc0) != 0x80)
				696	UTF8_ERROR("invalid data");
				697	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				698	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				699	UTF8_ERROR("illegal encoding");
				700	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	701	*p++ = (Py_UNICODE)ch;
				702	break;
				703
				704	case 4:
				705	if ((s[1] & 0xc0) != 0x80 \|\|
				706	(s[2] & 0xc0) != 0x80 \|\|
				707	(s[3] & 0xc0) != 0x80)
				708	UTF8_ERROR("invalid data");
				709	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				710	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				711	/* validate and convert to UTF-16 */
				712	if ((ch < 0x10000) \|\| /* minimum value allowed for 4 byte encoding */
				713	(ch > 0x10ffff)) /* maximum value allowed for UTF-16 */
				714	UTF8_ERROR("illegal encoding");
				715	/* compute and append the two surrogates: */
				716
				717	/* translate from 10000..10FFFF to 0..FFFF */
				718	ch -= 0x10000;
				719
				720	/* high surrogate = top 10 bits added to D800 */
				721	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				722
				723	/* low surrogate = bottom 10 bits added to DC00 */
				724	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	725	break;
				726
				727	default:
				728	/* Other sizes are only needed for UCS-4 */
				729	UTF8_ERROR("unsupported Unicode code range");
				730	}
				731	s += n;
				732	}
				733
				734	/* Adjust length */
				735	if (_PyUnicode_Resize(unicode, p - unicode->str))
				736	goto onError;
				737
				738	return (PyObject *)unicode;
				739
				740	onError:
				741	Py_DECREF(unicode);
				742	return NULL;
				743	}
				744
				745	#undef UTF8_ERROR
				746
				747	static
				748	int utf8_encoding_error(const Py_UNICODE **source,
				749	char **dest,
				750	const char *errors,
				751	const char *details)
				752	{
				753	if ((errors == NULL) \|\|
				754	(strcmp(errors,"strict") == 0)) {
				755	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	756	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	757	details);
				758	return -1;
				759	}
				760	else if (strcmp(errors,"ignore") == 0) {
				761	return 0;
				762	}
				763	else if (strcmp(errors,"replace") == 0) {
				764	**dest = '?';
				765	(*dest)++;
				766	return 0;
				767	}
				768	else {
				769	PyErr_Format(PyExc_ValueError,
				770	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	771	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	772	errors);
				773	return -1;
				774	}
				775	}
				776
				777	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				778	int size,
				779	const char *errors)
				780	{
				781	PyObject *v;
				782	char *p;
				783	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	784	Py_UCS4 ch2;
				785	unsigned int cbAllocated = 3 * size;
				786	unsigned int cbWritten = 0;
				787	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	788
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	789	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	790	if (v == NULL)
				791	return NULL;
				792	if (size == 0)
				793	goto done;
				794
				795	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	796	while (i < size) {
				797	Py_UCS4 ch = s[i++];
				798	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	799	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	800	cbWritten++;
				801	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	802	else if (ch < 0x0800) {
				803	*p++ = 0xc0 \| (ch >> 6);
				804	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	805	cbWritten += 2;
				806	}
				807	else {
				808	/* Check for high surrogate */
				809	if (0xD800 <= ch && ch <= 0xDBFF) {
				810	if (i != size) {
				811	ch2 = s[i];
				812	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				813
				814	if (cbWritten >= (cbAllocated - 4)) {
				815	/* Provide enough room for some more
				816	surrogates */
				817	cbAllocated += 4*10;
				818	if (_PyString_Resize(&v, cbAllocated))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	819	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	820	}
				821
				822	/* combine the two values */
				823	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				824
				825	*p++ = (char)((ch >> 18) \| 0xf0);
				826	*p++ = (char)(0x80 \| (ch >> 12) & 0x3f);
				827	i++;
				828	cbWritten += 4;
				829	}
				830	}
				831	}
				832	else {
				833	*p++ = (char)(0xe0 \| (ch >> 12));
				834	cbWritten += 3;
				835	}
				836	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				837	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	838	}
				839	}
				840	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	841	if (_PyString_Resize(&v, p - q))
				842	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	843
				844	done:
				845	return v;
				846
				847	onError:
				848	Py_DECREF(v);
				849	return NULL;
				850	}
				851
				852	/* Return a Python string holding the UTF-8 encoded value of the
				853	Unicode object.
				854
				855	The resulting string is cached in the Unicode object for subsequent
				856	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	857	the character buffer interface and will live (at least) as long as
				858	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	859
				860	The refcount of the string is not incremented.
				861
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	862	* Exported for internal use by the interpreter only !!! *
				863
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	864	*/
				865
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	866	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	867	const char *errors)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	868	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	869	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	870
				871	if (v)
				872	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	873	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				874	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	875	errors);
				876	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	877	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	878	return v;
				879	}
				880
				881	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				882	{
				883	PyObject *str;
				884
				885	if (!PyUnicode_Check(unicode)) {
				886	PyErr_BadArgument();
				887	return NULL;
				888	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	889	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	if (str == NULL)
				891	return NULL;
				892	Py_INCREF(str);
				893	return str;
				894	}
				895
				896	/* --- UTF-16 Codec ------------------------------------------------------- */
				897
				898	static
				899	int utf16_decoding_error(const Py_UNICODE **source,
				900	Py_UNICODE **dest,
				901	const char *errors,
				902	const char *details)
				903	{
				904	if ((errors == NULL) \|\|
				905	(strcmp(errors,"strict") == 0)) {
				906	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	907	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	908	details);
				909	return -1;
				910	}
				911	else if (strcmp(errors,"ignore") == 0) {
				912	return 0;
				913	}
				914	else if (strcmp(errors,"replace") == 0) {
				915	if (dest) {
				916	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				917	(*dest)++;
				918	}
				919	return 0;
				920	}
				921	else {
				922	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	923	"UTF-16 decoding error; "
				924	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	925	errors);
				926	return -1;
				927	}
				928	}
				929
				930	#define UTF16_ERROR(details) do { \
				931	if (utf16_decoding_error(&q, &p, errors, details)) \
				932	goto onError; \
				933	continue; \
				934	} while(0)
				935
				936	PyObject PyUnicode_DecodeUTF16(const char s,
				937	int size,
				938	const char *errors,
				939	int *byteorder)
				940	{
				941	PyUnicodeObject *unicode;
				942	Py_UNICODE *p;
				943	const Py_UNICODE q, e;
				944	int bo = 0;
				945
				946	/* size should be an even number */
				947	if (size % sizeof(Py_UNICODE) != 0) {
				948	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				949	return NULL;
				950	/* The remaining input chars are ignored if we fall through
				951	here... */
				952	}
				953
				954	/* Note: size will always be longer than the resulting Unicode
				955	character count */
				956	unicode = _PyUnicode_New(size);
				957	if (!unicode)
				958	return NULL;
				959	if (size == 0)
				960	return (PyObject *)unicode;
				961
				962	/* Unpack UTF-16 encoded data */
				963	p = unicode->str;
				964	q = (Py_UNICODE *)s;
				965	e = q + (size / sizeof(Py_UNICODE));
				966
				967	if (byteorder)
				968	bo = *byteorder;
				969
				970	while (q < e) {
				971	register Py_UNICODE ch = *q++;
				972
				973	/* Check for BOM marks (U+FEFF) in the input and adjust
				974	current byte order setting accordingly. Swap input
				975	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				976	!) */
				977	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				978	if (ch == 0xFEFF) {
				979	bo = -1;
				980	continue;
				981	} else if (ch == 0xFFFE) {
				982	bo = 1;
				983	continue;
				984	}
				985	if (bo == 1)
				986	ch = (ch >> 8) \| (ch << 8);
				987	#else
				988	if (ch == 0xFEFF) {
				989	bo = 1;
				990	continue;
				991	} else if (ch == 0xFFFE) {
				992	bo = -1;
				993	continue;
				994	}
				995	if (bo == -1)
				996	ch = (ch >> 8) \| (ch << 8);
				997	#endif
				998	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				999	*p++ = ch;
				1000	continue;
				1001	}
				1002
				1003	/* UTF-16 code pair: */
				1004	if (q >= e)
				1005	UTF16_ERROR("unexpected end of data");
				1006	if (0xDC00 <= q && q <= 0xDFFF) {
				1007	q++;
				1008	if (0xD800 <= q && q <= 0xDBFF)
				1009	/* This is valid data (a UTF-16 surrogate pair), but
				1010	we are not able to store this information since our
				1011	Py_UNICODE type only has 16 bits... this might
				1012	change someday, even though it's unlikely. */
				1013	UTF16_ERROR("code pairs are not supported");
				1014	else
				1015	continue;
				1016	}
				1017	UTF16_ERROR("illegal encoding");
				1018	}
				1019
				1020	if (byteorder)
				1021	*byteorder = bo;
				1022
				1023	/* Adjust length */
				1024	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1025	goto onError;
				1026
				1027	return (PyObject *)unicode;
				1028
				1029	onError:
				1030	Py_DECREF(unicode);
				1031	return NULL;
				1032	}
				1033
				1034	#undef UTF16_ERROR
				1035
				1036	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1037	int size,
				1038	const char *errors,
				1039	int byteorder)
				1040	{
				1041	PyObject *v;
				1042	Py_UNICODE *p;
				1043	char *q;
				1044
				1045	/* We don't create UTF-16 pairs... */
				1046	v = PyString_FromStringAndSize(NULL,
				1047	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1048	if (v == NULL)
				1049	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1050
				1051	q = PyString_AS_STRING(v);
				1052	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053	if (byteorder == 0)
				1054	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1055	if (size == 0)
				1056	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	if (byteorder == 0 \|\|
				1058	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1059	byteorder == -1
				1060	#else
				1061	byteorder == 1
				1062	#endif
				1063	)
				1064	memcpy(p, s, size * sizeof(Py_UNICODE));
				1065	else
				1066	while (size-- > 0) {
				1067	Py_UNICODE ch = *s++;
				1068	*p++ = (ch >> 8) \| (ch << 8);
				1069	}
				1070	done:
				1071	return v;
				1072	}
				1073
				1074	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1075	{
				1076	if (!PyUnicode_Check(unicode)) {
				1077	PyErr_BadArgument();
				1078	return NULL;
				1079	}
				1080	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1081	PyUnicode_GET_SIZE(unicode),
				1082	NULL,
				1083	0);
				1084	}
				1085
				1086	/* --- Unicode Escape Codec ----------------------------------------------- */
				1087
				1088	static
				1089	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1090	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1091	const char *errors,
				1092	const char *details)
				1093	{
				1094	if ((errors == NULL) \|\|
				1095	(strcmp(errors,"strict") == 0)) {
				1096	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1097	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1098	details);
				1099	return -1;
				1100	}
				1101	else if (strcmp(errors,"ignore") == 0) {
				1102	return 0;
				1103	}
				1104	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1105	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1106	return 0;
				1107	}
				1108	else {
				1109	PyErr_Format(PyExc_ValueError,
				1110	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1111	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1112	errors);
				1113	return -1;
				1114	}
				1115	}
				1116
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1117	static _Py_UCNHashAPI *pucnHash = NULL;
				1118
				1119	static
				1120	int mystrnicmp(const char s1, const char s2, size_t count)
				1121	{
				1122	char c1, c2;
				1123
				1124	if (count)
				1125	{
				1126	do
				1127	{
				1128	c1 = tolower(*(s1++));
				1129	c2 = tolower(*(s2++));
				1130	}
				1131	while(--count && c1 == c2);
				1132
				1133	return c1 - c2;
				1134	}
				1135
				1136	return 0;
				1137	}
				1138
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1139	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1140	int size,
				1141	const char *errors)
				1142	{
				1143	PyUnicodeObject *v;
				1144	Py_UNICODE p = NULL, buf = NULL;
				1145	const char *end;
				1146
				1147	/* Escaped strings will always be longer than the resulting
				1148	Unicode string, so we start with size here and then reduce the
				1149	length after conversion to the true value. */
				1150	v = _PyUnicode_New(size);
				1151	if (v == NULL)
				1152	goto onError;
				1153	if (size == 0)
				1154	return (PyObject *)v;
				1155	p = buf = PyUnicode_AS_UNICODE(v);
				1156	end = s + size;
				1157	while (s < end) {
				1158	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1159	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1160	int i;
				1161
				1162	/* Non-escape characters are interpreted as Unicode ordinals */
				1163	if (*s != '\\') {
				1164	p++ = (unsigned char)s++;
				1165	continue;
				1166	}
				1167
				1168	/* \ - Escapes */
				1169	s++;
				1170	switch (*s++) {
				1171
				1172	/* \x escapes */
				1173	case '\n': break;
				1174	case '\\': *p++ = '\\'; break;
				1175	case '\'': *p++ = '\''; break;
				1176	case '\"': *p++ = '\"'; break;
				1177	case 'b': *p++ = '\b'; break;
				1178	case 'f': p++ = '\014'; break; / FF */
				1179	case 't': *p++ = '\t'; break;
				1180	case 'n': *p++ = '\n'; break;
				1181	case 'r': *p++ = '\r'; break;
				1182	case 'v': p++ = '\013'; break; / VT */
				1183	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1184
				1185	/* \OOO (octal) escapes */
				1186	case '0': case '1': case '2': case '3':
				1187	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1188	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1189	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1190	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1191	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1192	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1193	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1194	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1195	break;
				1196
				1197	/* \xXXXX escape with 0-4 hex digits */
				1198	case 'x':
				1199	x = 0;
				1200	c = (unsigned char)*s;
				1201	if (isxdigit(c)) {
				1202	do {
				1203	x = (x<<4) & ~0xF;
				1204	if ('0' <= c && c <= '9')
				1205	x += c - '0';
				1206	else if ('a' <= c && c <= 'f')
				1207	x += 10 + c - 'a';
				1208	else
				1209	x += 10 + c - 'A';
				1210	c = (unsigned char)*++s;
				1211	} while (isxdigit(c));
				1212	*p++ = x;
				1213	} else {
				1214	*p++ = '\\';
				1215	*p++ = (unsigned char)s[-1];
				1216	}
				1217	break;
				1218
				1219	/* \uXXXX with 4 hex digits */
				1220	case 'u':
				1221	for (x = 0, i = 0; i < 4; i++) {
				1222	c = (unsigned char)s[i];
				1223	if (!isxdigit(c)) {
				1224	if (unicodeescape_decoding_error(&s, &x, errors,
				1225	"truncated \\uXXXX"))
				1226	goto onError;
				1227	i++;
				1228	break;
				1229	}
				1230	x = (x<<4) & ~0xF;
				1231	if (c >= '0' && c <= '9')
				1232	x += c - '0';
				1233	else if (c >= 'a' && c <= 'f')
				1234	x += 10 + c - 'a';
				1235	else
				1236	x += 10 + c - 'A';
				1237	}
				1238	s += i;
				1239	*p++ = x;
				1240	break;
				1241
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1242	case 'N':
				1243	/* Ok, we need to deal with Unicode Character Names now,
				1244	* make sure we've imported the hash table data...
				1245	*/
				1246	if (pucnHash == NULL)
				1247	{
				1248	PyObject mod = 0, v = 0;
				1249
				1250	mod = PyImport_ImportModule("ucnhash");
				1251	if (mod == NULL)
				1252	goto onError;
				1253	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1254	Py_DECREF(mod);
				1255	if (v == NULL)
				1256	{
				1257	goto onError;
				1258	}
				1259	pucnHash = PyCObject_AsVoidPtr(v);
				1260	Py_DECREF(v);
				1261	if (pucnHash == NULL)
				1262	{
				1263	goto onError;
				1264	}
				1265	}
				1266
				1267	if (*s == '{')
				1268	{
				1269	const char *start = s + 1;
				1270	const char *endBrace = start;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1271	Py_UCS4 value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1272	unsigned long j;
				1273
				1274	/* look for either the closing brace, or we
				1275	* exceed the maximum length of the unicode character names
				1276	*/
				1277	while (*endBrace != '}' &&
				1278	(unsigned int)(endBrace - start) <=
				1279	pucnHash->cchMax &&
				1280	endBrace < end)
				1281	{
				1282	endBrace++;
				1283	}
				1284	if (endBrace != end && *endBrace == '}')
				1285	{
				1286	j = pucnHash->hash(start, endBrace - start);
				1287	if (j > pucnHash->cKeys \|\|
				1288	mystrnicmp(
				1289	start,
				1290	((_Py_UnicodeCharacterName *)
				1291	(pucnHash->getValue(j)))->pszUCN,
				1292	(int)(endBrace - start)) != 0)
				1293	{
				1294	if (unicodeescape_decoding_error(
				1295	&s, &x, errors,
				1296	"Invalid Unicode Character Name"))
				1297	{
				1298	goto onError;
				1299	}
				1300	goto ucnFallthrough;
				1301	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1302	value = ((_Py_UnicodeCharacterName *)
				1303	(pucnHash->getValue(j)))->value;
				1304	if (value < 1<<16)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1305	{
				1306	/* In UCS-2 range, easy solution.. */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1307	*p++ = value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1308	}
				1309	else
				1310	{
				1311	/* Oops, its in UCS-4 space, */
				1312	/* compute and append the two surrogates: */
				1313	/* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1314	value -= 0x10000;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1315
				1316	/* high surrogate = top 10 bits added to D800 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1317	*p++ = 0xD800 + (value >> 10);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1318
				1319	/* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1320	*p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1321	}
				1322	s = endBrace + 1;
				1323	}
				1324	else
				1325	{
				1326	if (unicodeescape_decoding_error(
				1327	&s, &x, errors,
				1328	"Unicode name missing closing brace"))
				1329	goto onError;
				1330	goto ucnFallthrough;
				1331	}
				1332	break;
				1333	}
				1334	if (unicodeescape_decoding_error(
				1335	&s, &x, errors,
				1336	"Missing opening brace for Unicode Character Name escape"))
				1337	goto onError;
				1338	ucnFallthrough:
				1339	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1340	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1341	*p++ = '\\';
				1342	*p++ = (unsigned char)s[-1];
				1343	break;
				1344	}
				1345	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1346	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1347	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1348	return (PyObject *)v;
				1349
				1350	onError:
				1351	Py_XDECREF(v);
				1352	return NULL;
				1353	}
				1354
				1355	/* Return a Unicode-Escape string version of the Unicode object.
				1356
				1357	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1358	appropriate.
				1359
				1360	*/
				1361
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1362	static const Py_UNICODE findchar(const Py_UNICODE s,
				1363	int size,
				1364	Py_UNICODE ch);
				1365
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1366	static
				1367	PyObject unicodeescape_string(const Py_UNICODE s,
				1368	int size,
				1369	int quotes)
				1370	{
				1371	PyObject *repr;
				1372	char *p;
				1373	char *q;
				1374
				1375	static const char *hexdigit = "0123456789ABCDEF";
				1376
				1377	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1378	if (repr == NULL)
				1379	return NULL;
				1380
				1381	p = q = PyString_AS_STRING(repr);
				1382
				1383	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1384	*p++ = 'u';
				1385	*p++ = (findchar(s, size, '\'') &&
				1386	!findchar(s, size, '"')) ? '"' : '\'';
				1387	}
				1388	while (size-- > 0) {
				1389	Py_UNICODE ch = *s++;
				1390	/* Escape quotes */
				1391	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1392	*p++ = '\\';
				1393	*p++ = (char) ch;
				1394	}
				1395	/* Map 16-bit characters to '\uxxxx' */
				1396	else if (ch >= 256) {
				1397	*p++ = '\\';
				1398	*p++ = 'u';
				1399	*p++ = hexdigit[(ch >> 12) & 0xf];
				1400	*p++ = hexdigit[(ch >> 8) & 0xf];
				1401	*p++ = hexdigit[(ch >> 4) & 0xf];
				1402	*p++ = hexdigit[ch & 15];
				1403	}
				1404	/* Map non-printable US ASCII to '\ooo' */
				1405	else if (ch < ' ' \|\| ch >= 128) {
				1406	*p++ = '\\';
				1407	*p++ = hexdigit[(ch >> 6) & 7];
				1408	*p++ = hexdigit[(ch >> 3) & 7];
				1409	*p++ = hexdigit[ch & 7];
				1410	}
				1411	/* Copy everything else as-is */
				1412	else
				1413	*p++ = (char) ch;
				1414	}
				1415	if (quotes)
				1416	*p++ = q[1];
				1417
				1418	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1419	if (_PyString_Resize(&repr, p - q))
				1420	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1421
				1422	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1423
				1424	onError:
				1425	Py_DECREF(repr);
				1426	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1427	}
				1428
				1429	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1430	int size)
				1431	{
				1432	return unicodeescape_string(s, size, 0);
				1433	}
				1434
				1435	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1436	{
				1437	if (!PyUnicode_Check(unicode)) {
				1438	PyErr_BadArgument();
				1439	return NULL;
				1440	}
				1441	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1442	PyUnicode_GET_SIZE(unicode));
				1443	}
				1444
				1445	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1446
				1447	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1448	int size,
				1449	const char *errors)
				1450	{
				1451	PyUnicodeObject *v;
				1452	Py_UNICODE p, buf;
				1453	const char *end;
				1454	const char *bs;
				1455
				1456	/* Escaped strings will always be longer than the resulting
				1457	Unicode string, so we start with size here and then reduce the
				1458	length after conversion to the true value. */
				1459	v = _PyUnicode_New(size);
				1460	if (v == NULL)
				1461	goto onError;
				1462	if (size == 0)
				1463	return (PyObject *)v;
				1464	p = buf = PyUnicode_AS_UNICODE(v);
				1465	end = s + size;
				1466	while (s < end) {
				1467	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1468	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1469	int i;
				1470
				1471	/* Non-escape characters are interpreted as Unicode ordinals */
				1472	if (*s != '\\') {
				1473	p++ = (unsigned char)s++;
				1474	continue;
				1475	}
				1476
				1477	/* \u-escapes are only interpreted iff the number of leading
				1478	backslashes if odd */
				1479	bs = s;
				1480	for (;s < end;) {
				1481	if (*s != '\\')
				1482	break;
				1483	p++ = (unsigned char)s++;
				1484	}
				1485	if (((s - bs) & 1) == 0 \|\|
				1486	s >= end \|\|
				1487	*s != 'u') {
				1488	continue;
				1489	}
				1490	p--;
				1491	s++;
				1492
				1493	/* \uXXXX with 4 hex digits */
				1494	for (x = 0, i = 0; i < 4; i++) {
				1495	c = (unsigned char)s[i];
				1496	if (!isxdigit(c)) {
				1497	if (unicodeescape_decoding_error(&s, &x, errors,
				1498	"truncated \\uXXXX"))
				1499	goto onError;
				1500	i++;
				1501	break;
				1502	}
				1503	x = (x<<4) & ~0xF;
				1504	if (c >= '0' && c <= '9')
				1505	x += c - '0';
				1506	else if (c >= 'a' && c <= 'f')
				1507	x += 10 + c - 'a';
				1508	else
				1509	x += 10 + c - 'A';
				1510	}
				1511	s += i;
				1512	*p++ = x;
				1513	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1514	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1515	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1516	return (PyObject *)v;
				1517
				1518	onError:
				1519	Py_XDECREF(v);
				1520	return NULL;
				1521	}
				1522
				1523	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1524	int size)
				1525	{
				1526	PyObject *repr;
				1527	char *p;
				1528	char *q;
				1529
				1530	static const char *hexdigit = "0123456789ABCDEF";
				1531
				1532	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1533	if (repr == NULL)
				1534	return NULL;
				1535
				1536	p = q = PyString_AS_STRING(repr);
				1537	while (size-- > 0) {
				1538	Py_UNICODE ch = *s++;
				1539	/* Map 16-bit characters to '\uxxxx' */
				1540	if (ch >= 256) {
				1541	*p++ = '\\';
				1542	*p++ = 'u';
				1543	*p++ = hexdigit[(ch >> 12) & 0xf];
				1544	*p++ = hexdigit[(ch >> 8) & 0xf];
				1545	*p++ = hexdigit[(ch >> 4) & 0xf];
				1546	*p++ = hexdigit[ch & 15];
				1547	}
				1548	/* Copy everything else as-is */
				1549	else
				1550	*p++ = (char) ch;
				1551	}
				1552	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1553	if (_PyString_Resize(&repr, p - q))
				1554	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1555
				1556	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1557
				1558	onError:
				1559	Py_DECREF(repr);
				1560	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1561	}
				1562
				1563	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1564	{
				1565	if (!PyUnicode_Check(unicode)) {
				1566	PyErr_BadArgument();
				1567	return NULL;
				1568	}
				1569	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1570	PyUnicode_GET_SIZE(unicode));
				1571	}
				1572
				1573	/* --- Latin-1 Codec ------------------------------------------------------ */
				1574
				1575	PyObject PyUnicode_DecodeLatin1(const char s,
				1576	int size,
				1577	const char *errors)
				1578	{
				1579	PyUnicodeObject *v;
				1580	Py_UNICODE *p;
				1581
				1582	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1583	v = _PyUnicode_New(size);
				1584	if (v == NULL)
				1585	goto onError;
				1586	if (size == 0)
				1587	return (PyObject *)v;
				1588	p = PyUnicode_AS_UNICODE(v);
				1589	while (size-- > 0)
				1590	p++ = (unsigned char)s++;
				1591	return (PyObject *)v;
				1592
				1593	onError:
				1594	Py_XDECREF(v);
				1595	return NULL;
				1596	}
				1597
				1598	static
				1599	int latin1_encoding_error(const Py_UNICODE **source,
				1600	char **dest,
				1601	const char *errors,
				1602	const char *details)
				1603	{
				1604	if ((errors == NULL) \|\|
				1605	(strcmp(errors,"strict") == 0)) {
				1606	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1607	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1608	details);
				1609	return -1;
				1610	}
				1611	else if (strcmp(errors,"ignore") == 0) {
				1612	return 0;
				1613	}
				1614	else if (strcmp(errors,"replace") == 0) {
				1615	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1616	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1617	return 0;
				1618	}
				1619	else {
				1620	PyErr_Format(PyExc_ValueError,
				1621	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1622	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1623	errors);
				1624	return -1;
				1625	}
				1626	}
				1627
				1628	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1629	int size,
				1630	const char *errors)
				1631	{
				1632	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1633	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1634	repr = PyString_FromStringAndSize(NULL, size);
				1635	if (repr == NULL)
				1636	return NULL;
				1637
				1638	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1639	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1640	while (size-- > 0) {
				1641	Py_UNICODE ch = *p++;
				1642	if (ch >= 256) {
				1643	if (latin1_encoding_error(&p, &s, errors,
				1644	"ordinal not in range(256)"))
				1645	goto onError;
				1646	}
				1647	else
				1648	*s++ = (char)ch;
				1649	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1650	/* Resize if error handling skipped some characters */
				1651	if (s - start < PyString_GET_SIZE(repr))
				1652	if (_PyString_Resize(&repr, s - start))
				1653	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1654	return repr;
				1655
				1656	onError:
				1657	Py_DECREF(repr);
				1658	return NULL;
				1659	}
				1660
				1661	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1662	{
				1663	if (!PyUnicode_Check(unicode)) {
				1664	PyErr_BadArgument();
				1665	return NULL;
				1666	}
				1667	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1668	PyUnicode_GET_SIZE(unicode),
				1669	NULL);
				1670	}
				1671
				1672	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1673
				1674	static
				1675	int ascii_decoding_error(const char **source,
				1676	Py_UNICODE **dest,
				1677	const char *errors,
				1678	const char *details)
				1679	{
				1680	if ((errors == NULL) \|\|
				1681	(strcmp(errors,"strict") == 0)) {
				1682	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1683	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1684	details);
				1685	return -1;
				1686	}
				1687	else if (strcmp(errors,"ignore") == 0) {
				1688	return 0;
				1689	}
				1690	else if (strcmp(errors,"replace") == 0) {
				1691	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1692	(*dest)++;
				1693	return 0;
				1694	}
				1695	else {
				1696	PyErr_Format(PyExc_ValueError,
				1697	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1698	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1699	errors);
				1700	return -1;
				1701	}
				1702	}
				1703
				1704	PyObject PyUnicode_DecodeASCII(const char s,
				1705	int size,
				1706	const char *errors)
				1707	{
				1708	PyUnicodeObject *v;
				1709	Py_UNICODE *p;
				1710
				1711	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1712	v = _PyUnicode_New(size);
				1713	if (v == NULL)
				1714	goto onError;
				1715	if (size == 0)
				1716	return (PyObject *)v;
				1717	p = PyUnicode_AS_UNICODE(v);
				1718	while (size-- > 0) {
				1719	register unsigned char c;
				1720
				1721	c = (unsigned char)*s++;
				1722	if (c < 128)
				1723	*p++ = c;
				1724	else if (ascii_decoding_error(&s, &p, errors,
				1725	"ordinal not in range(128)"))
				1726	goto onError;
				1727	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1728	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1729	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1730	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	return (PyObject *)v;
				1732
				1733	onError:
				1734	Py_XDECREF(v);
				1735	return NULL;
				1736	}
				1737
				1738	static
				1739	int ascii_encoding_error(const Py_UNICODE **source,
				1740	char **dest,
				1741	const char *errors,
				1742	const char *details)
				1743	{
				1744	if ((errors == NULL) \|\|
				1745	(strcmp(errors,"strict") == 0)) {
				1746	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1747	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1748	details);
				1749	return -1;
				1750	}
				1751	else if (strcmp(errors,"ignore") == 0) {
				1752	return 0;
				1753	}
				1754	else if (strcmp(errors,"replace") == 0) {
				1755	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1756	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1757	return 0;
				1758	}
				1759	else {
				1760	PyErr_Format(PyExc_ValueError,
				1761	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1762	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	errors);
				1764	return -1;
				1765	}
				1766	}
				1767
				1768	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1769	int size,
				1770	const char *errors)
				1771	{
				1772	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1773	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1774	repr = PyString_FromStringAndSize(NULL, size);
				1775	if (repr == NULL)
				1776	return NULL;
				1777
				1778	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1779	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1780	while (size-- > 0) {
				1781	Py_UNICODE ch = *p++;
				1782	if (ch >= 128) {
				1783	if (ascii_encoding_error(&p, &s, errors,
				1784	"ordinal not in range(128)"))
				1785	goto onError;
				1786	}
				1787	else
				1788	*s++ = (char)ch;
				1789	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1790	/* Resize if error handling skipped some characters */
				1791	if (s - start < PyString_GET_SIZE(repr))
				1792	if (_PyString_Resize(&repr, s - start))
				1793	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1794	return repr;
				1795
				1796	onError:
				1797	Py_DECREF(repr);
				1798	return NULL;
				1799	}
				1800
				1801	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1802	{
				1803	if (!PyUnicode_Check(unicode)) {
				1804	PyErr_BadArgument();
				1805	return NULL;
				1806	}
				1807	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1808	PyUnicode_GET_SIZE(unicode),
				1809	NULL);
				1810	}
				1811
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1812	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1813
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1814	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1815
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1816	PyObject PyUnicode_DecodeMBCS(const char s,
				1817	int size,
				1818	const char *errors)
				1819	{
				1820	PyUnicodeObject *v;
				1821	Py_UNICODE *p;
				1822
				1823	/* First get the size of the result */
				1824	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1825	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1826	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1827
				1828	v = _PyUnicode_New(usize);
				1829	if (v == NULL)
				1830	return NULL;
				1831	if (usize == 0)
				1832	return (PyObject *)v;
				1833	p = PyUnicode_AS_UNICODE(v);
				1834	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1835	Py_DECREF(v);
				1836	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1837	}
				1838
				1839	return (PyObject *)v;
				1840	}
				1841
				1842	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1843	int size,
				1844	const char *errors)
				1845	{
				1846	PyObject *repr;
				1847	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1848	DWORD mbcssize;
				1849
				1850	/* If there are no characters, bail now! */
				1851	if (size==0)
				1852	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1853
				1854	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1855	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1856	if (mbcssize==0)
				1857	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1858
				1859	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1860	if (repr == NULL)
				1861	return NULL;
				1862	if (mbcssize==0)
				1863	return repr;
				1864
				1865	/* Do the conversion */
				1866	s = PyString_AS_STRING(repr);
				1867	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1868	Py_DECREF(repr);
				1869	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1870	}
				1871	return repr;
				1872	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1873
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1874	#endif /* MS_WIN32 */
				1875
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1876	/* --- Character Mapping Codec -------------------------------------------- */
				1877
				1878	static
				1879	int charmap_decoding_error(const char **source,
				1880	Py_UNICODE **dest,
				1881	const char *errors,
				1882	const char *details)
				1883	{
				1884	if ((errors == NULL) \|\|
				1885	(strcmp(errors,"strict") == 0)) {
				1886	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1887	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1888	details);
				1889	return -1;
				1890	}
				1891	else if (strcmp(errors,"ignore") == 0) {
				1892	return 0;
				1893	}
				1894	else if (strcmp(errors,"replace") == 0) {
				1895	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1896	(*dest)++;
				1897	return 0;
				1898	}
				1899	else {
				1900	PyErr_Format(PyExc_ValueError,
				1901	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1902	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1903	errors);
				1904	return -1;
				1905	}
				1906	}
				1907
				1908	PyObject PyUnicode_DecodeCharmap(const char s,
				1909	int size,
				1910	PyObject *mapping,
				1911	const char *errors)
				1912	{
				1913	PyUnicodeObject *v;
				1914	Py_UNICODE *p;
				1915
				1916	/* Default to Latin-1 */
				1917	if (mapping == NULL)
				1918	return PyUnicode_DecodeLatin1(s, size, errors);
				1919
				1920	v = _PyUnicode_New(size);
				1921	if (v == NULL)
				1922	goto onError;
				1923	if (size == 0)
				1924	return (PyObject *)v;
				1925	p = PyUnicode_AS_UNICODE(v);
				1926	while (size-- > 0) {
				1927	unsigned char ch = *s++;
				1928	PyObject w, x;
				1929
				1930	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1931	w = PyInt_FromLong((long)ch);
				1932	if (w == NULL)
				1933	goto onError;
				1934	x = PyObject_GetItem(mapping, w);
				1935	Py_DECREF(w);
				1936	if (x == NULL) {
				1937	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1938	/* No mapping found: default to Latin-1 mapping */
				1939	PyErr_Clear();
				1940	*p++ = (Py_UNICODE)ch;
				1941	continue;
				1942	}
				1943	goto onError;
				1944	}
				1945
				1946	/* Apply mapping */
				1947	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1948	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1949	if (value < 0 \|\| value > 65535) {
				1950	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1951	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1952	Py_DECREF(x);
				1953	goto onError;
				1954	}
				1955	*p++ = (Py_UNICODE)value;
				1956	}
				1957	else if (x == Py_None) {
				1958	/* undefined mapping */
				1959	if (charmap_decoding_error(&s, &p, errors,
				1960	"character maps to <undefined>")) {
				1961	Py_DECREF(x);
				1962	goto onError;
				1963	}
				1964	}
				1965	else if (PyUnicode_Check(x)) {
				1966	if (PyUnicode_GET_SIZE(x) != 1) {
				1967	/* 1-n mapping */
				1968	PyErr_SetString(PyExc_NotImplementedError,
				1969	"1-n mappings are currently not implemented");
				1970	Py_DECREF(x);
				1971	goto onError;
				1972	}
				1973	p++ = PyUnicode_AS_UNICODE(x);
				1974	}
				1975	else {
				1976	/* wrong return value */
				1977	PyErr_SetString(PyExc_TypeError,
				1978	"character mapping must return integer, None or unicode");
				1979	Py_DECREF(x);
				1980	goto onError;
				1981	}
				1982	Py_DECREF(x);
				1983	}
				1984	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1985	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1986	goto onError;
				1987	return (PyObject *)v;
				1988
				1989	onError:
				1990	Py_XDECREF(v);
				1991	return NULL;
				1992	}
				1993
				1994	static
				1995	int charmap_encoding_error(const Py_UNICODE **source,
				1996	char **dest,
				1997	const char *errors,
				1998	const char *details)
				1999	{
				2000	if ((errors == NULL) \|\|
				2001	(strcmp(errors,"strict") == 0)) {
				2002	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2003	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2004	details);
				2005	return -1;
				2006	}
				2007	else if (strcmp(errors,"ignore") == 0) {
				2008	return 0;
				2009	}
				2010	else if (strcmp(errors,"replace") == 0) {
				2011	**dest = '?';
				2012	(*dest)++;
				2013	return 0;
				2014	}
				2015	else {
				2016	PyErr_Format(PyExc_ValueError,
				2017	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2018	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2019	errors);
				2020	return -1;
				2021	}
				2022	}
				2023
				2024	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2025	int size,
				2026	PyObject *mapping,
				2027	const char *errors)
				2028	{
				2029	PyObject *v;
				2030	char *s;
				2031
				2032	/* Default to Latin-1 */
				2033	if (mapping == NULL)
				2034	return PyUnicode_EncodeLatin1(p, size, errors);
				2035
				2036	v = PyString_FromStringAndSize(NULL, size);
				2037	if (v == NULL)
				2038	return NULL;
				2039	s = PyString_AS_STRING(v);
				2040	while (size-- > 0) {
				2041	Py_UNICODE ch = *p++;
				2042	PyObject w, x;
				2043
				2044	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2045	w = PyInt_FromLong((long)ch);
				2046	if (w == NULL)
				2047	goto onError;
				2048	x = PyObject_GetItem(mapping, w);
				2049	Py_DECREF(w);
				2050	if (x == NULL) {
				2051	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2052	/* No mapping found: default to Latin-1 mapping if possible */
				2053	PyErr_Clear();
				2054	if (ch < 256) {
				2055	*s++ = (char)ch;
				2056	continue;
				2057	}
				2058	else if (!charmap_encoding_error(&p, &s, errors,
				2059	"missing character mapping"))
				2060	continue;
				2061	}
				2062	goto onError;
				2063	}
				2064
				2065	/* Apply mapping */
				2066	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2067	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2068	if (value < 0 \|\| value > 255) {
				2069	PyErr_SetString(PyExc_TypeError,
				2070	"character mapping must be in range(256)");
				2071	Py_DECREF(x);
				2072	goto onError;
				2073	}
				2074	*s++ = (char)value;
				2075	}
				2076	else if (x == Py_None) {
				2077	/* undefined mapping */
				2078	if (charmap_encoding_error(&p, &s, errors,
				2079	"character maps to <undefined>")) {
				2080	Py_DECREF(x);
				2081	goto onError;
				2082	}
				2083	}
				2084	else if (PyString_Check(x)) {
				2085	if (PyString_GET_SIZE(x) != 1) {
				2086	/* 1-n mapping */
				2087	PyErr_SetString(PyExc_NotImplementedError,
				2088	"1-n mappings are currently not implemented");
				2089	Py_DECREF(x);
				2090	goto onError;
				2091	}
				2092	s++ = PyString_AS_STRING(x);
				2093	}
				2094	else {
				2095	/* wrong return value */
				2096	PyErr_SetString(PyExc_TypeError,
				2097	"character mapping must return integer, None or unicode");
				2098	Py_DECREF(x);
				2099	goto onError;
				2100	}
				2101	Py_DECREF(x);
				2102	}
				2103	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2104	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2105	goto onError;
				2106	return v;
				2107
				2108	onError:
				2109	Py_DECREF(v);
				2110	return NULL;
				2111	}
				2112
				2113	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2114	PyObject *mapping)
				2115	{
				2116	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2117	PyErr_BadArgument();
				2118	return NULL;
				2119	}
				2120	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2121	PyUnicode_GET_SIZE(unicode),
				2122	mapping,
				2123	NULL);
				2124	}
				2125
				2126	static
				2127	int translate_error(const Py_UNICODE **source,
				2128	Py_UNICODE **dest,
				2129	const char *errors,
				2130	const char *details)
				2131	{
				2132	if ((errors == NULL) \|\|
				2133	(strcmp(errors,"strict") == 0)) {
				2134	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2135	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2136	details);
				2137	return -1;
				2138	}
				2139	else if (strcmp(errors,"ignore") == 0) {
				2140	return 0;
				2141	}
				2142	else if (strcmp(errors,"replace") == 0) {
				2143	**dest = '?';
				2144	(*dest)++;
				2145	return 0;
				2146	}
				2147	else {
				2148	PyErr_Format(PyExc_ValueError,
				2149	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2150	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2151	errors);
				2152	return -1;
				2153	}
				2154	}
				2155
				2156	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2157	int size,
				2158	PyObject *mapping,
				2159	const char *errors)
				2160	{
				2161	PyUnicodeObject *v;
				2162	Py_UNICODE *p;
				2163
				2164	if (mapping == NULL) {
				2165	PyErr_BadArgument();
				2166	return NULL;
				2167	}
				2168
				2169	/* Output will never be longer than input */
				2170	v = _PyUnicode_New(size);
				2171	if (v == NULL)
				2172	goto onError;
				2173	if (size == 0)
				2174	goto done;
				2175	p = PyUnicode_AS_UNICODE(v);
				2176	while (size-- > 0) {
				2177	Py_UNICODE ch = *s++;
				2178	PyObject w, x;
				2179
				2180	/* Get mapping */
				2181	w = PyInt_FromLong(ch);
				2182	if (w == NULL)
				2183	goto onError;
				2184	x = PyObject_GetItem(mapping, w);
				2185	Py_DECREF(w);
				2186	if (x == NULL) {
				2187	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2188	/* No mapping found: default to 1-1 mapping */
				2189	PyErr_Clear();
				2190	*p++ = ch;
				2191	continue;
				2192	}
				2193	goto onError;
				2194	}
				2195
				2196	/* Apply mapping */
				2197	if (PyInt_Check(x))
				2198	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2199	else if (x == Py_None) {
				2200	/* undefined mapping */
				2201	if (translate_error(&s, &p, errors,
				2202	"character maps to <undefined>")) {
				2203	Py_DECREF(x);
				2204	goto onError;
				2205	}
				2206	}
				2207	else if (PyUnicode_Check(x)) {
				2208	if (PyUnicode_GET_SIZE(x) != 1) {
				2209	/* 1-n mapping */
				2210	PyErr_SetString(PyExc_NotImplementedError,
				2211	"1-n mappings are currently not implemented");
				2212	Py_DECREF(x);
				2213	goto onError;
				2214	}
				2215	p++ = PyUnicode_AS_UNICODE(x);
				2216	}
				2217	else {
				2218	/* wrong return value */
				2219	PyErr_SetString(PyExc_TypeError,
				2220	"translate mapping must return integer, None or unicode");
				2221	Py_DECREF(x);
				2222	goto onError;
				2223	}
				2224	Py_DECREF(x);
				2225	}
				2226	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2227	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2228	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2229
				2230	done:
				2231	return (PyObject *)v;
				2232
				2233	onError:
				2234	Py_XDECREF(v);
				2235	return NULL;
				2236	}
				2237
				2238	PyObject PyUnicode_Translate(PyObject str,
				2239	PyObject *mapping,
				2240	const char *errors)
				2241	{
				2242	PyObject *result;
				2243
				2244	str = PyUnicode_FromObject(str);
				2245	if (str == NULL)
				2246	goto onError;
				2247	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2248	PyUnicode_GET_SIZE(str),
				2249	mapping,
				2250	errors);
				2251	Py_DECREF(str);
				2252	return result;
				2253
				2254	onError:
				2255	Py_XDECREF(str);
				2256	return NULL;
				2257	}
				2258
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2259	/* --- Decimal Encoder ---------------------------------------------------- */
				2260
				2261	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2262	int length,
				2263	char *output,
				2264	const char *errors)
				2265	{
				2266	Py_UNICODE p, end;
				2267
				2268	if (output == NULL) {
				2269	PyErr_BadArgument();
				2270	return -1;
				2271	}
				2272
				2273	p = s;
				2274	end = s + length;
				2275	while (p < end) {
				2276	register Py_UNICODE ch = *p++;
				2277	int decimal;
				2278
				2279	if (Py_UNICODE_ISSPACE(ch)) {
				2280	*output++ = ' ';
				2281	continue;
				2282	}
				2283	decimal = Py_UNICODE_TODECIMAL(ch);
				2284	if (decimal >= 0) {
				2285	*output++ = '0' + decimal;
				2286	continue;
				2287	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2288	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2289	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2290	continue;
				2291	}
				2292	/* All other characters are considered invalid */
				2293	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2294	PyErr_SetString(PyExc_ValueError,
				2295	"invalid decimal Unicode string");
				2296	goto onError;
				2297	}
				2298	else if (strcmp(errors, "ignore") == 0)
				2299	continue;
				2300	else if (strcmp(errors, "replace") == 0) {
				2301	*output++ = '?';
				2302	continue;
				2303	}
				2304	}
				2305	/* 0-terminate the output string */
				2306	*output++ = '\0';
				2307	return 0;
				2308
				2309	onError:
				2310	return -1;
				2311	}
				2312
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2313	/* --- Helpers ------------------------------------------------------------ */
				2314
				2315	static
				2316	int count(PyUnicodeObject *self,
				2317	int start,
				2318	int end,
				2319	PyUnicodeObject *substring)
				2320	{
				2321	int count = 0;
				2322
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2323	if (substring->length == 0)
				2324	return (end - start + 1);
				2325
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2326	end -= substring->length;
				2327
				2328	while (start <= end)
				2329	if (Py_UNICODE_MATCH(self, start, substring)) {
				2330	count++;
				2331	start += substring->length;
				2332	} else
				2333	start++;
				2334
				2335	return count;
				2336	}
				2337
				2338	int PyUnicode_Count(PyObject *str,
				2339	PyObject *substr,
				2340	int start,
				2341	int end)
				2342	{
				2343	int result;
				2344
				2345	str = PyUnicode_FromObject(str);
				2346	if (str == NULL)
				2347	return -1;
				2348	substr = PyUnicode_FromObject(substr);
				2349	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2350	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2351	return -1;
				2352	}
				2353
				2354	result = count((PyUnicodeObject *)str,
				2355	start, end,
				2356	(PyUnicodeObject *)substr);
				2357
				2358	Py_DECREF(str);
				2359	Py_DECREF(substr);
				2360	return result;
				2361	}
				2362
				2363	static
				2364	int findstring(PyUnicodeObject *self,
				2365	PyUnicodeObject *substring,
				2366	int start,
				2367	int end,
				2368	int direction)
				2369	{
				2370	if (start < 0)
				2371	start += self->length;
				2372	if (start < 0)
				2373	start = 0;
				2374
				2375	if (substring->length == 0)
				2376	return start;
				2377
				2378	if (end > self->length)
				2379	end = self->length;
				2380	if (end < 0)
				2381	end += self->length;
				2382	if (end < 0)
				2383	end = 0;
				2384
				2385	end -= substring->length;
				2386
				2387	if (direction < 0) {
				2388	for (; end >= start; end--)
				2389	if (Py_UNICODE_MATCH(self, end, substring))
				2390	return end;
				2391	} else {
				2392	for (; start <= end; start++)
				2393	if (Py_UNICODE_MATCH(self, start, substring))
				2394	return start;
				2395	}
				2396
				2397	return -1;
				2398	}
				2399
				2400	int PyUnicode_Find(PyObject *str,
				2401	PyObject *substr,
				2402	int start,
				2403	int end,
				2404	int direction)
				2405	{
				2406	int result;
				2407
				2408	str = PyUnicode_FromObject(str);
				2409	if (str == NULL)
				2410	return -1;
				2411	substr = PyUnicode_FromObject(substr);
				2412	if (substr == NULL) {
				2413	Py_DECREF(substr);
				2414	return -1;
				2415	}
				2416
				2417	result = findstring((PyUnicodeObject *)str,
				2418	(PyUnicodeObject *)substr,
				2419	start, end, direction);
				2420	Py_DECREF(str);
				2421	Py_DECREF(substr);
				2422	return result;
				2423	}
				2424
				2425	static
				2426	int tailmatch(PyUnicodeObject *self,
				2427	PyUnicodeObject *substring,
				2428	int start,
				2429	int end,
				2430	int direction)
				2431	{
				2432	if (start < 0)
				2433	start += self->length;
				2434	if (start < 0)
				2435	start = 0;
				2436
				2437	if (substring->length == 0)
				2438	return 1;
				2439
				2440	if (end > self->length)
				2441	end = self->length;
				2442	if (end < 0)
				2443	end += self->length;
				2444	if (end < 0)
				2445	end = 0;
				2446
				2447	end -= substring->length;
				2448	if (end < start)
				2449	return 0;
				2450
				2451	if (direction > 0) {
				2452	if (Py_UNICODE_MATCH(self, end, substring))
				2453	return 1;
				2454	} else {
				2455	if (Py_UNICODE_MATCH(self, start, substring))
				2456	return 1;
				2457	}
				2458
				2459	return 0;
				2460	}
				2461
				2462	int PyUnicode_Tailmatch(PyObject *str,
				2463	PyObject *substr,
				2464	int start,
				2465	int end,
				2466	int direction)
				2467	{
				2468	int result;
				2469
				2470	str = PyUnicode_FromObject(str);
				2471	if (str == NULL)
				2472	return -1;
				2473	substr = PyUnicode_FromObject(substr);
				2474	if (substr == NULL) {
				2475	Py_DECREF(substr);
				2476	return -1;
				2477	}
				2478
				2479	result = tailmatch((PyUnicodeObject *)str,
				2480	(PyUnicodeObject *)substr,
				2481	start, end, direction);
				2482	Py_DECREF(str);
				2483	Py_DECREF(substr);
				2484	return result;
				2485	}
				2486
				2487	static
				2488	const Py_UNICODE findchar(const Py_UNICODE s,
				2489	int size,
				2490	Py_UNICODE ch)
				2491	{
				2492	/* like wcschr, but doesn't stop at NULL characters */
				2493
				2494	while (size-- > 0) {
				2495	if (*s == ch)
				2496	return s;
				2497	s++;
				2498	}
				2499
				2500	return NULL;
				2501	}
				2502
				2503	/* Apply fixfct filter to the Unicode object self and return a
				2504	reference to the modified object */
				2505
				2506	static
				2507	PyObject fixup(PyUnicodeObject self,
				2508	int (fixfct)(PyUnicodeObject s))
				2509	{
				2510
				2511	PyUnicodeObject *u;
				2512
				2513	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2514	self->length);
				2515	if (u == NULL)
				2516	return NULL;
				2517	if (!fixfct(u)) {
				2518	/* fixfct should return TRUE if it modified the buffer. If
				2519	FALSE, return a reference to the original buffer instead
				2520	(to save space, not time) */
				2521	Py_INCREF(self);
				2522	Py_DECREF(u);
				2523	return (PyObject*) self;
				2524	}
				2525	return (PyObject*) u;
				2526	}
				2527
				2528	static
				2529	int fixupper(PyUnicodeObject *self)
				2530	{
				2531	int len = self->length;
				2532	Py_UNICODE *s = self->str;
				2533	int status = 0;
				2534
				2535	while (len-- > 0) {
				2536	register Py_UNICODE ch;
				2537
				2538	ch = Py_UNICODE_TOUPPER(*s);
				2539	if (ch != *s) {
				2540	status = 1;
				2541	*s = ch;
				2542	}
				2543	s++;
				2544	}
				2545
				2546	return status;
				2547	}
				2548
				2549	static
				2550	int fixlower(PyUnicodeObject *self)
				2551	{
				2552	int len = self->length;
				2553	Py_UNICODE *s = self->str;
				2554	int status = 0;
				2555
				2556	while (len-- > 0) {
				2557	register Py_UNICODE ch;
				2558
				2559	ch = Py_UNICODE_TOLOWER(*s);
				2560	if (ch != *s) {
				2561	status = 1;
				2562	*s = ch;
				2563	}
				2564	s++;
				2565	}
				2566
				2567	return status;
				2568	}
				2569
				2570	static
				2571	int fixswapcase(PyUnicodeObject *self)
				2572	{
				2573	int len = self->length;
				2574	Py_UNICODE *s = self->str;
				2575	int status = 0;
				2576
				2577	while (len-- > 0) {
				2578	if (Py_UNICODE_ISUPPER(*s)) {
				2579	s = Py_UNICODE_TOLOWER(s);
				2580	status = 1;
				2581	} else if (Py_UNICODE_ISLOWER(*s)) {
				2582	s = Py_UNICODE_TOUPPER(s);
				2583	status = 1;
				2584	}
				2585	s++;
				2586	}
				2587
				2588	return status;
				2589	}
				2590
				2591	static
				2592	int fixcapitalize(PyUnicodeObject *self)
				2593	{
				2594	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2595	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2596	return 1;
				2597	}
				2598	return 0;
				2599	}
				2600
				2601	static
				2602	int fixtitle(PyUnicodeObject *self)
				2603	{
				2604	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2605	register Py_UNICODE *e;
				2606	int previous_is_cased;
				2607
				2608	/* Shortcut for single character strings */
				2609	if (PyUnicode_GET_SIZE(self) == 1) {
				2610	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2611	if (*p != ch) {
				2612	*p = ch;
				2613	return 1;
				2614	}
				2615	else
				2616	return 0;
				2617	}
				2618
				2619	e = p + PyUnicode_GET_SIZE(self);
				2620	previous_is_cased = 0;
				2621	for (; p < e; p++) {
				2622	register const Py_UNICODE ch = *p;
				2623
				2624	if (previous_is_cased)
				2625	*p = Py_UNICODE_TOLOWER(ch);
				2626	else
				2627	*p = Py_UNICODE_TOTITLE(ch);
				2628
				2629	if (Py_UNICODE_ISLOWER(ch) \|\|
				2630	Py_UNICODE_ISUPPER(ch) \|\|
				2631	Py_UNICODE_ISTITLE(ch))
				2632	previous_is_cased = 1;
				2633	else
				2634	previous_is_cased = 0;
				2635	}
				2636	return 1;
				2637	}
				2638
				2639	PyObject PyUnicode_Join(PyObject separator,
				2640	PyObject *seq)
				2641	{
				2642	Py_UNICODE *sep;
				2643	int seplen;
				2644	PyUnicodeObject *res = NULL;
				2645	int reslen = 0;
				2646	Py_UNICODE *p;
				2647	int seqlen = 0;
				2648	int sz = 100;
				2649	int i;
				2650
				2651	seqlen = PySequence_Length(seq);
				2652	if (seqlen < 0 && PyErr_Occurred())
				2653	return NULL;
				2654
				2655	if (separator == NULL) {
				2656	Py_UNICODE blank = ' ';
				2657	sep = &blank;
				2658	seplen = 1;
				2659	}
				2660	else {
				2661	separator = PyUnicode_FromObject(separator);
				2662	if (separator == NULL)
				2663	return NULL;
				2664	sep = PyUnicode_AS_UNICODE(separator);
				2665	seplen = PyUnicode_GET_SIZE(separator);
				2666	}
				2667
				2668	res = _PyUnicode_New(sz);
				2669	if (res == NULL)
				2670	goto onError;
				2671	p = PyUnicode_AS_UNICODE(res);
				2672	reslen = 0;
				2673
				2674	for (i = 0; i < seqlen; i++) {
				2675	int itemlen;
				2676	PyObject *item;
				2677
				2678	item = PySequence_GetItem(seq, i);
				2679	if (item == NULL)
				2680	goto onError;
				2681	if (!PyUnicode_Check(item)) {
				2682	PyObject *v;
				2683	v = PyUnicode_FromObject(item);
				2684	Py_DECREF(item);
				2685	item = v;
				2686	if (item == NULL)
				2687	goto onError;
				2688	}
				2689	itemlen = PyUnicode_GET_SIZE(item);
				2690	while (reslen + itemlen + seplen >= sz) {
				2691	if (_PyUnicode_Resize(res, sz*2))
				2692	goto onError;
				2693	sz *= 2;
				2694	p = PyUnicode_AS_UNICODE(res) + reslen;
				2695	}
				2696	if (i > 0) {
				2697	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2698	p += seplen;
				2699	reslen += seplen;
				2700	}
				2701	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2702	p += itemlen;
				2703	reslen += itemlen;
				2704	Py_DECREF(item);
				2705	}
				2706	if (_PyUnicode_Resize(res, reslen))
				2707	goto onError;
				2708
				2709	Py_XDECREF(separator);
				2710	return (PyObject *)res;
				2711
				2712	onError:
				2713	Py_XDECREF(separator);
				2714	Py_DECREF(res);
				2715	return NULL;
				2716	}
				2717
				2718	static
				2719	PyUnicodeObject pad(PyUnicodeObject self,
				2720	int left,
				2721	int right,
				2722	Py_UNICODE fill)
				2723	{
				2724	PyUnicodeObject *u;
				2725
				2726	if (left < 0)
				2727	left = 0;
				2728	if (right < 0)
				2729	right = 0;
				2730
				2731	if (left == 0 && right == 0) {
				2732	Py_INCREF(self);
				2733	return self;
				2734	}
				2735
				2736	u = _PyUnicode_New(left + self->length + right);
				2737	if (u) {
				2738	if (left)
				2739	Py_UNICODE_FILL(u->str, fill, left);
				2740	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2741	if (right)
				2742	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2743	}
				2744
				2745	return u;
				2746	}
				2747
				2748	#define SPLIT_APPEND(data, left, right) \
				2749	str = PyUnicode_FromUnicode(data + left, right - left); \
				2750	if (!str) \
				2751	goto onError; \
				2752	if (PyList_Append(list, str)) { \
				2753	Py_DECREF(str); \
				2754	goto onError; \
				2755	} \
				2756	else \
				2757	Py_DECREF(str);
				2758
				2759	static
				2760	PyObject split_whitespace(PyUnicodeObject self,
				2761	PyObject *list,
				2762	int maxcount)
				2763	{
				2764	register int i;
				2765	register int j;
				2766	int len = self->length;
				2767	PyObject *str;
				2768
				2769	for (i = j = 0; i < len; ) {
				2770	/* find a token */
				2771	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2772	i++;
				2773	j = i;
				2774	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2775	i++;
				2776	if (j < i) {
				2777	if (maxcount-- <= 0)
				2778	break;
				2779	SPLIT_APPEND(self->str, j, i);
				2780	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2781	i++;
				2782	j = i;
				2783	}
				2784	}
				2785	if (j < len) {
				2786	SPLIT_APPEND(self->str, j, len);
				2787	}
				2788	return list;
				2789
				2790	onError:
				2791	Py_DECREF(list);
				2792	return NULL;
				2793	}
				2794
				2795	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2796	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2797	{
				2798	register int i;
				2799	register int j;
				2800	int len;
				2801	PyObject *list;
				2802	PyObject *str;
				2803	Py_UNICODE *data;
				2804
				2805	string = PyUnicode_FromObject(string);
				2806	if (string == NULL)
				2807	return NULL;
				2808	data = PyUnicode_AS_UNICODE(string);
				2809	len = PyUnicode_GET_SIZE(string);
				2810
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2811	list = PyList_New(0);
				2812	if (!list)
				2813	goto onError;
				2814
				2815	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2816	int eol;
				2817
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2818	/* Find a line and append it */
				2819	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2820	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2821
				2822	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2823	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2824	if (i < len) {
				2825	if (data[i] == '\r' && i + 1 < len &&
				2826	data[i+1] == '\n')
				2827	i += 2;
				2828	else
				2829	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2830	if (keepends)
				2831	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2832	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2833	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2834	j = i;
				2835	}
				2836	if (j < len) {
				2837	SPLIT_APPEND(data, j, len);
				2838	}
				2839
				2840	Py_DECREF(string);
				2841	return list;
				2842
				2843	onError:
				2844	Py_DECREF(list);
				2845	Py_DECREF(string);
				2846	return NULL;
				2847	}
				2848
				2849	static
				2850	PyObject split_char(PyUnicodeObject self,
				2851	PyObject *list,
				2852	Py_UNICODE ch,
				2853	int maxcount)
				2854	{
				2855	register int i;
				2856	register int j;
				2857	int len = self->length;
				2858	PyObject *str;
				2859
				2860	for (i = j = 0; i < len; ) {
				2861	if (self->str[i] == ch) {
				2862	if (maxcount-- <= 0)
				2863	break;
				2864	SPLIT_APPEND(self->str, j, i);
				2865	i = j = i + 1;
				2866	} else
				2867	i++;
				2868	}
				2869	if (j <= len) {
				2870	SPLIT_APPEND(self->str, j, len);
				2871	}
				2872	return list;
				2873
				2874	onError:
				2875	Py_DECREF(list);
				2876	return NULL;
				2877	}
				2878
				2879	static
				2880	PyObject split_substring(PyUnicodeObject self,
				2881	PyObject *list,
				2882	PyUnicodeObject *substring,
				2883	int maxcount)
				2884	{
				2885	register int i;
				2886	register int j;
				2887	int len = self->length;
				2888	int sublen = substring->length;
				2889	PyObject *str;
				2890
				2891	for (i = j = 0; i < len - sublen; ) {
				2892	if (Py_UNICODE_MATCH(self, i, substring)) {
				2893	if (maxcount-- <= 0)
				2894	break;
				2895	SPLIT_APPEND(self->str, j, i);
				2896	i = j = i + sublen;
				2897	} else
				2898	i++;
				2899	}
				2900	if (j <= len) {
				2901	SPLIT_APPEND(self->str, j, len);
				2902	}
				2903	return list;
				2904
				2905	onError:
				2906	Py_DECREF(list);
				2907	return NULL;
				2908	}
				2909
				2910	#undef SPLIT_APPEND
				2911
				2912	static
				2913	PyObject split(PyUnicodeObject self,
				2914	PyUnicodeObject *substring,
				2915	int maxcount)
				2916	{
				2917	PyObject *list;
				2918
				2919	if (maxcount < 0)
				2920	maxcount = INT_MAX;
				2921
				2922	list = PyList_New(0);
				2923	if (!list)
				2924	return NULL;
				2925
				2926	if (substring == NULL)
				2927	return split_whitespace(self,list,maxcount);
				2928
				2929	else if (substring->length == 1)
				2930	return split_char(self,list,substring->str[0],maxcount);
				2931
				2932	else if (substring->length == 0) {
				2933	Py_DECREF(list);
				2934	PyErr_SetString(PyExc_ValueError, "empty separator");
				2935	return NULL;
				2936	}
				2937	else
				2938	return split_substring(self,list,substring,maxcount);
				2939	}
				2940
				2941	static
				2942	PyObject strip(PyUnicodeObject self,
				2943	int left,
				2944	int right)
				2945	{
				2946	Py_UNICODE *p = self->str;
				2947	int start = 0;
				2948	int end = self->length;
				2949
				2950	if (left)
				2951	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2952	start++;
				2953
				2954	if (right)
				2955	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2956	end--;
				2957
				2958	if (start == 0 && end == self->length) {
				2959	/* couldn't strip anything off, return original string */
				2960	Py_INCREF(self);
				2961	return (PyObject*) self;
				2962	}
				2963
				2964	return (PyObject*) PyUnicode_FromUnicode(
				2965	self->str + start,
				2966	end - start
				2967	);
				2968	}
				2969
				2970	static
				2971	PyObject replace(PyUnicodeObject self,
				2972	PyUnicodeObject *str1,
				2973	PyUnicodeObject *str2,
				2974	int maxcount)
				2975	{
				2976	PyUnicodeObject *u;
				2977
				2978	if (maxcount < 0)
				2979	maxcount = INT_MAX;
				2980
				2981	if (str1->length == 1 && str2->length == 1) {
				2982	int i;
				2983
				2984	/* replace characters */
				2985	if (!findchar(self->str, self->length, str1->str[0])) {
				2986	/* nothing to replace, return original string */
				2987	Py_INCREF(self);
				2988	u = self;
				2989	} else {
				2990	Py_UNICODE u1 = str1->str[0];
				2991	Py_UNICODE u2 = str2->str[0];
				2992
				2993	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2994	self->str,
				2995	self->length
				2996	);
				2997	if (u)
				2998	for (i = 0; i < u->length; i++)
				2999	if (u->str[i] == u1) {
				3000	if (--maxcount < 0)
				3001	break;
				3002	u->str[i] = u2;
				3003	}
				3004	}
				3005
				3006	} else {
				3007	int n, i;
				3008	Py_UNICODE *p;
				3009
				3010	/* replace strings */
				3011	n = count(self, 0, self->length, str1);
				3012	if (n > maxcount)
				3013	n = maxcount;
				3014	if (n == 0) {
				3015	/* nothing to replace, return original string */
				3016	Py_INCREF(self);
				3017	u = self;
				3018	} else {
				3019	u = _PyUnicode_New(
				3020	self->length + n * (str2->length - str1->length));
				3021	if (u) {
				3022	i = 0;
				3023	p = u->str;
				3024	while (i <= self->length - str1->length)
				3025	if (Py_UNICODE_MATCH(self, i, str1)) {
				3026	/* replace string segment */
				3027	Py_UNICODE_COPY(p, str2->str, str2->length);
				3028	p += str2->length;
				3029	i += str1->length;
				3030	if (--n <= 0) {
				3031	/* copy remaining part */
				3032	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3033	break;
				3034	}
				3035	} else
				3036	*p++ = self->str[i++];
				3037	}
				3038	}
				3039	}
				3040
				3041	return (PyObject *) u;
				3042	}
				3043
				3044	/* --- Unicode Object Methods --------------------------------------------- */
				3045
				3046	static char title__doc__[] =
				3047	"S.title() -> unicode\n\
				3048	\n\
				3049	Return a titlecased version of S, i.e. words start with title case\n\
				3050	characters, all remaining cased characters have lower case.";
				3051
				3052	static PyObject*
				3053	unicode_title(PyUnicodeObject self, PyObject args)
				3054	{
				3055	if (!PyArg_NoArgs(args))
				3056	return NULL;
				3057	return fixup(self, fixtitle);
				3058	}
				3059
				3060	static char capitalize__doc__[] =
				3061	"S.capitalize() -> unicode\n\
				3062	\n\
				3063	Return a capitalized version of S, i.e. make the first character\n\
				3064	have upper case.";
				3065
				3066	static PyObject*
				3067	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3068	{
				3069	if (!PyArg_NoArgs(args))
				3070	return NULL;
				3071	return fixup(self, fixcapitalize);
				3072	}
				3073
				3074	#if 0
				3075	static char capwords__doc__[] =
				3076	"S.capwords() -> unicode\n\
				3077	\n\
				3078	Apply .capitalize() to all words in S and return the result with\n\
				3079	normalized whitespace (all whitespace strings are replaced by ' ').";
				3080
				3081	static PyObject*
				3082	unicode_capwords(PyUnicodeObject self, PyObject args)
				3083	{
				3084	PyObject *list;
				3085	PyObject *item;
				3086	int i;
				3087
				3088	if (!PyArg_NoArgs(args))
				3089	return NULL;
				3090
				3091	/* Split into words */
				3092	list = split(self, NULL, -1);
				3093	if (!list)
				3094	return NULL;
				3095
				3096	/* Capitalize each word */
				3097	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3098	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3099	fixcapitalize);
				3100	if (item == NULL)
				3101	goto onError;
				3102	Py_DECREF(PyList_GET_ITEM(list, i));
				3103	PyList_SET_ITEM(list, i, item);
				3104	}
				3105
				3106	/* Join the words to form a new string */
				3107	item = PyUnicode_Join(NULL, list);
				3108
				3109	onError:
				3110	Py_DECREF(list);
				3111	return (PyObject *)item;
				3112	}
				3113	#endif
				3114
				3115	static char center__doc__[] =
				3116	"S.center(width) -> unicode\n\
				3117	\n\
				3118	Return S centered in a Unicode string of length width. Padding is done\n\
				3119	using spaces.";
				3120
				3121	static PyObject *
				3122	unicode_center(PyUnicodeObject self, PyObject args)
				3123	{
				3124	int marg, left;
				3125	int width;
				3126
				3127	if (!PyArg_ParseTuple(args, "i:center", &width))
				3128	return NULL;
				3129
				3130	if (self->length >= width) {
				3131	Py_INCREF(self);
				3132	return (PyObject*) self;
				3133	}
				3134
				3135	marg = width - self->length;
				3136	left = marg / 2 + (marg & width & 1);
				3137
				3138	return (PyObject*) pad(self, left, marg - left, ' ');
				3139	}
				3140
/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Adjustment table indexed by (code unit >> 11), i.e. one entry per
   block of 0x800 code units.  Only the top five blocks are non-zero:
   0xD800-0xDFFF is moved up by 0x2000 and 0xE000-0xFFFF is moved down
   by 0x800, which places the first range above the second. */
static short utf16Fixup[32] =
{
    /* 0x0000 - 0x1FFF */  0,      0,      0,      0,
    /* 0x2000 - 0x3FFF */  0,      0,      0,      0,
    /* 0x4000 - 0x5FFF */  0,      0,      0,      0,
    /* 0x6000 - 0x7FFF */  0,      0,      0,      0,
    /* 0x8000 - 0x9FFF */  0,      0,      0,      0,
    /* 0xA000 - 0xBFFF */  0,      0,      0,      0,
    /* 0xC000 - 0xDFFF */  0,      0,      0,      0x2000,
    /* 0xE000 - 0xFFFF */  -0x800, -0x800, -0x800, -0x800
};
				3152
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3153	static int
				3154	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3155	{
				3156	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3157
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3158	Py_UNICODE *s1 = str1->str;
				3159	Py_UNICODE *s2 = str2->str;
				3160
				3161	len1 = str1->length;
				3162	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3163
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3164	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3165	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3166	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3167
				3168	c1 = *s1++;
				3169	c2 = *s2++;
				3170	if (c1 > (1<<11) * 26)
				3171	c1 += utf16Fixup[c1>>11];
				3172	if (c2 > (1<<11) * 26)
				3173	c2 += utf16Fixup[c2>>11];
				3174
				3175	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3176	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3177	if (diff)
				3178	return (diff < 0) ? -1 : (diff != 0);
				3179	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3180	}
				3181
				3182	return (len1 < len2) ? -1 : (len1 != len2);
				3183	}
				3184
				3185	int PyUnicode_Compare(PyObject *left,
				3186	PyObject *right)
				3187	{
				3188	PyUnicodeObject u = NULL, v = NULL;
				3189	int result;
				3190
				3191	/* Coerce the two arguments */
				3192	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3193	if (u == NULL)
				3194	goto onError;
				3195	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3196	if (v == NULL)
				3197	goto onError;
				3198
				3199	/* Shortcut for emtpy or interned objects */
				3200	if (v == u) {
				3201	Py_DECREF(u);
				3202	Py_DECREF(v);
				3203	return 0;
				3204	}
				3205
				3206	result = unicode_compare(u, v);
				3207
				3208	Py_DECREF(u);
				3209	Py_DECREF(v);
				3210	return result;
				3211
				3212	onError:
				3213	Py_XDECREF(u);
				3214	Py_XDECREF(v);
				3215	return -1;
				3216	}
				3217
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3218	int PyUnicode_Contains(PyObject *container,
				3219	PyObject *element)
				3220	{
				3221	PyUnicodeObject u = NULL, v = NULL;
				3222	int result;
				3223	register const Py_UNICODE p, e;
				3224	register Py_UNICODE ch;
				3225
				3226	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3227	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3228	if (v == NULL) {
				3229	PyErr_SetString(PyExc_TypeError,
				3230	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3231	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3232	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3233	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3234	if (u == NULL) {
				3235	Py_DECREF(v);
				3236	goto onError;
				3237	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3238
				3239	/* Check v in u */
				3240	if (PyUnicode_GET_SIZE(v) != 1) {
				3241	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3242	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3243	goto onError;
				3244	}
				3245	ch = *PyUnicode_AS_UNICODE(v);
				3246	p = PyUnicode_AS_UNICODE(u);
				3247	e = p + PyUnicode_GET_SIZE(u);
				3248	result = 0;
				3249	while (p < e) {
				3250	if (*p++ == ch) {
				3251	result = 1;
				3252	break;
				3253	}
				3254	}
				3255
				3256	Py_DECREF(u);
				3257	Py_DECREF(v);
				3258	return result;
				3259
				3260	onError:
				3261	Py_XDECREF(u);
				3262	Py_XDECREF(v);
				3263	return -1;
				3264	}
				3265
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3266	/* Concat to string or Unicode object giving a new Unicode object. */
				3267
				3268	PyObject PyUnicode_Concat(PyObject left,
				3269	PyObject *right)
				3270	{
				3271	PyUnicodeObject u = NULL, v = NULL, *w;
				3272
				3273	/* Coerce the two arguments */
				3274	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3275	if (u == NULL)
				3276	goto onError;
				3277	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3278	if (v == NULL)
				3279	goto onError;
				3280
				3281	/* Shortcuts */
				3282	if (v == unicode_empty) {
				3283	Py_DECREF(v);
				3284	return (PyObject *)u;
				3285	}
				3286	if (u == unicode_empty) {
				3287	Py_DECREF(u);
				3288	return (PyObject *)v;
				3289	}
				3290
				3291	/* Concat the two Unicode strings */
				3292	w = _PyUnicode_New(u->length + v->length);
				3293	if (w == NULL)
				3294	goto onError;
				3295	Py_UNICODE_COPY(w->str, u->str, u->length);
				3296	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3297
				3298	Py_DECREF(u);
				3299	Py_DECREF(v);
				3300	return (PyObject *)w;
				3301
				3302	onError:
				3303	Py_XDECREF(u);
				3304	Py_XDECREF(v);
				3305	return NULL;
				3306	}
				3307
				3308	static char count__doc__[] =
				3309	"S.count(sub[, start[, end]]) -> int\n\
				3310	\n\
				3311	Return the number of occurrences of substring sub in Unicode string\n\
				3312	S[start:end]. Optional arguments start and end are\n\
				3313	interpreted as in slice notation.";
				3314
				3315	static PyObject *
				3316	unicode_count(PyUnicodeObject self, PyObject args)
				3317	{
				3318	PyUnicodeObject *substring;
				3319	int start = 0;
				3320	int end = INT_MAX;
				3321	PyObject *result;
				3322
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3323	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3324	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3325	return NULL;
				3326
				3327	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3328	(PyObject *)substring);
				3329	if (substring == NULL)
				3330	return NULL;
				3331
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3332	if (start < 0)
				3333	start += self->length;
				3334	if (start < 0)
				3335	start = 0;
				3336	if (end > self->length)
				3337	end = self->length;
				3338	if (end < 0)
				3339	end += self->length;
				3340	if (end < 0)
				3341	end = 0;
				3342
				3343	result = PyInt_FromLong((long) count(self, start, end, substring));
				3344
				3345	Py_DECREF(substring);
				3346	return result;
				3347	}
				3348
				3349	static char encode__doc__[] =
				3350	"S.encode([encoding[,errors]]) -> string\n\
				3351	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3352	Return an encoded string version of S. Default encoding is the current\n\
				3353	default string encoding. errors may be given to set a different error\n\
				3354	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3355	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3356
				3357	static PyObject *
				3358	unicode_encode(PyUnicodeObject self, PyObject args)
				3359	{
				3360	char *encoding = NULL;
				3361	char *errors = NULL;
				3362	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3363	return NULL;
				3364	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3365	}
				3366
				3367	static char expandtabs__doc__[] =
				3368	"S.expandtabs([tabsize]) -> unicode\n\
				3369	\n\
				3370	Return a copy of S where all tab characters are expanded using spaces.\n\
				3371	If tabsize is not given, a tab size of 8 characters is assumed.";
				3372
				3373	static PyObject*
				3374	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3375	{
				3376	Py_UNICODE *e;
				3377	Py_UNICODE *p;
				3378	Py_UNICODE *q;
				3379	int i, j;
				3380	PyUnicodeObject *u;
				3381	int tabsize = 8;
				3382
				3383	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3384	return NULL;
				3385
				3386	/* First pass: determine size of ouput string */
				3387	i = j = 0;
				3388	e = self->str + self->length;
				3389	for (p = self->str; p < e; p++)
				3390	if (*p == '\t') {
				3391	if (tabsize > 0)
				3392	j += tabsize - (j % tabsize);
				3393	}
				3394	else {
				3395	j++;
				3396	if (p == '\n' \|\| p == '\r') {
				3397	i += j;
				3398	j = 0;
				3399	}
				3400	}
				3401
				3402	/* Second pass: create output string and fill it */
				3403	u = _PyUnicode_New(i + j);
				3404	if (!u)
				3405	return NULL;
				3406
				3407	j = 0;
				3408	q = u->str;
				3409
				3410	for (p = self->str; p < e; p++)
				3411	if (*p == '\t') {
				3412	if (tabsize > 0) {
				3413	i = tabsize - (j % tabsize);
				3414	j += i;
				3415	while (i--)
				3416	*q++ = ' ';
				3417	}
				3418	}
				3419	else {
				3420	j++;
				3421	q++ = p;
				3422	if (p == '\n' \|\| p == '\r')
				3423	j = 0;
				3424	}
				3425
				3426	return (PyObject*) u;
				3427	}
				3428
				3429	static char find__doc__[] =
				3430	"S.find(sub [,start [,end]]) -> int\n\
				3431	\n\
				3432	Return the lowest index in S where substring sub is found,\n\
				3433	such that sub is contained within s[start,end]. Optional\n\
				3434	arguments start and end are interpreted as in slice notation.\n\
				3435	\n\
				3436	Return -1 on failure.";
				3437
				3438	static PyObject *
				3439	unicode_find(PyUnicodeObject self, PyObject args)
				3440	{
				3441	PyUnicodeObject *substring;
				3442	int start = 0;
				3443	int end = INT_MAX;
				3444	PyObject *result;
				3445
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3446	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3447	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3448	return NULL;
				3449	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3450	(PyObject *)substring);
				3451	if (substring == NULL)
				3452	return NULL;
				3453
				3454	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3455
				3456	Py_DECREF(substring);
				3457	return result;
				3458	}
				3459
				3460	static PyObject *
				3461	unicode_getitem(PyUnicodeObject *self, int index)
				3462	{
				3463	if (index < 0 \|\| index >= self->length) {
				3464	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3465	return NULL;
				3466	}
				3467
				3468	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3469	}
				3470
				3471	static long
				3472	unicode_hash(PyUnicodeObject *self)
				3473	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3474	/* Since Unicode objects compare equal to their ASCII string
				3475	counterparts, they should use the individual character values
				3476	as basis for their hash value. This is needed to assure that
				3477	strings and Unicode objects behave in the same way as
				3478	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3479
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3480	register int len;
				3481	register Py_UNICODE *p;
				3482	register long x;
				3483
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3484	if (self->hash != -1)
				3485	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3486	len = PyUnicode_GET_SIZE(self);
				3487	p = PyUnicode_AS_UNICODE(self);
				3488	x = *p << 7;
				3489	while (--len >= 0)
				3490	x = (1000003x) ^ p++;
				3491	x ^= PyUnicode_GET_SIZE(self);
				3492	if (x == -1)
				3493	x = -2;
				3494	self->hash = x;
				3495	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3496	}
				3497
				3498	static char index__doc__[] =
				3499	"S.index(sub [,start [,end]]) -> int\n\
				3500	\n\
				3501	Like S.find() but raise ValueError when the substring is not found.";
				3502
				3503	static PyObject *
				3504	unicode_index(PyUnicodeObject self, PyObject args)
				3505	{
				3506	int result;
				3507	PyUnicodeObject *substring;
				3508	int start = 0;
				3509	int end = INT_MAX;
				3510
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3511	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3512	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3513	return NULL;
				3514
				3515	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3516	(PyObject *)substring);
				3517	if (substring == NULL)
				3518	return NULL;
				3519
				3520	result = findstring(self, substring, start, end, 1);
				3521
				3522	Py_DECREF(substring);
				3523	if (result < 0) {
				3524	PyErr_SetString(PyExc_ValueError, "substring not found");
				3525	return NULL;
				3526	}
				3527	return PyInt_FromLong(result);
				3528	}
				3529
				3530	static char islower__doc__[] =
				3531	"S.islower() -> int\n\
				3532	\n\
				3533	Return 1 if all cased characters in S are lowercase and there is\n\
				3534	at least one cased character in S, 0 otherwise.";
				3535
				3536	static PyObject*
				3537	unicode_islower(PyUnicodeObject self, PyObject args)
				3538	{
				3539	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3540	register const Py_UNICODE *e;
				3541	int cased;
				3542
				3543	if (!PyArg_NoArgs(args))
				3544	return NULL;
				3545
				3546	/* Shortcut for single character strings */
				3547	if (PyUnicode_GET_SIZE(self) == 1)
				3548	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3549
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3550	/* Special case for empty strings */
				3551	if (PyString_GET_SIZE(self) == 0)
				3552	return PyInt_FromLong(0);
				3553
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3554	e = p + PyUnicode_GET_SIZE(self);
				3555	cased = 0;
				3556	for (; p < e; p++) {
				3557	register const Py_UNICODE ch = *p;
				3558
				3559	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3560	return PyInt_FromLong(0);
				3561	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3562	cased = 1;
				3563	}
				3564	return PyInt_FromLong(cased);
				3565	}
				3566
				3567	static char isupper__doc__[] =
				3568	"S.isupper() -> int\n\
				3569	\n\
				3570	Return 1 if all cased characters in S are uppercase and there is\n\
				3571	at least one cased character in S, 0 otherwise.";
				3572
				3573	static PyObject*
				3574	unicode_isupper(PyUnicodeObject self, PyObject args)
				3575	{
				3576	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3577	register const Py_UNICODE *e;
				3578	int cased;
				3579
				3580	if (!PyArg_NoArgs(args))
				3581	return NULL;
				3582
				3583	/* Shortcut for single character strings */
				3584	if (PyUnicode_GET_SIZE(self) == 1)
				3585	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3586
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3587	/* Special case for empty strings */
				3588	if (PyString_GET_SIZE(self) == 0)
				3589	return PyInt_FromLong(0);
				3590
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3591	e = p + PyUnicode_GET_SIZE(self);
				3592	cased = 0;
				3593	for (; p < e; p++) {
				3594	register const Py_UNICODE ch = *p;
				3595
				3596	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3597	return PyInt_FromLong(0);
				3598	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3599	cased = 1;
				3600	}
				3601	return PyInt_FromLong(cased);
				3602	}
				3603
				3604	static char istitle__doc__[] =
				3605	"S.istitle() -> int\n\
				3606	\n\
				3607	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3608	may only follow uncased characters and lowercase characters only cased\n\
				3609	ones. Return 0 otherwise.";
				3610
				3611	static PyObject*
				3612	unicode_istitle(PyUnicodeObject self, PyObject args)
				3613	{
				3614	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3615	register const Py_UNICODE *e;
				3616	int cased, previous_is_cased;
				3617
				3618	if (!PyArg_NoArgs(args))
				3619	return NULL;
				3620
				3621	/* Shortcut for single character strings */
				3622	if (PyUnicode_GET_SIZE(self) == 1)
				3623	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3624	(Py_UNICODE_ISUPPER(*p) != 0));
				3625
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3626	/* Special case for empty strings */
				3627	if (PyString_GET_SIZE(self) == 0)
				3628	return PyInt_FromLong(0);
				3629
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3630	e = p + PyUnicode_GET_SIZE(self);
				3631	cased = 0;
				3632	previous_is_cased = 0;
				3633	for (; p < e; p++) {
				3634	register const Py_UNICODE ch = *p;
				3635
				3636	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3637	if (previous_is_cased)
				3638	return PyInt_FromLong(0);
				3639	previous_is_cased = 1;
				3640	cased = 1;
				3641	}
				3642	else if (Py_UNICODE_ISLOWER(ch)) {
				3643	if (!previous_is_cased)
				3644	return PyInt_FromLong(0);
				3645	previous_is_cased = 1;
				3646	cased = 1;
				3647	}
				3648	else
				3649	previous_is_cased = 0;
				3650	}
				3651	return PyInt_FromLong(cased);
				3652	}
				3653
				3654	static char isspace__doc__[] =
				3655	"S.isspace() -> int\n\
				3656	\n\
				3657	Return 1 if there are only whitespace characters in S,\n\
				3658	0 otherwise.";
				3659
				3660	static PyObject*
				3661	unicode_isspace(PyUnicodeObject self, PyObject args)
				3662	{
				3663	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3664	register const Py_UNICODE *e;
				3665
				3666	if (!PyArg_NoArgs(args))
				3667	return NULL;
				3668
				3669	/* Shortcut for single character strings */
				3670	if (PyUnicode_GET_SIZE(self) == 1 &&
				3671	Py_UNICODE_ISSPACE(*p))
				3672	return PyInt_FromLong(1);
				3673
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3674	/* Special case for empty strings */
				3675	if (PyString_GET_SIZE(self) == 0)
				3676	return PyInt_FromLong(0);
				3677
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3678	e = p + PyUnicode_GET_SIZE(self);
				3679	for (; p < e; p++) {
				3680	if (!Py_UNICODE_ISSPACE(*p))
				3681	return PyInt_FromLong(0);
				3682	}
				3683	return PyInt_FromLong(1);
				3684	}
				3685
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3686	static char isalpha__doc__[] =
				3687	"S.isalpha() -> int\n\
				3688	\n\
				3689	Return 1 if all characters in S are alphabetic\n\
				3690	and there is at least one character in S, 0 otherwise.";
				3691
				3692	static PyObject*
				3693	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3694	{
				3695	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3696	register const Py_UNICODE *e;
				3697
				3698	if (!PyArg_NoArgs(args))
				3699	return NULL;
				3700
				3701	/* Shortcut for single character strings */
				3702	if (PyUnicode_GET_SIZE(self) == 1 &&
				3703	Py_UNICODE_ISALPHA(*p))
				3704	return PyInt_FromLong(1);
				3705
				3706	/* Special case for empty strings */
				3707	if (PyString_GET_SIZE(self) == 0)
				3708	return PyInt_FromLong(0);
				3709
				3710	e = p + PyUnicode_GET_SIZE(self);
				3711	for (; p < e; p++) {
				3712	if (!Py_UNICODE_ISALPHA(*p))
				3713	return PyInt_FromLong(0);
				3714	}
				3715	return PyInt_FromLong(1);
				3716	}
				3717
				3718	static char isalnum__doc__[] =
				3719	"S.isalnum() -> int\n\
				3720	\n\
				3721	Return 1 if all characters in S are alphanumeric\n\
				3722	and there is at least one character in S, 0 otherwise.";
				3723
				3724	static PyObject*
				3725	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3726	{
				3727	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3728	register const Py_UNICODE *e;
				3729
				3730	if (!PyArg_NoArgs(args))
				3731	return NULL;
				3732
				3733	/* Shortcut for single character strings */
				3734	if (PyUnicode_GET_SIZE(self) == 1 &&
				3735	Py_UNICODE_ISALNUM(*p))
				3736	return PyInt_FromLong(1);
				3737
				3738	/* Special case for empty strings */
				3739	if (PyString_GET_SIZE(self) == 0)
				3740	return PyInt_FromLong(0);
				3741
				3742	e = p + PyUnicode_GET_SIZE(self);
				3743	for (; p < e; p++) {
				3744	if (!Py_UNICODE_ISALNUM(*p))
				3745	return PyInt_FromLong(0);
				3746	}
				3747	return PyInt_FromLong(1);
				3748	}
				3749
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3750	static char isdecimal__doc__[] =
				3751	"S.isdecimal() -> int\n\
				3752	\n\
				3753	Return 1 if there are only decimal characters in S,\n\
				3754	0 otherwise.";
				3755
				3756	static PyObject*
				3757	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3758	{
				3759	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3760	register const Py_UNICODE *e;
				3761
				3762	if (!PyArg_NoArgs(args))
				3763	return NULL;
				3764
				3765	/* Shortcut for single character strings */
				3766	if (PyUnicode_GET_SIZE(self) == 1 &&
				3767	Py_UNICODE_ISDECIMAL(*p))
				3768	return PyInt_FromLong(1);
				3769
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3770	/* Special case for empty strings */
				3771	if (PyString_GET_SIZE(self) == 0)
				3772	return PyInt_FromLong(0);
				3773
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3774	e = p + PyUnicode_GET_SIZE(self);
				3775	for (; p < e; p++) {
				3776	if (!Py_UNICODE_ISDECIMAL(*p))
				3777	return PyInt_FromLong(0);
				3778	}
				3779	return PyInt_FromLong(1);
				3780	}
				3781
				3782	static char isdigit__doc__[] =
				3783	"S.isdigit() -> int\n\
				3784	\n\
				3785	Return 1 if there are only digit characters in S,\n\
				3786	0 otherwise.";
				3787
				3788	static PyObject*
				3789	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3790	{
				3791	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3792	register const Py_UNICODE *e;
				3793
				3794	if (!PyArg_NoArgs(args))
				3795	return NULL;
				3796
				3797	/* Shortcut for single character strings */
				3798	if (PyUnicode_GET_SIZE(self) == 1 &&
				3799	Py_UNICODE_ISDIGIT(*p))
				3800	return PyInt_FromLong(1);
				3801
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3802	/* Special case for empty strings */
				3803	if (PyString_GET_SIZE(self) == 0)
				3804	return PyInt_FromLong(0);
				3805
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3806	e = p + PyUnicode_GET_SIZE(self);
				3807	for (; p < e; p++) {
				3808	if (!Py_UNICODE_ISDIGIT(*p))
				3809	return PyInt_FromLong(0);
				3810	}
				3811	return PyInt_FromLong(1);
				3812	}
				3813
				3814	static char isnumeric__doc__[] =
				3815	"S.isnumeric() -> int\n\
				3816	\n\
				3817	Return 1 if there are only numeric characters in S,\n\
				3818	0 otherwise.";
				3819
				3820	static PyObject*
				3821	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3822	{
				3823	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3824	register const Py_UNICODE *e;
				3825
				3826	if (!PyArg_NoArgs(args))
				3827	return NULL;
				3828
				3829	/* Shortcut for single character strings */
				3830	if (PyUnicode_GET_SIZE(self) == 1 &&
				3831	Py_UNICODE_ISNUMERIC(*p))
				3832	return PyInt_FromLong(1);
				3833
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3834	/* Special case for empty strings */
				3835	if (PyString_GET_SIZE(self) == 0)
				3836	return PyInt_FromLong(0);
				3837
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3838	e = p + PyUnicode_GET_SIZE(self);
				3839	for (; p < e; p++) {
				3840	if (!Py_UNICODE_ISNUMERIC(*p))
				3841	return PyInt_FromLong(0);
				3842	}
				3843	return PyInt_FromLong(1);
				3844	}
				3845
				3846	static char join__doc__[] =
				3847	"S.join(sequence) -> unicode\n\
				3848	\n\
				3849	Return a string which is the concatenation of the strings in the\n\
				3850	sequence. The separator between elements is S.";
				3851
				3852	static PyObject*
				3853	unicode_join(PyUnicodeObject self, PyObject args)
				3854	{
				3855	PyObject *data;
				3856	if (!PyArg_ParseTuple(args, "O:join", &data))
				3857	return NULL;
				3858
				3859	return PyUnicode_Join((PyObject *)self, data);
				3860	}
				3861
				3862	static int
				3863	unicode_length(PyUnicodeObject *self)
				3864	{
				3865	return self->length;
				3866	}
				3867
				3868	static char ljust__doc__[] =
				3869	"S.ljust(width) -> unicode\n\
				3870	\n\
				3871	Return S left justified in a Unicode string of length width. Padding is\n\
				3872	done using spaces.";
				3873
				3874	static PyObject *
				3875	unicode_ljust(PyUnicodeObject self, PyObject args)
				3876	{
				3877	int width;
				3878	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3879	return NULL;
				3880
				3881	if (self->length >= width) {
				3882	Py_INCREF(self);
				3883	return (PyObject*) self;
				3884	}
				3885
				3886	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3887	}
				3888
				3889	static char lower__doc__[] =
				3890	"S.lower() -> unicode\n\
				3891	\n\
				3892	Return a copy of the string S converted to lowercase.";
				3893
				3894	static PyObject*
				3895	unicode_lower(PyUnicodeObject self, PyObject args)
				3896	{
				3897	if (!PyArg_NoArgs(args))
				3898	return NULL;
				3899	return fixup(self, fixlower);
				3900	}
				3901
				3902	static char lstrip__doc__[] =
				3903	"S.lstrip() -> unicode\n\
				3904	\n\
				3905	Return a copy of the string S with leading whitespace removed.";
				3906
				3907	static PyObject *
				3908	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3909	{
				3910	if (!PyArg_NoArgs(args))
				3911	return NULL;
				3912	return strip(self, 1, 0);
				3913	}
				3914
				3915	static PyObject*
				3916	unicode_repeat(PyUnicodeObject *str, int len)
				3917	{
				3918	PyUnicodeObject *u;
				3919	Py_UNICODE *p;
				3920
				3921	if (len < 0)
				3922	len = 0;
				3923
				3924	if (len == 1) {
				3925	/* no repeat, return original string */
				3926	Py_INCREF(str);
				3927	return (PyObject*) str;
				3928	}
				3929
				3930	u = _PyUnicode_New(len * str->length);
				3931	if (!u)
				3932	return NULL;
				3933
				3934	p = u->str;
				3935
				3936	while (len-- > 0) {
				3937	Py_UNICODE_COPY(p, str->str, str->length);
				3938	p += str->length;
				3939	}
				3940
				3941	return (PyObject*) u;
				3942	}
				3943
				3944	PyObject PyUnicode_Replace(PyObject obj,
				3945	PyObject *subobj,
				3946	PyObject *replobj,
				3947	int maxcount)
				3948	{
				3949	PyObject *self;
				3950	PyObject *str1;
				3951	PyObject *str2;
				3952	PyObject *result;
				3953
				3954	self = PyUnicode_FromObject(obj);
				3955	if (self == NULL)
				3956	return NULL;
				3957	str1 = PyUnicode_FromObject(subobj);
				3958	if (str1 == NULL) {
				3959	Py_DECREF(self);
				3960	return NULL;
				3961	}
				3962	str2 = PyUnicode_FromObject(replobj);
				3963	if (str2 == NULL) {
				3964	Py_DECREF(self);
				3965	Py_DECREF(str1);
				3966	return NULL;
				3967	}
				3968	result = replace((PyUnicodeObject *)self,
				3969	(PyUnicodeObject *)str1,
				3970	(PyUnicodeObject *)str2,
				3971	maxcount);
				3972	Py_DECREF(self);
				3973	Py_DECREF(str1);
				3974	Py_DECREF(str2);
				3975	return result;
				3976	}
				3977
				3978	static char replace__doc__[] =
				3979	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3980	\n\
				3981	Return a copy of S with all occurrences of substring\n\
				3982	old replaced by new. If the optional argument maxsplit is\n\
				3983	given, only the first maxsplit occurrences are replaced.";
				3984
				3985	static PyObject*
				3986	unicode_replace(PyUnicodeObject self, PyObject args)
				3987	{
				3988	PyUnicodeObject *str1;
				3989	PyUnicodeObject *str2;
				3990	int maxcount = -1;
				3991	PyObject *result;
				3992
				3993	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3994	return NULL;
				3995	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3996	if (str1 == NULL)
				3997	return NULL;
				3998	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3999	if (str2 == NULL)
				4000	return NULL;
				4001
				4002	result = replace(self, str1, str2, maxcount);
				4003
				4004	Py_DECREF(str1);
				4005	Py_DECREF(str2);
				4006	return result;
				4007	}
				4008
				4009	static
				4010	PyObject unicode_repr(PyObject unicode)
				4011	{
				4012	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4013	PyUnicode_GET_SIZE(unicode),
				4014	1);
				4015	}
				4016
				4017	static char rfind__doc__[] =
				4018	"S.rfind(sub [,start [,end]]) -> int\n\
				4019	\n\
				4020	Return the highest index in S where substring sub is found,\n\
				4021	such that sub is contained within s[start,end]. Optional\n\
				4022	arguments start and end are interpreted as in slice notation.\n\
				4023	\n\
				4024	Return -1 on failure.";
				4025
				4026	static PyObject *
				4027	unicode_rfind(PyUnicodeObject self, PyObject args)
				4028	{
				4029	PyUnicodeObject *substring;
				4030	int start = 0;
				4031	int end = INT_MAX;
				4032	PyObject *result;
				4033
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4034	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4035	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4036	return NULL;
				4037	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4038	(PyObject *)substring);
				4039	if (substring == NULL)
				4040	return NULL;
				4041
				4042	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4043
				4044	Py_DECREF(substring);
				4045	return result;
				4046	}
				4047
				4048	static char rindex__doc__[] =
				4049	"S.rindex(sub [,start [,end]]) -> int\n\
				4050	\n\
				4051	Like S.rfind() but raise ValueError when the substring is not found.";
				4052
				4053	static PyObject *
				4054	unicode_rindex(PyUnicodeObject self, PyObject args)
				4055	{
				4056	int result;
				4057	PyUnicodeObject *substring;
				4058	int start = 0;
				4059	int end = INT_MAX;
				4060
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4061	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4062	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4063	return NULL;
				4064	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4065	(PyObject *)substring);
				4066	if (substring == NULL)
				4067	return NULL;
				4068
				4069	result = findstring(self, substring, start, end, -1);
				4070
				4071	Py_DECREF(substring);
				4072	if (result < 0) {
				4073	PyErr_SetString(PyExc_ValueError, "substring not found");
				4074	return NULL;
				4075	}
				4076	return PyInt_FromLong(result);
				4077	}
				4078
				4079	static char rjust__doc__[] =
				4080	"S.rjust(width) -> unicode\n\
				4081	\n\
				4082	Return S right justified in a Unicode string of length width. Padding is\n\
				4083	done using spaces.";
				4084
				4085	static PyObject *
				4086	unicode_rjust(PyUnicodeObject self, PyObject args)
				4087	{
				4088	int width;
				4089	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4090	return NULL;
				4091
				4092	if (self->length >= width) {
				4093	Py_INCREF(self);
				4094	return (PyObject*) self;
				4095	}
				4096
				4097	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4098	}
				4099
				4100	static char rstrip__doc__[] =
				4101	"S.rstrip() -> unicode\n\
				4102	\n\
				4103	Return a copy of the string S with trailing whitespace removed.";
				4104
				4105	static PyObject *
				4106	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4107	{
				4108	if (!PyArg_NoArgs(args))
				4109	return NULL;
				4110	return strip(self, 0, 1);
				4111	}
				4112
				4113	static PyObject*
				4114	unicode_slice(PyUnicodeObject *self, int start, int end)
				4115	{
				4116	/* standard clamping */
				4117	if (start < 0)
				4118	start = 0;
				4119	if (end < 0)
				4120	end = 0;
				4121	if (end > self->length)
				4122	end = self->length;
				4123	if (start == 0 && end == self->length) {
				4124	/* full slice, return original string */
				4125	Py_INCREF(self);
				4126	return (PyObject*) self;
				4127	}
				4128	if (start > end)
				4129	start = end;
				4130	/* copy slice */
				4131	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4132	end - start);
				4133	}
				4134
				4135	PyObject PyUnicode_Split(PyObject s,
				4136	PyObject *sep,
				4137	int maxsplit)
				4138	{
				4139	PyObject *result;
				4140
				4141	s = PyUnicode_FromObject(s);
				4142	if (s == NULL)
				4143	return NULL;
				4144	if (sep != NULL) {
				4145	sep = PyUnicode_FromObject(sep);
				4146	if (sep == NULL) {
				4147	Py_DECREF(s);
				4148	return NULL;
				4149	}
				4150	}
				4151
				4152	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4153
				4154	Py_DECREF(s);
				4155	Py_XDECREF(sep);
				4156	return result;
				4157	}
				4158
				4159	static char split__doc__[] =
				4160	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4161	\n\
				4162	Return a list of the words in S, using sep as the\n\
				4163	delimiter string. If maxsplit is given, at most maxsplit\n\
				4164	splits are done. If sep is not specified, any whitespace string\n\
				4165	is a separator.";
				4166
				4167	static PyObject*
				4168	unicode_split(PyUnicodeObject self, PyObject args)
				4169	{
				4170	PyObject *substring = Py_None;
				4171	int maxcount = -1;
				4172
				4173	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4174	return NULL;
				4175
				4176	if (substring == Py_None)
				4177	return split(self, NULL, maxcount);
				4178	else if (PyUnicode_Check(substring))
				4179	return split(self, (PyUnicodeObject *)substring, maxcount);
				4180	else
				4181	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4182	}
				4183
				4184	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4185	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4186	\n\
				4187	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4188	Line breaks are not included in the resulting list unless keepends\n\
				4189	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4190
				4191	static PyObject*
				4192	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4193	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4194	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4195
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4196	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4197	return NULL;
				4198
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4199	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4200	}
				4201
				4202	static
				4203	PyObject unicode_str(PyUnicodeObject self)
				4204	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4205	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4206	}
				4207
				4208	static char strip__doc__[] =
				4209	"S.strip() -> unicode\n\
				4210	\n\
				4211	Return a copy of S with leading and trailing whitespace removed.";
				4212
				4213	static PyObject *
				4214	unicode_strip(PyUnicodeObject self, PyObject args)
				4215	{
				4216	if (!PyArg_NoArgs(args))
				4217	return NULL;
				4218	return strip(self, 1, 1);
				4219	}
				4220
				4221	static char swapcase__doc__[] =
				4222	"S.swapcase() -> unicode\n\
				4223	\n\
				4224	Return a copy of S with uppercase characters converted to lowercase\n\
				4225	and vice versa.";
				4226
				4227	static PyObject*
				4228	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4229	{
				4230	if (!PyArg_NoArgs(args))
				4231	return NULL;
				4232	return fixup(self, fixswapcase);
				4233	}
				4234
				4235	static char translate__doc__[] =
				4236	"S.translate(table) -> unicode\n\
				4237	\n\
				4238	Return a copy of the string S, where all characters have been mapped\n\
				4239	through the given translation table, which must be a mapping of\n\
				4240	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4241	are left untouched. Characters mapped to None are deleted.";
				4242
				4243	static PyObject*
				4244	unicode_translate(PyUnicodeObject self, PyObject args)
				4245	{
				4246	PyObject *table;
				4247
				4248	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4249	return NULL;
				4250	return PyUnicode_TranslateCharmap(self->str,
				4251	self->length,
				4252	table,
				4253	"ignore");
				4254	}
				4255
				4256	static char upper__doc__[] =
				4257	"S.upper() -> unicode\n\
				4258	\n\
				4259	Return a copy of S converted to uppercase.";
				4260
				4261	static PyObject*
				4262	unicode_upper(PyUnicodeObject self, PyObject args)
				4263	{
				4264	if (!PyArg_NoArgs(args))
				4265	return NULL;
				4266	return fixup(self, fixupper);
				4267	}
				4268
				4269	#if 0
				4270	static char zfill__doc__[] =
				4271	"S.zfill(width) -> unicode\n\
				4272	\n\
				4273	Pad a numeric string x with zeros on the left, to fill a field\n\
				4274	of the specified width. The string x is never truncated.";
				4275
				4276	static PyObject *
				4277	unicode_zfill(PyUnicodeObject self, PyObject args)
				4278	{
				4279	int fill;
				4280	PyUnicodeObject *u;
				4281
				4282	int width;
				4283	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4284	return NULL;
				4285
				4286	if (self->length >= width) {
				4287	Py_INCREF(self);
				4288	return (PyObject*) self;
				4289	}
				4290
				4291	fill = width - self->length;
				4292
				4293	u = pad(self, fill, 0, '0');
				4294
				4295	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4296	/* move sign to beginning of string */
				4297	u->str[0] = u->str[fill];
				4298	u->str[fill] = '0';
				4299	}
				4300
				4301	return (PyObject*) u;
				4302	}
				4303	#endif
				4304
				4305	#if 0
				4306	static PyObject*
				4307	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4308	{
				4309	if (!PyArg_NoArgs(args))
				4310	return NULL;
				4311	return PyInt_FromLong(unicode_freelist_size);
				4312	}
				4313	#endif
				4314
				4315	static char startswith__doc__[] =
				4316	"S.startswith(prefix[, start[, end]]) -> int\n\
				4317	\n\
				4318	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4319	optional start, test S beginning at that position. With optional end, stop\n\
				4320	comparing S at that position.";
				4321
				4322	static PyObject *
				4323	unicode_startswith(PyUnicodeObject *self,
				4324	PyObject *args)
				4325	{
				4326	PyUnicodeObject *substring;
				4327	int start = 0;
				4328	int end = INT_MAX;
				4329	PyObject *result;
				4330
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4331	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4332	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4333	return NULL;
				4334	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4335	(PyObject *)substring);
				4336	if (substring == NULL)
				4337	return NULL;
				4338
				4339	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4340
				4341	Py_DECREF(substring);
				4342	return result;
				4343	}
				4344
				4345
				4346	static char endswith__doc__[] =
				4347	"S.endswith(suffix[, start[, end]]) -> int\n\
				4348	\n\
				4349	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4350	optional start, test S beginning at that position. With optional end, stop\n\
				4351	comparing S at that position.";
				4352
				4353	static PyObject *
				4354	unicode_endswith(PyUnicodeObject *self,
				4355	PyObject *args)
				4356	{
				4357	PyUnicodeObject *substring;
				4358	int start = 0;
				4359	int end = INT_MAX;
				4360	PyObject *result;
				4361
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4362	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4363	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4364	return NULL;
				4365	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4366	(PyObject *)substring);
				4367	if (substring == NULL)
				4368	return NULL;
				4369
				4370	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4371
				4372	Py_DECREF(substring);
				4373	return result;
				4374	}
				4375
				4376
				4377	static PyMethodDef unicode_methods[] = {
				4378
				4379	/* Order is according to common usage: often used methods should
				4380	appear first, since lookup is done sequentially. */
				4381
				4382	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4383	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4384	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4385	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4386	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4387	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4388	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4389	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4390	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4391	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4392	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4393	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4394	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4395	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4396	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4397	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4398	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4399	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4400	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4401	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4402	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4403	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4404	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4405	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4406	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4407	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4408	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4409	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4410	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4411	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4412	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4413	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4414	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4415	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4416	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4417	#if 0
				4418	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4419	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4420	#endif
				4421
				4422	#if 0
				4423	/* This one is just used for debugging the implementation. */
				4424	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4425	#endif
				4426
				4427	{NULL, NULL}
				4428	};
				4429
				4430	static PyObject *
				4431	unicode_getattr(PyUnicodeObject self, char name)
				4432	{
				4433	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4434	}
				4435
				4436	static PySequenceMethods unicode_as_sequence = {
				4437	(inquiry) unicode_length, /* sq_length */
				4438	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4439	(intargfunc) unicode_repeat, /* sq_repeat */
				4440	(intargfunc) unicode_getitem, /* sq_item */
				4441	(intintargfunc) unicode_slice, /* sq_slice */
				4442	0, /* sq_ass_item */
				4443	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4444	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4445	};
				4446
				4447	static int
				4448	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4449	int index,
				4450	const void **ptr)
				4451	{
				4452	if (index != 0) {
				4453	PyErr_SetString(PyExc_SystemError,
				4454	"accessing non-existent unicode segment");
				4455	return -1;
				4456	}
				4457	ptr = (void ) self->str;
				4458	return PyUnicode_GET_DATA_SIZE(self);
				4459	}
				4460
				4461	static int
				4462	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4463	const void **ptr)
				4464	{
				4465	PyErr_SetString(PyExc_TypeError,
				4466	"cannot use unicode as modifyable buffer");
				4467	return -1;
				4468	}
				4469
				4470	static int
				4471	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4472	int *lenp)
				4473	{
				4474	if (lenp)
				4475	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4476	return 1;
				4477	}
				4478
				4479	static int
				4480	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4481	int index,
				4482	const void **ptr)
				4483	{
				4484	PyObject *str;
				4485
				4486	if (index != 0) {
				4487	PyErr_SetString(PyExc_SystemError,
				4488	"accessing non-existent unicode segment");
				4489	return -1;
				4490	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4491	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4492	if (str == NULL)
				4493	return -1;
				4494	ptr = (void ) PyString_AS_STRING(str);
				4495	return PyString_GET_SIZE(str);
				4496	}
				4497
				4498	/* Helpers for PyUnicode_Format() */
				4499
				4500	static PyObject *
				4501	getnextarg(args, arglen, p_argidx)
				4502	PyObject *args;
				4503	int arglen;
				4504	int *p_argidx;
				4505	{
				4506	int argidx = *p_argidx;
				4507	if (argidx < arglen) {
				4508	(*p_argidx)++;
				4509	if (arglen < 0)
				4510	return args;
				4511	else
				4512	return PyTuple_GetItem(args, argidx);
				4513	}
				4514	PyErr_SetString(PyExc_TypeError,
				4515	"not enough arguments for format string");
				4516	return NULL;
				4517	}
				4518
				4519	#define F_LJUST (1<<0)
				4520	#define F_SIGN (1<<1)
				4521	#define F_BLANK (1<<2)
				4522	#define F_ALT (1<<3)
				4523	#define F_ZERO (1<<4)
				4524
				4525	static
				4526	#ifdef HAVE_STDARG_PROTOTYPES
				4527	int usprintf(register Py_UNICODE buffer, char format, ...)
				4528	#else
				4529	int usprintf(va_alist) va_dcl
				4530	#endif
				4531	{
				4532	register int i;
				4533	int len;
				4534	va_list va;
				4535	char *charbuffer;
				4536	#ifdef HAVE_STDARG_PROTOTYPES
				4537	va_start(va, format);
				4538	#else
				4539	Py_UNICODE *args;
				4540	char *format;
				4541
				4542	va_start(va);
				4543	buffer = va_arg(va, Py_UNICODE *);
				4544	format = va_arg(va, char *);
				4545	#endif
				4546
				4547	/* First, format the string as char array, then expand to Py_UNICODE
				4548	array. */
				4549	charbuffer = (char *)buffer;
				4550	len = vsprintf(charbuffer, format, va);
				4551	for (i = len - 1; i >= 0; i--)
				4552	buffer[i] = (Py_UNICODE) charbuffer[i];
				4553
				4554	va_end(va);
				4555	return len;
				4556	}
				4557
				4558	static int
				4559	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4560	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4561	int flags,
				4562	int prec,
				4563	int type,
				4564	PyObject *v)
				4565	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4566	/* fmt = '%#.' + `prec` + `type`
				4567	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4568	char fmt[20];
				4569	double x;
				4570
				4571	x = PyFloat_AsDouble(v);
				4572	if (x == -1.0 && PyErr_Occurred())
				4573	return -1;
				4574	if (prec < 0)
				4575	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4576	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4577	type = 'g';
				4578	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4579	/* worst case length calc to ensure no buffer overrun:
				4580	fmt = %#.<prec>g
				4581	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4582	for any double rep.)
				4583	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4584	If prec=0 the effective precision is 1 (the leading digit is
				4585	always given), therefore increase by one to 10+prec. */
				4586	if (buflen <= (size_t)10 + (size_t)prec) {
				4587	PyErr_SetString(PyExc_OverflowError,
				4588	"formatted float is too long (precision too long?)");
				4589	return -1;
				4590	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4591	return usprintf(buf, fmt, x);
				4592	}
				4593
				4594	static int
				4595	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4596	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4597	int flags,
				4598	int prec,
				4599	int type,
				4600	PyObject *v)
				4601	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4602	/* fmt = '%#.' + `prec` + 'l' + `type`
				4603	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4604	char fmt[20];
				4605	long x;
				4606
				4607	x = PyInt_AsLong(v);
				4608	if (x == -1 && PyErr_Occurred())
				4609	return -1;
				4610	if (prec < 0)
				4611	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4612	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4613	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4614	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4615	PyErr_SetString(PyExc_OverflowError,
				4616	"formatted integer is too long (precision too long?)");
				4617	return -1;
				4618	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4619	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4620	return usprintf(buf, fmt, x);
				4621	}
				4622
				4623	static int
				4624	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4625	size_t buflen,
				4626	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4627	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4628	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4629	if (PyUnicode_Check(v)) {
				4630	if (PyUnicode_GET_SIZE(v) != 1)
				4631	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4632	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4633	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4634
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4635	else if (PyString_Check(v)) {
				4636	if (PyString_GET_SIZE(v) != 1)
				4637	goto onError;
				4638	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4639	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4640
				4641	else {
				4642	/* Integer input truncated to a character */
				4643	long x;
				4644	x = PyInt_AsLong(v);
				4645	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4646	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4647	buf[0] = (char) x;
				4648	}
				4649	buf[1] = '\0';
				4650	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4651
				4652	onError:
				4653	PyErr_SetString(PyExc_TypeError,
				4654	"%c requires int or char");
				4655	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4656	}
				4657
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it stays a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
				4667
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4668	PyObject PyUnicode_Format(PyObject format,
				4669	PyObject *args)
				4670	{
				4671	Py_UNICODE fmt, res;
				4672	int fmtcnt, rescnt, reslen, arglen, argidx;
				4673	int args_owned = 0;
				4674	PyUnicodeObject *result = NULL;
				4675	PyObject *dict = NULL;
				4676	PyObject *uformat;
				4677
				4678	if (format == NULL \|\| args == NULL) {
				4679	PyErr_BadInternalCall();
				4680	return NULL;
				4681	}
				4682	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4683	if (uformat == NULL)
				4684	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4685	fmt = PyUnicode_AS_UNICODE(uformat);
				4686	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4687
				4688	reslen = rescnt = fmtcnt + 100;
				4689	result = _PyUnicode_New(reslen);
				4690	if (result == NULL)
				4691	goto onError;
				4692	res = PyUnicode_AS_UNICODE(result);
				4693
				4694	if (PyTuple_Check(args)) {
				4695	arglen = PyTuple_Size(args);
				4696	argidx = 0;
				4697	}
				4698	else {
				4699	arglen = -1;
				4700	argidx = -2;
				4701	}
				4702	if (args->ob_type->tp_as_mapping)
				4703	dict = args;
				4704
				4705	while (--fmtcnt >= 0) {
				4706	if (*fmt != '%') {
				4707	if (--rescnt < 0) {
				4708	rescnt = fmtcnt + 100;
				4709	reslen += rescnt;
				4710	if (_PyUnicode_Resize(result, reslen) < 0)
				4711	return NULL;
				4712	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4713	--rescnt;
				4714	}
				4715	res++ = fmt++;
				4716	}
				4717	else {
				4718	/* Got a format specifier */
				4719	int flags = 0;
				4720	int width = -1;
				4721	int prec = -1;
				4722	int size = 0;
				4723	Py_UNICODE c = '\0';
				4724	Py_UNICODE fill;
				4725	PyObject *v = NULL;
				4726	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4727	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4728	Py_UNICODE sign;
				4729	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4730	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4731
				4732	fmt++;
				4733	if (*fmt == '(') {
				4734	Py_UNICODE *keystart;
				4735	int keylen;
				4736	PyObject *key;
				4737	int pcount = 1;
				4738
				4739	if (dict == NULL) {
				4740	PyErr_SetString(PyExc_TypeError,
				4741	"format requires a mapping");
				4742	goto onError;
				4743	}
				4744	++fmt;
				4745	--fmtcnt;
				4746	keystart = fmt;
				4747	/* Skip over balanced parentheses */
				4748	while (pcount > 0 && --fmtcnt >= 0) {
				4749	if (*fmt == ')')
				4750	--pcount;
				4751	else if (*fmt == '(')
				4752	++pcount;
				4753	fmt++;
				4754	}
				4755	keylen = fmt - keystart - 1;
				4756	if (fmtcnt < 0 \|\| pcount > 0) {
				4757	PyErr_SetString(PyExc_ValueError,
				4758	"incomplete format key");
				4759	goto onError;
				4760	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4761	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4762	then looked up since Python uses strings to hold
				4763	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4764	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4765	key = PyUnicode_EncodeUTF8(keystart,
				4766	keylen,
				4767	NULL);
				4768	if (key == NULL)
				4769	goto onError;
				4770	if (args_owned) {
				4771	Py_DECREF(args);
				4772	args_owned = 0;
				4773	}
				4774	args = PyObject_GetItem(dict, key);
				4775	Py_DECREF(key);
				4776	if (args == NULL) {
				4777	goto onError;
				4778	}
				4779	args_owned = 1;
				4780	arglen = -1;
				4781	argidx = -2;
				4782	}
				4783	while (--fmtcnt >= 0) {
				4784	switch (c = *fmt++) {
				4785	case '-': flags \|= F_LJUST; continue;
				4786	case '+': flags \|= F_SIGN; continue;
				4787	case ' ': flags \|= F_BLANK; continue;
				4788	case '#': flags \|= F_ALT; continue;
				4789	case '0': flags \|= F_ZERO; continue;
				4790	}
				4791	break;
				4792	}
				4793	if (c == '*') {
				4794	v = getnextarg(args, arglen, &argidx);
				4795	if (v == NULL)
				4796	goto onError;
				4797	if (!PyInt_Check(v)) {
				4798	PyErr_SetString(PyExc_TypeError,
				4799	"* wants int");
				4800	goto onError;
				4801	}
				4802	width = PyInt_AsLong(v);
				4803	if (width < 0) {
				4804	flags \|= F_LJUST;
				4805	width = -width;
				4806	}
				4807	if (--fmtcnt >= 0)
				4808	c = *fmt++;
				4809	}
				4810	else if (c >= '0' && c <= '9') {
				4811	width = c - '0';
				4812	while (--fmtcnt >= 0) {
				4813	c = *fmt++;
				4814	if (c < '0' \|\| c > '9')
				4815	break;
				4816	if ((width*10) / 10 != width) {
				4817	PyErr_SetString(PyExc_ValueError,
				4818	"width too big");
				4819	goto onError;
				4820	}
				4821	width = width*10 + (c - '0');
				4822	}
				4823	}
				4824	if (c == '.') {
				4825	prec = 0;
				4826	if (--fmtcnt >= 0)
				4827	c = *fmt++;
				4828	if (c == '*') {
				4829	v = getnextarg(args, arglen, &argidx);
				4830	if (v == NULL)
				4831	goto onError;
				4832	if (!PyInt_Check(v)) {
				4833	PyErr_SetString(PyExc_TypeError,
				4834	"* wants int");
				4835	goto onError;
				4836	}
				4837	prec = PyInt_AsLong(v);
				4838	if (prec < 0)
				4839	prec = 0;
				4840	if (--fmtcnt >= 0)
				4841	c = *fmt++;
				4842	}
				4843	else if (c >= '0' && c <= '9') {
				4844	prec = c - '0';
				4845	while (--fmtcnt >= 0) {
				4846	c = Py_CHARMASK(*fmt++);
				4847	if (c < '0' \|\| c > '9')
				4848	break;
				4849	if ((prec*10) / 10 != prec) {
				4850	PyErr_SetString(PyExc_ValueError,
				4851	"prec too big");
				4852	goto onError;
				4853	}
				4854	prec = prec*10 + (c - '0');
				4855	}
				4856	}
				4857	} /* prec */
				4858	if (fmtcnt >= 0) {
				4859	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4860	size = c;
				4861	if (--fmtcnt >= 0)
				4862	c = *fmt++;
				4863	}
				4864	}
				4865	if (fmtcnt < 0) {
				4866	PyErr_SetString(PyExc_ValueError,
				4867	"incomplete format");
				4868	goto onError;
				4869	}
				4870	if (c != '%') {
				4871	v = getnextarg(args, arglen, &argidx);
				4872	if (v == NULL)
				4873	goto onError;
				4874	}
				4875	sign = 0;
				4876	fill = ' ';
				4877	switch (c) {
				4878
				4879	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4880	pbuf = formatbuf;
				4881	/* presume that buffer length is at least 1 */
				4882	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4883	len = 1;
				4884	break;
				4885
				4886	case 's':
				4887	case 'r':
				4888	if (PyUnicode_Check(v) && c == 's') {
				4889	temp = v;
				4890	Py_INCREF(temp);
				4891	}
				4892	else {
				4893	PyObject *unicode;
				4894	if (c == 's')
				4895	temp = PyObject_Str(v);
				4896	else
				4897	temp = PyObject_Repr(v);
				4898	if (temp == NULL)
				4899	goto onError;
				4900	if (!PyString_Check(temp)) {
				4901	/* XXX Note: this should never happen, since
				4902	PyObject_Repr() and PyObject_Str() assure
				4903	this */
				4904	Py_DECREF(temp);
				4905	PyErr_SetString(PyExc_TypeError,
				4906	"%s argument has non-string str()");
				4907	goto onError;
				4908	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4909	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4910	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4911	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4912	"strict");
				4913	Py_DECREF(temp);
				4914	temp = unicode;
				4915	if (temp == NULL)
				4916	goto onError;
				4917	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4918	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4919	len = PyUnicode_GET_SIZE(temp);
				4920	if (prec >= 0 && len > prec)
				4921	len = prec;
				4922	break;
				4923
				4924	case 'i':
				4925	case 'd':
				4926	case 'u':
				4927	case 'o':
				4928	case 'x':
				4929	case 'X':
				4930	if (c == 'i')
				4931	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4932	pbuf = formatbuf;
				4933	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4934	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4935	if (len < 0)
				4936	goto onError;
				4937	sign = (c == 'd');
				4938	if (flags & F_ZERO) {
				4939	fill = '0';
				4940	if ((flags&F_ALT) &&
				4941	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4942	pbuf[0] == '0' && pbuf[1] == c) {
				4943	res++ = pbuf++;
				4944	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4945	rescnt -= 2;
				4946	len -= 2;
				4947	width -= 2;
				4948	if (width < 0)
				4949	width = 0;
				4950	}
				4951	}
				4952	break;
				4953
				4954	case 'e':
				4955	case 'E':
				4956	case 'f':
				4957	case 'g':
				4958	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4959	pbuf = formatbuf;
				4960	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4961	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4962	if (len < 0)
				4963	goto onError;
				4964	sign = 1;
				4965	if (flags&F_ZERO)
				4966	fill = '0';
				4967	break;
				4968
				4969	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4970	pbuf = formatbuf;
				4971	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4972	if (len < 0)
				4973	goto onError;
				4974	break;
				4975
				4976	default:
				4977	PyErr_Format(PyExc_ValueError,
				4978	"unsupported format character '%c' (0x%x)",
				4979	c, c);
				4980	goto onError;
				4981	}
				4982	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4983	if (pbuf == '-' \|\| pbuf == '+') {
				4984	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4985	len--;
				4986	}
				4987	else if (flags & F_SIGN)
				4988	sign = '+';
				4989	else if (flags & F_BLANK)
				4990	sign = ' ';
				4991	else
				4992	sign = 0;
				4993	}
				4994	if (width < len)
				4995	width = len;
				4996	if (rescnt < width + (sign != 0)) {
				4997	reslen -= rescnt;
				4998	rescnt = width + fmtcnt + 100;
				4999	reslen += rescnt;
				5000	if (_PyUnicode_Resize(result, reslen) < 0)
				5001	return NULL;
				5002	res = PyUnicode_AS_UNICODE(result)
				5003	+ reslen - rescnt;
				5004	}
				5005	if (sign) {
				5006	if (fill != ' ')
				5007	*res++ = sign;
				5008	rescnt--;
				5009	if (width > len)
				5010	width--;
				5011	}
				5012	if (width > len && !(flags & F_LJUST)) {
				5013	do {
				5014	--rescnt;
				5015	*res++ = fill;
				5016	} while (--width > len);
				5017	}
				5018	if (sign && fill == ' ')
				5019	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5020	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5021	res += len;
				5022	rescnt -= len;
				5023	while (--width >= len) {
				5024	--rescnt;
				5025	*res++ = ' ';
				5026	}
				5027	if (dict && (argidx < arglen) && c != '%') {
				5028	PyErr_SetString(PyExc_TypeError,
				5029	"not all arguments converted");
				5030	goto onError;
				5031	}
				5032	Py_XDECREF(temp);
				5033	} /* '%' */
				5034	} /* until end */
				5035	if (argidx < arglen && !dict) {
				5036	PyErr_SetString(PyExc_TypeError,
				5037	"not all arguments converted");
				5038	goto onError;
				5039	}
				5040
				5041	if (args_owned) {
				5042	Py_DECREF(args);
				5043	}
				5044	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5045	if (_PyUnicode_Resize(result, reslen - rescnt))
				5046	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5047	return (PyObject *)result;
				5048
				5049	onError:
				5050	Py_XDECREF(result);
				5051	Py_DECREF(uformat);
				5052	if (args_owned) {
				5053	Py_DECREF(args);
				5054	}
				5055	return NULL;
				5056	}
				5057
				5058	static PyBufferProcs unicode_as_buffer = {
				5059	(getreadbufferproc) unicode_buffer_getreadbuf,
				5060	(getwritebufferproc) unicode_buffer_getwritebuf,
				5061	(getsegcountproc) unicode_buffer_getsegcount,
				5062	(getcharbufferproc) unicode_buffer_getcharbuf,
				5063	};
				5064
				5065	PyTypeObject PyUnicode_Type = {
				5066	PyObject_HEAD_INIT(&PyType_Type)
				5067	0, /* ob_size */
				5068	"unicode", /* tp_name */
				5069	sizeof(PyUnicodeObject), /* tp_size */
				5070	0, /* tp_itemsize */
				5071	/* Slots */
				5072	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5073	0, /* tp_print */
				5074	(getattrfunc)unicode_getattr, /* tp_getattr */
				5075	0, /* tp_setattr */
				5076	(cmpfunc) unicode_compare, /* tp_compare */
				5077	(reprfunc) unicode_repr, /* tp_repr */
				5078	0, /* tp_as_number */
				5079	&unicode_as_sequence, /* tp_as_sequence */
				5080	0, /* tp_as_mapping */
				5081	(hashfunc) unicode_hash, /* tp_hash*/
				5082	0, /* tp_call*/
				5083	(reprfunc) unicode_str, /* tp_str */
				5084	(getattrofunc) NULL, /* tp_getattro */
				5085	(setattrofunc) NULL, /* tp_setattro */
				5086	&unicode_as_buffer, /* tp_as_buffer */
				5087	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5088	};
				5089
				5090	/* Initialize the Unicode implementation */
				5091
				5092	void _PyUnicode_Init()
				5093	{
				5094	/* Doublecheck the configuration... */
				5095	if (sizeof(Py_UNICODE) != 2)
				5096	Py_FatalError("Unicode configuration error: "
				5097	"sizeof(Py_UNICODE) != 2 bytes");
				5098
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5099	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5100	unicode_freelist = NULL;
				5101	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5102	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5103	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5104	}
				5105
				5106	/* Finalize the Unicode implementation */
				5107
				5108	void
				5109	_PyUnicode_Fini()
				5110	{
				5111	PyUnicodeObject *u = unicode_freelist;
				5112
				5113	while (u != NULL) {
				5114	PyUnicodeObject *v = u;
				5115	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5116	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5117	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5118	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5119	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5120	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5121	unicode_freelist = NULL;
				5122	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5123	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5124	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5125	}