Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 359a9872ff52fa7585c20be4948b2549ec6e5cac [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	69	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	#if defined(HAVE_LIMITS_H)
				72	#include <limits.h>
				73	#else
				74	#define INT_MAX 2147483647
				75	#endif
				76
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	77	#ifdef MS_WIN32
				78	#include <windows.h>
				79	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	80
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	81	/* Limit for the Unicode object free list */
				82
				83	#define MAX_UNICODE_FREELIST_SIZE 1024
				84
				85	/* Limit for the Unicode object free list stay alive optimization.
				86
				87	The implementation will keep allocated Unicode memory intact for
				88	all objects on the free list having a size less than this
				89	limit. This reduces malloc() overhead for small Unicode objects.
				90
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	91	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	92	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	93	malloc()-overhead) bytes of unused garbage.
				94
				95	Setting the limit to 0 effectively turns the feature off.
				96
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	97	Note: This is an experimental feature ! If you get core dumps when
				98	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	99
				100	*/
				101
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	102	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	103
				104	/* Endianness switches; defaults to little endian */
				105
				106	#ifdef WORDS_BIGENDIAN
				107	# define BYTEORDER_IS_BIG_ENDIAN
				108	#else
				109	# define BYTEORDER_IS_LITTLE_ENDIAN
				110	#endif
				111
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	112	/* --- Globals ------------------------------------------------------------
				113
				114	The globals are initialized by the _PyUnicode_Init() API and should
				115	not be used before calling that API.
				116
				117	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
				119	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	120	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	121
				122	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	123	static PyUnicodeObject *unicode_freelist;
				124	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	125
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	126	/* Default encoding to use and assume when NULL is passed as encoding
				127	parameter; it is initialized by _PyUnicode_Init().
				128
				129	Always use the PyUnicode_SetDefaultEncoding() and
				130	PyUnicode_GetDefaultEncoding() APIs to access this global.
				131
				132	*/
				133
				134	static char unicode_default_encoding[100];
				135
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	136	/* --- Unicode Object ----------------------------------------------------- */
				137
				138	static
				139	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				140	int length)
				141	{
				142	void *oldstr;
				143
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	144	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	145	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	146	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	147
				148	/* Resizing unicode_empty is not allowed. */
				149	if (unicode == unicode_empty) {
				150	PyErr_SetString(PyExc_SystemError,
				151	"can't resize empty unicode object");
				152	return -1;
				153	}
				154
				155	/* We allocate one more byte to make sure the string is
				156	Ux0000 terminated -- XXX is this needed ? */
				157	oldstr = unicode->str;
				158	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				159	if (!unicode->str) {
				160	unicode->str = oldstr;
				161	PyErr_NoMemory();
				162	return -1;
				163	}
				164	unicode->str[length] = 0;
				165	unicode->length = length;
				166
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	167	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	168	/* Reset the object caches */
				169	if (unicode->utf8str) {
				170	Py_DECREF(unicode->utf8str);
				171	unicode->utf8str = NULL;
				172	}
				173	unicode->hash = -1;
				174
				175	return 0;
				176	}
				177
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	178	int PyUnicode_Resize(PyObject **unicode,
				179	int length)
				180	{
				181	PyUnicodeObject *v;
				182
				183	if (unicode == NULL) {
				184	PyErr_BadInternalCall();
				185	return -1;
				186	}
				187	v = (PyUnicodeObject *)*unicode;
				188	if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
				189	PyErr_BadInternalCall();
				190	return -1;
				191	}
				192	return _PyUnicode_Resize(v, length);
				193	}
				194
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	195	/* We allocate one more byte to make sure the string is
				196	Ux0000 terminated -- XXX is this needed ?
				197
				198	XXX This allocator could further be enhanced by assuring that the
				199	free list never reduces its size below 1.
				200
				201	*/
				202
				203	static
				204	PyUnicodeObject *_PyUnicode_New(int length)
				205	{
				206	register PyUnicodeObject *unicode;
				207
				208	/* Optimization for empty strings */
				209	if (length == 0 && unicode_empty != NULL) {
				210	Py_INCREF(unicode_empty);
				211	return unicode_empty;
				212	}
				213
				214	/* Unicode freelist & memory allocation */
				215	if (unicode_freelist) {
				216	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	217	unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	220	/* Keep-Alive optimization: we only upsize the buffer,
				221	never downsize it. */
				222	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	223	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	224	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	225	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	}
				227	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	228	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	230	}
				231	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	232	}
				233	else {
				234	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				235	if (unicode == NULL)
				236	return NULL;
				237	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				238	}
				239
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	240	if (!unicode->str) {
				241	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	243	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	unicode->str[length] = 0;
				245	unicode->length = length;
				246	unicode->hash = -1;
				247	unicode->utf8str = NULL;
				248	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	249
				250	onError:
				251	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	252	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	253	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	254	}
				255
				256	static
				257	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				258	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	259	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	260	/* Keep-Alive optimization */
				261	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	262	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	263	unicode->str = NULL;
				264	unicode->length = 0;
				265	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	266	if (unicode->utf8str) {
				267	Py_DECREF(unicode->utf8str);
				268	unicode->utf8str = NULL;
				269	}
				270	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	271	*(PyUnicodeObject **)unicode = unicode_freelist;
				272	unicode_freelist = unicode;
				273	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	274	}
				275	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	276	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	277	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	278	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	279	}
				280	}
				281
				282	PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
				283	int size)
				284	{
				285	PyUnicodeObject *unicode;
				286
				287	unicode = _PyUnicode_New(size);
				288	if (!unicode)
				289	return NULL;
				290
				291	/* Copy the Unicode data into the new object */
				292	if (u != NULL)
				293	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				294
				295	return (PyObject *)unicode;
				296	}
				297
				298	#ifdef HAVE_WCHAR_H
				299
				300	PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
				301	int size)
				302	{
				303	PyUnicodeObject *unicode;
				304
				305	if (w == NULL) {
				306	PyErr_BadInternalCall();
				307	return NULL;
				308	}
				309
				310	unicode = _PyUnicode_New(size);
				311	if (!unicode)
				312	return NULL;
				313
				314	/* Copy the wchar_t data into the new object */
				315	#ifdef HAVE_USABLE_WCHAR_T
				316	memcpy(unicode->str, w, size * sizeof(wchar_t));
				317	#else
				318	{
				319	register Py_UNICODE *u;
				320	register int i;
				321	u = PyUnicode_AS_UNICODE(unicode);
				322	for (i = size; i >= 0; i--)
				323	*u++ = *w++;
				324	}
				325	#endif
				326
				327	return (PyObject *)unicode;
				328	}
				329
				330	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				331	register wchar_t *w,
				332	int size)
				333	{
				334	if (unicode == NULL) {
				335	PyErr_BadInternalCall();
				336	return -1;
				337	}
				338	if (size > PyUnicode_GET_SIZE(unicode))
				339	size = PyUnicode_GET_SIZE(unicode);
				340	#ifdef HAVE_USABLE_WCHAR_T
				341	memcpy(w, unicode->str, size * sizeof(wchar_t));
				342	#else
				343	{
				344	register Py_UNICODE *u;
				345	register int i;
				346	u = PyUnicode_AS_UNICODE(unicode);
				347	for (i = size; i >= 0; i--)
				348	*w++ = *u++;
				349	}
				350	#endif
				351
				352	return size;
				353	}
				354
				355	#endif
				356
				357	PyObject *PyUnicode_FromObject(register PyObject *obj)
				358	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	359	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				360	}
				361
				362	PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
				363	const char *encoding,
				364	const char *errors)
				365	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	366	const char *s;
				367	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	368	int owned = 0;
				369	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	370
				371	if (obj == NULL) {
				372	PyErr_BadInternalCall();
				373	return NULL;
				374	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	375
				376	/* Coerce object */
				377	if (PyInstance_Check(obj)) {
				378	PyObject *func;
				379	func = PyObject_GetAttrString(obj, "__str__");
				380	if (func == NULL) {
				381	PyErr_SetString(PyExc_TypeError,
				382	"coercing to Unicode: instance doesn't define __str__");
				383	return NULL;
				384	}
				385	obj = PyEval_CallObject(func, NULL);
				386	Py_DECREF(func);
				387	if (obj == NULL)
				388	return NULL;
				389	owned = 1;
				390	}
				391	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	392	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	393	v = obj;
				394	if (encoding) {
				395	PyErr_SetString(PyExc_TypeError,
				396	"decoding Unicode is not supported");
				397	return NULL;
				398	}
				399	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	400	}
				401	else if (PyString_Check(obj)) {
				402	s = PyString_AS_STRING(obj);
				403	len = PyString_GET_SIZE(obj);
				404	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	405	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				406	/* Overwrite the error message with something more useful in
				407	case of a TypeError. */
				408	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	409	PyErr_Format(PyExc_TypeError,
				410	"coercing to Unicode: need string or buffer, "
				411	"%.80s found",
				412	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	413	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	414	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	415
				416	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	417	if (len == 0) {
				418	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	419	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	420	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	421	else
				422	v = PyUnicode_Decode(s, len, encoding, errors);
				423	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	424	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	425	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	426	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	427	return v;
				428
				429	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	430	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	431	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	432	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	433	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	434	}
				435
				436	PyObject *PyUnicode_Decode(const char *s,
				437	int size,
				438	const char *encoding,
				439	const char *errors)
				440	{
				441	PyObject *buffer = NULL, *unicode;
				442
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	443	if (encoding == NULL)
				444	encoding = PyUnicode_GetDefaultEncoding();
				445
				446	/* Shortcuts for common default encodings */
				447	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	448	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	449	else if (strcmp(encoding, "latin-1") == 0)
				450	return PyUnicode_DecodeLatin1(s, size, errors);
				451	else if (strcmp(encoding, "ascii") == 0)
				452	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	453
				454	/* Decode via the codec registry */
				455	buffer = PyBuffer_FromMemory((void *)s, size);
				456	if (buffer == NULL)
				457	goto onError;
				458	unicode = PyCodec_Decode(buffer, encoding, errors);
				459	if (unicode == NULL)
				460	goto onError;
				461	if (!PyUnicode_Check(unicode)) {
				462	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	463	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	464	unicode->ob_type->tp_name);
				465	Py_DECREF(unicode);
				466	goto onError;
				467	}
				468	Py_DECREF(buffer);
				469	return unicode;
				470
				471	onError:
				472	Py_XDECREF(buffer);
				473	return NULL;
				474	}
				475
				476	PyObject *PyUnicode_Encode(const Py_UNICODE *s,
				477	int size,
				478	const char *encoding,
				479	const char *errors)
				480	{
				481	PyObject *v, *unicode;
				482
				483	unicode = PyUnicode_FromUnicode(s, size);
				484	if (unicode == NULL)
				485	return NULL;
				486	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				487	Py_DECREF(unicode);
				488	return v;
				489	}
				490
				491	PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
				492	const char *encoding,
				493	const char *errors)
				494	{
				495	PyObject *v;
				496
				497	if (!PyUnicode_Check(unicode)) {
				498	PyErr_BadArgument();
				499	goto onError;
				500	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	501
				502	if (encoding == NULL)
				503	encoding = PyUnicode_GetDefaultEncoding();
				504
				505	/* Shortcuts for common default encodings */
				506	if (errors == NULL) {
				507	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	508	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	509	else if (strcmp(encoding, "latin-1") == 0)
				510	return PyUnicode_AsLatin1String(unicode);
				511	else if (strcmp(encoding, "ascii") == 0)
				512	return PyUnicode_AsASCIIString(unicode);
				513	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	514
				515	/* Encode via the codec registry */
				516	v = PyCodec_Encode(unicode, encoding, errors);
				517	if (v == NULL)
				518	goto onError;
				519	/* XXX Should we really enforce this ? */
				520	if (!PyString_Check(v)) {
				521	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	522	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	523	v->ob_type->tp_name);
				524	Py_DECREF(v);
				525	goto onError;
				526	}
				527	return v;
				528
				529	onError:
				530	return NULL;
				531	}
				532
				533	Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
				534	{
				535	if (!PyUnicode_Check(unicode)) {
				536	PyErr_BadArgument();
				537	goto onError;
				538	}
				539	return PyUnicode_AS_UNICODE(unicode);
				540
				541	onError:
				542	return NULL;
				543	}
				544
				545	int PyUnicode_GetSize(PyObject *unicode)
				546	{
				547	if (!PyUnicode_Check(unicode)) {
				548	PyErr_BadArgument();
				549	goto onError;
				550	}
				551	return PyUnicode_GET_SIZE(unicode);
				552
				553	onError:
				554	return -1;
				555	}
				556
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame^]	557	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	558	{
				559	return unicode_default_encoding;
				560	}
				561
				562	int PyUnicode_SetDefaultEncoding(const char *encoding)
				563	{
				564	PyObject *v;
				565
				566	/* Make sure the encoding is valid. As side effect, this also
				567	loads the encoding into the codec registry cache. */
				568	v = _PyCodec_Lookup(encoding);
				569	if (v == NULL)
				570	goto onError;
				571	Py_DECREF(v);
				572	strncpy(unicode_default_encoding,
				573	encoding,
				574	sizeof(unicode_default_encoding));
				575	return 0;
				576
				577	onError:
				578	return -1;
				579	}
				580
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	581	/* --- UTF-8 Codec -------------------------------------------------------- */
				582
				583	static
				584	char utf8_code_length[256] = {
				585	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				586	illegal prefix. see RFC 2279 for details */
				587	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				588	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				589	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				590	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				591	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				592	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				593	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				594	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				595	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				596	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				597	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				598	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				599	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				600	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				601	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				602	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				603	};
				604
				605	static
				606	int utf8_decoding_error(const char **source,
				607	Py_UNICODE **dest,
				608	const char *errors,
				609	const char *details)
				610	{
				611	if ((errors == NULL) ||
				612	(strcmp(errors,"strict") == 0)) {
				613	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	614	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	615	details);
				616	return -1;
				617	}
				618	else if (strcmp(errors,"ignore") == 0) {
				619	(*source)++;
				620	return 0;
				621	}
				622	else if (strcmp(errors,"replace") == 0) {
				623	(*source)++;
				624	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				625	(*dest)++;
				626	return 0;
				627	}
				628	else {
				629	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	630	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	631	errors);
				632	return -1;
				633	}
				634	}
				635
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	636	PyObject *PyUnicode_DecodeUTF8(const char *s,
				637	int size,
				638	const char *errors)
				639	{
				640	int n;
				641	const char *e;
				642	PyUnicodeObject *unicode;
				643	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	644	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	645
				646	/* Note: size will always be longer than the resulting Unicode
				647	character count */
				648	unicode = _PyUnicode_New(size);
				649	if (!unicode)
				650	return NULL;
				651	if (size == 0)
				652	return (PyObject *)unicode;
				653
				654	/* Unpack UTF-8 encoded data */
				655	p = unicode->str;
				656	e = s + size;
				657
				658	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	659	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	660
				661	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	662	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	663	s++;
				664	continue;
				665	}
				666
				667	n = utf8_code_length[ch];
				668
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	669	if (s + n > e) {
				670	errmsg = "unexpected end of data";
				671	goto utf8Error;
				672	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	673
				674	switch (n) {
				675
				676	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	677	errmsg = "unexpected code byte";
				678	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	679	break;
				680
				681	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	682	errmsg = "internal error";
				683	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	684	break;
				685
				686	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	687	if ((s[1] & 0xc0) != 0x80) {
				688	errmsg = "invalid data";
				689	goto utf8Error;
				690	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	692	if (ch < 0x80) {
				693	errmsg = "illegal encoding";
				694	goto utf8Error;
				695	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	696	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	697	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	698	break;
				699
				700	case 3:
				701	if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	702	(s[2] & 0xc0) != 0x80) {
				703	errmsg = "invalid data";
				704	goto utf8Error;
				705	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	706	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
				708	errmsg = "illegal encoding";
				709	goto utf8Error;
				710	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	711	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	712	*p++ = (Py_UNICODE)ch;
				713	break;
				714
				715	case 4:
				716	if ((s[1] & 0xc0) != 0x80 ||
				717	(s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	(s[3] & 0xc0) != 0x80) {
				719	errmsg = "invalid data";
				720	goto utf8Error;
				721	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	722	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				723	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				724	/* validate and convert to UTF-16 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	725	if ((ch < 0x10000) || /* minimum value allowed for 4
				726	byte encoding */
				727	(ch > 0x10ffff)) { /* maximum value allowed for
				728	UTF-16 */
				729	errmsg = "illegal encoding";
				730	goto utf8Error;
				731	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	732	/* compute and append the two surrogates: */
				733
				734	/* translate from 10000..10FFFF to 0..FFFF */
				735	ch -= 0x10000;
				736
				737	/* high surrogate = top 10 bits added to D800 */
				738	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				739
				740	/* low surrogate = bottom 10 bits added to DC00 */
				741	*p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	742	break;
				743
				744	default:
				745	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	errmsg = "unsupported Unicode code range";
				747	goto utf8Error;
				748	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	749	}
				750	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	751	continue;
				752
				753	utf8Error:
				754	if (utf8_decoding_error(&s, &p, errors, errmsg))
				755	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	756	}
				757
				758	/* Adjust length */
				759	if (_PyUnicode_Resize(unicode, p - unicode->str))
				760	goto onError;
				761
				762	return (PyObject *)unicode;
				763
				764	onError:
				765	Py_DECREF(unicode);
				766	return NULL;
				767	}
				768
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme named by errors to a UTF-8 encoding
   problem described by details.

   NULL or "strict": set a UnicodeError and return -1.
   "ignore":         return 0 without producing output.
   "replace":        store '?' at **dest, advance *dest, return 0.
   anything else:    set a ValueError and return -1.

   The source argument is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors != NULL && strcmp(errors, "strict") != 0) {
        if (strcmp(errors, "ignore") == 0)
            return 0;
        if (strcmp(errors, "replace") == 0) {
            *(*dest)++ = '?';
            return 0;
        }
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
    PyErr_Format(PyExc_UnicodeError,
                 "UTF-8 encoding error: %.400s",
                 details);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	802
				803	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				804	int size,
				805	const char *errors)
				806	{
				807	PyObject *v;
				808	char *p;
				809	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	810	Py_UCS4 ch2;
				811	unsigned int cbAllocated = 3 * size;
				812	unsigned int cbWritten = 0;
				813	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	814
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	815	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	816	if (v == NULL)
				817	return NULL;
				818	if (size == 0)
				819	goto done;
				820
				821	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	822	while (i < size) {
				823	Py_UCS4 ch = s[i++];
				824	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	825	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	826	cbWritten++;
				827	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	828	else if (ch < 0x0800) {
				829	*p++ = 0xc0 \| (ch >> 6);
				830	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	831	cbWritten += 2;
				832	}
				833	else {
				834	/* Check for high surrogate */
				835	if (0xD800 <= ch && ch <= 0xDBFF) {
				836	if (i != size) {
				837	ch2 = s[i];
				838	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				839
				840	if (cbWritten >= (cbAllocated - 4)) {
				841	/* Provide enough room for some more
				842	surrogates */
				843	cbAllocated += 4*10;
				844	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	845	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	846	}
				847
				848	/* combine the two values */
				849	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				850
				851	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	852	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	853	i++;
				854	cbWritten += 4;
				855	}
				856	}
				857	}
				858	else {
				859	*p++ = (char)(0xe0 \| (ch >> 12));
				860	cbWritten += 3;
				861	}
				862	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				863	*p++ = (char)(0x80 \| (ch & 0x3f));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	864	}
				865	}
				866	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	867	if (_PyString_Resize(&v, p - q))
				868	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	869
				870	done:
				871	return v;
				872
				873	onError:
				874	Py_DECREF(v);
				875	return NULL;
				876	}
				877
				878	/* Return a Python string holding the UTF-8 encoded value of the
				879	Unicode object.
				880
				881	The resulting string is cached in the Unicode object for subsequent
				882	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	883	the character buffer interface and will live (at least) as long as
				884	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	885
				886	The refcount of the string is not incremented.
				887
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	888	* Exported for internal use by the interpreter only !!! *
				889
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	*/
				891
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	892	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	893	const char *errors)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	894	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	895	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	896
				897	if (v)
				898	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	899	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				900	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	901	errors);
				902	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	903	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	904	return v;
				905	}
				906
				907	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				908	{
				909	PyObject *str;
				910
				911	if (!PyUnicode_Check(unicode)) {
				912	PyErr_BadArgument();
				913	return NULL;
				914	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	915	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	916	if (str == NULL)
				917	return NULL;
				918	Py_INCREF(str);
				919	return str;
				920	}
				921
				922	/* --- UTF-16 Codec ------------------------------------------------------- */
				923
				924	static
				925	int utf16_decoding_error(const Py_UNICODE **source,
				926	Py_UNICODE **dest,
				927	const char *errors,
				928	const char *details)
				929	{
				930	if ((errors == NULL) \|\|
				931	(strcmp(errors,"strict") == 0)) {
				932	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	933	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	934	details);
				935	return -1;
				936	}
				937	else if (strcmp(errors,"ignore") == 0) {
				938	return 0;
				939	}
				940	else if (strcmp(errors,"replace") == 0) {
				941	if (dest) {
				942	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				943	(*dest)++;
				944	}
				945	return 0;
				946	}
				947	else {
				948	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	949	"UTF-16 decoding error; "
				950	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951	errors);
				952	return -1;
				953	}
				954	}
				955
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	956	PyObject PyUnicode_DecodeUTF16(const char s,
				957	int size,
				958	const char *errors,
				959	int *byteorder)
				960	{
				961	PyUnicodeObject *unicode;
				962	Py_UNICODE *p;
				963	const Py_UNICODE q, e;
				964	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	965	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	966
				967	/* size should be an even number */
				968	if (size % sizeof(Py_UNICODE) != 0) {
				969	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				970	return NULL;
				971	/* The remaining input chars are ignored if we fall through
				972	here... */
				973	}
				974
				975	/* Note: size will always be longer than the resulting Unicode
				976	character count */
				977	unicode = _PyUnicode_New(size);
				978	if (!unicode)
				979	return NULL;
				980	if (size == 0)
				981	return (PyObject *)unicode;
				982
				983	/* Unpack UTF-16 encoded data */
				984	p = unicode->str;
				985	q = (Py_UNICODE *)s;
				986	e = q + (size / sizeof(Py_UNICODE));
				987
				988	if (byteorder)
				989	bo = *byteorder;
				990
				991	while (q < e) {
				992	register Py_UNICODE ch = *q++;
				993
				994	/* Check for BOM marks (U+FEFF) in the input and adjust
				995	current byte order setting accordingly. Swap input
				996	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				997	!) */
				998	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				999	if (ch == 0xFEFF) {
				1000	bo = -1;
				1001	continue;
				1002	} else if (ch == 0xFFFE) {
				1003	bo = 1;
				1004	continue;
				1005	}
				1006	if (bo == 1)
				1007	ch = (ch >> 8) \| (ch << 8);
				1008	#else
				1009	if (ch == 0xFEFF) {
				1010	bo = 1;
				1011	continue;
				1012	} else if (ch == 0xFFFE) {
				1013	bo = -1;
				1014	continue;
				1015	}
				1016	if (bo == -1)
				1017	ch = (ch >> 8) \| (ch << 8);
				1018	#endif
				1019	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1020	*p++ = ch;
				1021	continue;
				1022	}
				1023
				1024	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1025	if (q >= e) {
				1026	errmsg = "unexpected end of data";
				1027	goto utf16Error;
				1028	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1029	if (0xDC00 <= q && q <= 0xDFFF) {
				1030	q++;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1031	if (0xD800 <= q && q <= 0xDBFF) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1032	/* This is valid data (a UTF-16 surrogate pair), but
				1033	we are not able to store this information since our
				1034	Py_UNICODE type only has 16 bits... this might
				1035	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1036	errmsg = "code pairs are not supported";
				1037	goto utf16Error;
				1038	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1039	else
				1040	continue;
				1041	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1042	errmsg = "illegal encoding";
				1043	/* Fall through to report the error */
				1044
				1045	utf16Error:
				1046	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1047	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1048	}
				1049
				1050	if (byteorder)
				1051	*byteorder = bo;
				1052
				1053	/* Adjust length */
				1054	if (_PyUnicode_Resize(unicode, p - unicode->str))
				1055	goto onError;
				1056
				1057	return (PyObject *)unicode;
				1058
				1059	onError:
				1060	Py_DECREF(unicode);
				1061	return NULL;
				1062	}
				1063
				1064	#undef UTF16_ERROR
				1065
				1066	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1067	int size,
				1068	const char *errors,
				1069	int byteorder)
				1070	{
				1071	PyObject *v;
				1072	Py_UNICODE *p;
				1073	char *q;
				1074
				1075	/* We don't create UTF-16 pairs... */
				1076	v = PyString_FromStringAndSize(NULL,
				1077	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				1078	if (v == NULL)
				1079	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1080
				1081	q = PyString_AS_STRING(v);
				1082	p = (Py_UNICODE *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1083	if (byteorder == 0)
				1084	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1085	if (size == 0)
				1086	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	if (byteorder == 0 \|\|
				1088	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1089	byteorder == -1
				1090	#else
				1091	byteorder == 1
				1092	#endif
				1093	)
				1094	memcpy(p, s, size * sizeof(Py_UNICODE));
				1095	else
				1096	while (size-- > 0) {
				1097	Py_UNICODE ch = *s++;
				1098	*p++ = (ch >> 8) \| (ch << 8);
				1099	}
				1100	done:
				1101	return v;
				1102	}
				1103
				1104	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1105	{
				1106	if (!PyUnicode_Check(unicode)) {
				1107	PyErr_BadArgument();
				1108	return NULL;
				1109	}
				1110	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1111	PyUnicode_GET_SIZE(unicode),
				1112	NULL,
				1113	0);
				1114	}
				1115
				1116	/* --- Unicode Escape Codec ----------------------------------------------- */
				1117
				1118	static
				1119	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1120	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1121	const char *errors,
				1122	const char *details)
				1123	{
				1124	if ((errors == NULL) \|\|
				1125	(strcmp(errors,"strict") == 0)) {
				1126	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1127	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	details);
				1129	return -1;
				1130	}
				1131	else if (strcmp(errors,"ignore") == 0) {
				1132	return 0;
				1133	}
				1134	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1135	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1136	return 0;
				1137	}
				1138	else {
				1139	PyErr_Format(PyExc_ValueError,
				1140	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1141	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1142	errors);
				1143	return -1;
				1144	}
				1145	}
				1146
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1147	static _Py_UCNHashAPI *pucnHash = NULL;
				1148
				1149	static
				1150	int mystrnicmp(const char s1, const char s2, size_t count)
				1151	{
				1152	char c1, c2;
				1153
				1154	if (count)
				1155	{
				1156	do
				1157	{
				1158	c1 = tolower(*(s1++));
				1159	c2 = tolower(*(s2++));
				1160	}
				1161	while(--count && c1 == c2);
				1162
				1163	return c1 - c2;
				1164	}
				1165
				1166	return 0;
				1167	}
				1168
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1169	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1170	int size,
				1171	const char *errors)
				1172	{
				1173	PyUnicodeObject *v;
				1174	Py_UNICODE p = NULL, buf = NULL;
				1175	const char *end;
				1176
				1177	/* Escaped strings will always be longer than the resulting
				1178	Unicode string, so we start with size here and then reduce the
				1179	length after conversion to the true value. */
				1180	v = _PyUnicode_New(size);
				1181	if (v == NULL)
				1182	goto onError;
				1183	if (size == 0)
				1184	return (PyObject *)v;
				1185	p = buf = PyUnicode_AS_UNICODE(v);
				1186	end = s + size;
				1187	while (s < end) {
				1188	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1189	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1190	int i;
				1191
				1192	/* Non-escape characters are interpreted as Unicode ordinals */
				1193	if (*s != '\\') {
				1194	p++ = (unsigned char)s++;
				1195	continue;
				1196	}
				1197
				1198	/* \ - Escapes */
				1199	s++;
				1200	switch (*s++) {
				1201
				1202	/* \x escapes */
				1203	case '\n': break;
				1204	case '\\': *p++ = '\\'; break;
				1205	case '\'': *p++ = '\''; break;
				1206	case '\"': *p++ = '\"'; break;
				1207	case 'b': *p++ = '\b'; break;
				1208	case 'f': p++ = '\014'; break; / FF */
				1209	case 't': *p++ = '\t'; break;
				1210	case 'n': *p++ = '\n'; break;
				1211	case 'r': *p++ = '\r'; break;
				1212	case 'v': p++ = '\013'; break; / VT */
				1213	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1214
				1215	/* \OOO (octal) escapes */
				1216	case '0': case '1': case '2': case '3':
				1217	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1218	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1219	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1220	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1221	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1222	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1223	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1224	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1225	break;
				1226
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1227	/* \xXXXX escape with 1-n hex digits. for compatibility
				1228	with 8-bit strings, this code ignores all but the last
				1229	two digits */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1230	case 'x':
				1231	x = 0;
				1232	c = (unsigned char)*s;
				1233	if (isxdigit(c)) {
				1234	do {
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1235	x = (x<<4) & 0xF0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1236	if ('0' <= c && c <= '9')
				1237	x += c - '0';
				1238	else if ('a' <= c && c <= 'f')
				1239	x += 10 + c - 'a';
				1240	else
				1241	x += 10 + c - 'A';
				1242	c = (unsigned char)*++s;
				1243	} while (isxdigit(c));
Fredrik Lundh	0e19e76	2000-07-16 18:47:43 +0000	[diff] [blame]	1244	*p++ = (unsigned char) x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1245	} else {
				1246	*p++ = '\\';
				1247	*p++ = (unsigned char)s[-1];
				1248	}
				1249	break;
				1250
				1251	/* \uXXXX with 4 hex digits */
				1252	case 'u':
				1253	for (x = 0, i = 0; i < 4; i++) {
				1254	c = (unsigned char)s[i];
				1255	if (!isxdigit(c)) {
				1256	if (unicodeescape_decoding_error(&s, &x, errors,
				1257	"truncated \\uXXXX"))
				1258	goto onError;
				1259	i++;
				1260	break;
				1261	}
				1262	x = (x<<4) & ~0xF;
				1263	if (c >= '0' && c <= '9')
				1264	x += c - '0';
				1265	else if (c >= 'a' && c <= 'f')
				1266	x += 10 + c - 'a';
				1267	else
				1268	x += 10 + c - 'A';
				1269	}
				1270	s += i;
				1271	*p++ = x;
				1272	break;
				1273
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1274	case 'N':
				1275	/* Ok, we need to deal with Unicode Character Names now,
				1276	* make sure we've imported the hash table data...
				1277	*/
				1278	if (pucnHash == NULL)
				1279	{
				1280	PyObject mod = 0, v = 0;
				1281
				1282	mod = PyImport_ImportModule("ucnhash");
				1283	if (mod == NULL)
				1284	goto onError;
				1285	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1286	Py_DECREF(mod);
				1287	if (v == NULL)
				1288	{
				1289	goto onError;
				1290	}
				1291	pucnHash = PyCObject_AsVoidPtr(v);
				1292	Py_DECREF(v);
				1293	if (pucnHash == NULL)
				1294	{
				1295	goto onError;
				1296	}
				1297	}
				1298
				1299	if (*s == '{')
				1300	{
				1301	const char *start = s + 1;
				1302	const char *endBrace = start;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1303	Py_UCS4 value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1304	unsigned long j;
				1305
				1306	/* look for either the closing brace, or we
				1307	* exceed the maximum length of the unicode character names
				1308	*/
				1309	while (*endBrace != '}' &&
				1310	(unsigned int)(endBrace - start) <=
				1311	pucnHash->cchMax &&
				1312	endBrace < end)
				1313	{
				1314	endBrace++;
				1315	}
				1316	if (endBrace != end && *endBrace == '}')
				1317	{
				1318	j = pucnHash->hash(start, endBrace - start);
				1319	if (j > pucnHash->cKeys \|\|
				1320	mystrnicmp(
				1321	start,
				1322	((_Py_UnicodeCharacterName *)
				1323	(pucnHash->getValue(j)))->pszUCN,
				1324	(int)(endBrace - start)) != 0)
				1325	{
				1326	if (unicodeescape_decoding_error(
				1327	&s, &x, errors,
				1328	"Invalid Unicode Character Name"))
				1329	{
				1330	goto onError;
				1331	}
				1332	goto ucnFallthrough;
				1333	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1334	value = ((_Py_UnicodeCharacterName *)
				1335	(pucnHash->getValue(j)))->value;
				1336	if (value < 1<<16)
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1337	{
				1338	/* In UCS-2 range, easy solution.. */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1339	*p++ = value;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1340	}
				1341	else
				1342	{
				1343	/* Oops, its in UCS-4 space, */
				1344	/* compute and append the two surrogates: */
				1345	/* translate from 10000..10FFFF to 0..FFFFF */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1346	value -= 0x10000;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1347
				1348	/* high surrogate = top 10 bits added to D800 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1349	*p++ = 0xD800 + (value >> 10);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1350
				1351	/* low surrogate = bottom 10 bits added to DC00 */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1352	*p++ = 0xDC00 + (value & ~0xFC00);
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1353	}
				1354	s = endBrace + 1;
				1355	}
				1356	else
				1357	{
				1358	if (unicodeescape_decoding_error(
				1359	&s, &x, errors,
				1360	"Unicode name missing closing brace"))
				1361	goto onError;
				1362	goto ucnFallthrough;
				1363	}
				1364	break;
				1365	}
				1366	if (unicodeescape_decoding_error(
				1367	&s, &x, errors,
				1368	"Missing opening brace for Unicode Character Name escape"))
				1369	goto onError;
				1370	ucnFallthrough:
				1371	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1372	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1373	*p++ = '\\';
				1374	*p++ = (unsigned char)s[-1];
				1375	break;
				1376	}
				1377	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1378	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1379	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1380	return (PyObject *)v;
				1381
				1382	onError:
				1383	Py_XDECREF(v);
				1384	return NULL;
				1385	}
				1386
				1387	/* Return a Unicode-Escape string version of the Unicode object.
				1388
				1389	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1390	appropriate.
				1391
				1392	*/
				1393
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1394	static const Py_UNICODE findchar(const Py_UNICODE s,
				1395	int size,
				1396	Py_UNICODE ch);
				1397
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1398	static
				1399	PyObject unicodeescape_string(const Py_UNICODE s,
				1400	int size,
				1401	int quotes)
				1402	{
				1403	PyObject *repr;
				1404	char *p;
				1405	char *q;
				1406
				1407	static const char *hexdigit = "0123456789ABCDEF";
				1408
				1409	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1410	if (repr == NULL)
				1411	return NULL;
				1412
				1413	p = q = PyString_AS_STRING(repr);
				1414
				1415	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1416	*p++ = 'u';
				1417	*p++ = (findchar(s, size, '\'') &&
				1418	!findchar(s, size, '"')) ? '"' : '\'';
				1419	}
				1420	while (size-- > 0) {
				1421	Py_UNICODE ch = *s++;
				1422	/* Escape quotes */
				1423	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1424	*p++ = '\\';
				1425	*p++ = (char) ch;
				1426	}
				1427	/* Map 16-bit characters to '\uxxxx' */
				1428	else if (ch >= 256) {
				1429	*p++ = '\\';
				1430	*p++ = 'u';
				1431	*p++ = hexdigit[(ch >> 12) & 0xf];
				1432	*p++ = hexdigit[(ch >> 8) & 0xf];
				1433	*p++ = hexdigit[(ch >> 4) & 0xf];
				1434	*p++ = hexdigit[ch & 15];
				1435	}
				1436	/* Map non-printable US ASCII to '\ooo' */
				1437	else if (ch < ' ' \|\| ch >= 128) {
				1438	*p++ = '\\';
				1439	*p++ = hexdigit[(ch >> 6) & 7];
				1440	*p++ = hexdigit[(ch >> 3) & 7];
				1441	*p++ = hexdigit[ch & 7];
				1442	}
				1443	/* Copy everything else as-is */
				1444	else
				1445	*p++ = (char) ch;
				1446	}
				1447	if (quotes)
				1448	*p++ = q[1];
				1449
				1450	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1451	if (_PyString_Resize(&repr, p - q))
				1452	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1453
				1454	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1455
				1456	onError:
				1457	Py_DECREF(repr);
				1458	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1459	}
				1460
				1461	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1462	int size)
				1463	{
				1464	return unicodeescape_string(s, size, 0);
				1465	}
				1466
				1467	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1468	{
				1469	if (!PyUnicode_Check(unicode)) {
				1470	PyErr_BadArgument();
				1471	return NULL;
				1472	}
				1473	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1474	PyUnicode_GET_SIZE(unicode));
				1475	}
				1476
				1477	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1478
				1479	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1480	int size,
				1481	const char *errors)
				1482	{
				1483	PyUnicodeObject *v;
				1484	Py_UNICODE p, buf;
				1485	const char *end;
				1486	const char *bs;
				1487
				1488	/* Escaped strings will always be longer than the resulting
				1489	Unicode string, so we start with size here and then reduce the
				1490	length after conversion to the true value. */
				1491	v = _PyUnicode_New(size);
				1492	if (v == NULL)
				1493	goto onError;
				1494	if (size == 0)
				1495	return (PyObject *)v;
				1496	p = buf = PyUnicode_AS_UNICODE(v);
				1497	end = s + size;
				1498	while (s < end) {
				1499	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1500	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1501	int i;
				1502
				1503	/* Non-escape characters are interpreted as Unicode ordinals */
				1504	if (*s != '\\') {
				1505	p++ = (unsigned char)s++;
				1506	continue;
				1507	}
				1508
				1509	/* \u-escapes are only interpreted iff the number of leading
				1510	backslashes if odd */
				1511	bs = s;
				1512	for (;s < end;) {
				1513	if (*s != '\\')
				1514	break;
				1515	p++ = (unsigned char)s++;
				1516	}
				1517	if (((s - bs) & 1) == 0 \|\|
				1518	s >= end \|\|
				1519	*s != 'u') {
				1520	continue;
				1521	}
				1522	p--;
				1523	s++;
				1524
				1525	/* \uXXXX with 4 hex digits */
				1526	for (x = 0, i = 0; i < 4; i++) {
				1527	c = (unsigned char)s[i];
				1528	if (!isxdigit(c)) {
				1529	if (unicodeescape_decoding_error(&s, &x, errors,
				1530	"truncated \\uXXXX"))
				1531	goto onError;
				1532	i++;
				1533	break;
				1534	}
				1535	x = (x<<4) & ~0xF;
				1536	if (c >= '0' && c <= '9')
				1537	x += c - '0';
				1538	else if (c >= 'a' && c <= 'f')
				1539	x += 10 + c - 'a';
				1540	else
				1541	x += 10 + c - 'A';
				1542	}
				1543	s += i;
				1544	*p++ = x;
				1545	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1546	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1547	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1548	return (PyObject *)v;
				1549
				1550	onError:
				1551	Py_XDECREF(v);
				1552	return NULL;
				1553	}
				1554
				1555	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1556	int size)
				1557	{
				1558	PyObject *repr;
				1559	char *p;
				1560	char *q;
				1561
				1562	static const char *hexdigit = "0123456789ABCDEF";
				1563
				1564	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1565	if (repr == NULL)
				1566	return NULL;
				1567
				1568	p = q = PyString_AS_STRING(repr);
				1569	while (size-- > 0) {
				1570	Py_UNICODE ch = *s++;
				1571	/* Map 16-bit characters to '\uxxxx' */
				1572	if (ch >= 256) {
				1573	*p++ = '\\';
				1574	*p++ = 'u';
				1575	*p++ = hexdigit[(ch >> 12) & 0xf];
				1576	*p++ = hexdigit[(ch >> 8) & 0xf];
				1577	*p++ = hexdigit[(ch >> 4) & 0xf];
				1578	*p++ = hexdigit[ch & 15];
				1579	}
				1580	/* Copy everything else as-is */
				1581	else
				1582	*p++ = (char) ch;
				1583	}
				1584	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1585	if (_PyString_Resize(&repr, p - q))
				1586	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1587
				1588	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1589
				1590	onError:
				1591	Py_DECREF(repr);
				1592	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1593	}
				1594
				1595	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1596	{
				1597	if (!PyUnicode_Check(unicode)) {
				1598	PyErr_BadArgument();
				1599	return NULL;
				1600	}
				1601	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1602	PyUnicode_GET_SIZE(unicode));
				1603	}
				1604
				1605	/* --- Latin-1 Codec ------------------------------------------------------ */
				1606
				1607	PyObject PyUnicode_DecodeLatin1(const char s,
				1608	int size,
				1609	const char *errors)
				1610	{
				1611	PyUnicodeObject *v;
				1612	Py_UNICODE *p;
				1613
				1614	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1615	v = _PyUnicode_New(size);
				1616	if (v == NULL)
				1617	goto onError;
				1618	if (size == 0)
				1619	return (PyObject *)v;
				1620	p = PyUnicode_AS_UNICODE(v);
				1621	while (size-- > 0)
				1622	p++ = (unsigned char)s++;
				1623	return (PyObject *)v;
				1624
				1625	onError:
				1626	Py_XDECREF(v);
				1627	return NULL;
				1628	}
				1629
				1630	static
				1631	int latin1_encoding_error(const Py_UNICODE **source,
				1632	char **dest,
				1633	const char *errors,
				1634	const char *details)
				1635	{
				1636	if ((errors == NULL) \|\|
				1637	(strcmp(errors,"strict") == 0)) {
				1638	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1639	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1640	details);
				1641	return -1;
				1642	}
				1643	else if (strcmp(errors,"ignore") == 0) {
				1644	return 0;
				1645	}
				1646	else if (strcmp(errors,"replace") == 0) {
				1647	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1648	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1649	return 0;
				1650	}
				1651	else {
				1652	PyErr_Format(PyExc_ValueError,
				1653	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1654	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1655	errors);
				1656	return -1;
				1657	}
				1658	}
				1659
				1660	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1661	int size,
				1662	const char *errors)
				1663	{
				1664	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1665	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1666	repr = PyString_FromStringAndSize(NULL, size);
				1667	if (repr == NULL)
				1668	return NULL;
				1669
				1670	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1671	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1672	while (size-- > 0) {
				1673	Py_UNICODE ch = *p++;
				1674	if (ch >= 256) {
				1675	if (latin1_encoding_error(&p, &s, errors,
				1676	"ordinal not in range(256)"))
				1677	goto onError;
				1678	}
				1679	else
				1680	*s++ = (char)ch;
				1681	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1682	/* Resize if error handling skipped some characters */
				1683	if (s - start < PyString_GET_SIZE(repr))
				1684	if (_PyString_Resize(&repr, s - start))
				1685	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1686	return repr;
				1687
				1688	onError:
				1689	Py_DECREF(repr);
				1690	return NULL;
				1691	}
				1692
				1693	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1694	{
				1695	if (!PyUnicode_Check(unicode)) {
				1696	PyErr_BadArgument();
				1697	return NULL;
				1698	}
				1699	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1700	PyUnicode_GET_SIZE(unicode),
				1701	NULL);
				1702	}
				1703
				1704	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1705
				1706	static
				1707	int ascii_decoding_error(const char **source,
				1708	Py_UNICODE **dest,
				1709	const char *errors,
				1710	const char *details)
				1711	{
				1712	if ((errors == NULL) \|\|
				1713	(strcmp(errors,"strict") == 0)) {
				1714	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1715	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1716	details);
				1717	return -1;
				1718	}
				1719	else if (strcmp(errors,"ignore") == 0) {
				1720	return 0;
				1721	}
				1722	else if (strcmp(errors,"replace") == 0) {
				1723	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1724	(*dest)++;
				1725	return 0;
				1726	}
				1727	else {
				1728	PyErr_Format(PyExc_ValueError,
				1729	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1730	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1731	errors);
				1732	return -1;
				1733	}
				1734	}
				1735
				1736	PyObject PyUnicode_DecodeASCII(const char s,
				1737	int size,
				1738	const char *errors)
				1739	{
				1740	PyUnicodeObject *v;
				1741	Py_UNICODE *p;
				1742
				1743	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1744	v = _PyUnicode_New(size);
				1745	if (v == NULL)
				1746	goto onError;
				1747	if (size == 0)
				1748	return (PyObject *)v;
				1749	p = PyUnicode_AS_UNICODE(v);
				1750	while (size-- > 0) {
				1751	register unsigned char c;
				1752
				1753	c = (unsigned char)*s++;
				1754	if (c < 128)
				1755	*p++ = c;
				1756	else if (ascii_decoding_error(&s, &p, errors,
				1757	"ordinal not in range(128)"))
				1758	goto onError;
				1759	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1760	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1761	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1762	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	return (PyObject *)v;
				1764
				1765	onError:
				1766	Py_XDECREF(v);
				1767	return NULL;
				1768	}
				1769
				1770	static
				1771	int ascii_encoding_error(const Py_UNICODE **source,
				1772	char **dest,
				1773	const char *errors,
				1774	const char *details)
				1775	{
				1776	if ((errors == NULL) \|\|
				1777	(strcmp(errors,"strict") == 0)) {
				1778	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1779	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1780	details);
				1781	return -1;
				1782	}
				1783	else if (strcmp(errors,"ignore") == 0) {
				1784	return 0;
				1785	}
				1786	else if (strcmp(errors,"replace") == 0) {
				1787	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1788	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1789	return 0;
				1790	}
				1791	else {
				1792	PyErr_Format(PyExc_ValueError,
				1793	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1794	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	errors);
				1796	return -1;
				1797	}
				1798	}
				1799
				1800	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1801	int size,
				1802	const char *errors)
				1803	{
				1804	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1805	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1806	repr = PyString_FromStringAndSize(NULL, size);
				1807	if (repr == NULL)
				1808	return NULL;
				1809
				1810	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1811	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1812	while (size-- > 0) {
				1813	Py_UNICODE ch = *p++;
				1814	if (ch >= 128) {
				1815	if (ascii_encoding_error(&p, &s, errors,
				1816	"ordinal not in range(128)"))
				1817	goto onError;
				1818	}
				1819	else
				1820	*s++ = (char)ch;
				1821	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1822	/* Resize if error handling skipped some characters */
				1823	if (s - start < PyString_GET_SIZE(repr))
				1824	if (_PyString_Resize(&repr, s - start))
				1825	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1826	return repr;
				1827
				1828	onError:
				1829	Py_DECREF(repr);
				1830	return NULL;
				1831	}
				1832
				1833	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1834	{
				1835	if (!PyUnicode_Check(unicode)) {
				1836	PyErr_BadArgument();
				1837	return NULL;
				1838	}
				1839	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1840	PyUnicode_GET_SIZE(unicode),
				1841	NULL);
				1842	}
				1843
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1844	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1845
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1846	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1847
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1848	PyObject PyUnicode_DecodeMBCS(const char s,
				1849	int size,
				1850	const char *errors)
				1851	{
				1852	PyUnicodeObject *v;
				1853	Py_UNICODE *p;
				1854
				1855	/* First get the size of the result */
				1856	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1857	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1858	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1859
				1860	v = _PyUnicode_New(usize);
				1861	if (v == NULL)
				1862	return NULL;
				1863	if (usize == 0)
				1864	return (PyObject *)v;
				1865	p = PyUnicode_AS_UNICODE(v);
				1866	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1867	Py_DECREF(v);
				1868	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1869	}
				1870
				1871	return (PyObject *)v;
				1872	}
				1873
				1874	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1875	int size,
				1876	const char *errors)
				1877	{
				1878	PyObject *repr;
				1879	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1880	DWORD mbcssize;
				1881
				1882	/* If there are no characters, bail now! */
				1883	if (size==0)
				1884	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1885
				1886	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1887	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1888	if (mbcssize==0)
				1889	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1890
				1891	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1892	if (repr == NULL)
				1893	return NULL;
				1894	if (mbcssize==0)
				1895	return repr;
				1896
				1897	/* Do the conversion */
				1898	s = PyString_AS_STRING(repr);
				1899	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1900	Py_DECREF(repr);
				1901	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1902	}
				1903	return repr;
				1904	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1905
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1906	#endif /* MS_WIN32 */
				1907
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1908	/* --- Character Mapping Codec -------------------------------------------- */
				1909
				1910	static
				1911	int charmap_decoding_error(const char **source,
				1912	Py_UNICODE **dest,
				1913	const char *errors,
				1914	const char *details)
				1915	{
				1916	if ((errors == NULL) \|\|
				1917	(strcmp(errors,"strict") == 0)) {
				1918	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1919	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1920	details);
				1921	return -1;
				1922	}
				1923	else if (strcmp(errors,"ignore") == 0) {
				1924	return 0;
				1925	}
				1926	else if (strcmp(errors,"replace") == 0) {
				1927	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1928	(*dest)++;
				1929	return 0;
				1930	}
				1931	else {
				1932	PyErr_Format(PyExc_ValueError,
				1933	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1934	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1935	errors);
				1936	return -1;
				1937	}
				1938	}
				1939
				1940	PyObject PyUnicode_DecodeCharmap(const char s,
				1941	int size,
				1942	PyObject *mapping,
				1943	const char *errors)
				1944	{
				1945	PyUnicodeObject *v;
				1946	Py_UNICODE *p;
				1947
				1948	/* Default to Latin-1 */
				1949	if (mapping == NULL)
				1950	return PyUnicode_DecodeLatin1(s, size, errors);
				1951
				1952	v = _PyUnicode_New(size);
				1953	if (v == NULL)
				1954	goto onError;
				1955	if (size == 0)
				1956	return (PyObject *)v;
				1957	p = PyUnicode_AS_UNICODE(v);
				1958	while (size-- > 0) {
				1959	unsigned char ch = *s++;
				1960	PyObject w, x;
				1961
				1962	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1963	w = PyInt_FromLong((long)ch);
				1964	if (w == NULL)
				1965	goto onError;
				1966	x = PyObject_GetItem(mapping, w);
				1967	Py_DECREF(w);
				1968	if (x == NULL) {
				1969	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1970	/* No mapping found: default to Latin-1 mapping */
				1971	PyErr_Clear();
				1972	*p++ = (Py_UNICODE)ch;
				1973	continue;
				1974	}
				1975	goto onError;
				1976	}
				1977
				1978	/* Apply mapping */
				1979	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1980	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1981	if (value < 0 \|\| value > 65535) {
				1982	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1983	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1984	Py_DECREF(x);
				1985	goto onError;
				1986	}
				1987	*p++ = (Py_UNICODE)value;
				1988	}
				1989	else if (x == Py_None) {
				1990	/* undefined mapping */
				1991	if (charmap_decoding_error(&s, &p, errors,
				1992	"character maps to <undefined>")) {
				1993	Py_DECREF(x);
				1994	goto onError;
				1995	}
				1996	}
				1997	else if (PyUnicode_Check(x)) {
				1998	if (PyUnicode_GET_SIZE(x) != 1) {
				1999	/* 1-n mapping */
				2000	PyErr_SetString(PyExc_NotImplementedError,
				2001	"1-n mappings are currently not implemented");
				2002	Py_DECREF(x);
				2003	goto onError;
				2004	}
				2005	p++ = PyUnicode_AS_UNICODE(x);
				2006	}
				2007	else {
				2008	/* wrong return value */
				2009	PyErr_SetString(PyExc_TypeError,
				2010	"character mapping must return integer, None or unicode");
				2011	Py_DECREF(x);
				2012	goto onError;
				2013	}
				2014	Py_DECREF(x);
				2015	}
				2016	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				2017	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2018	goto onError;
				2019	return (PyObject *)v;
				2020
				2021	onError:
				2022	Py_XDECREF(v);
				2023	return NULL;
				2024	}
				2025
				2026	static
				2027	int charmap_encoding_error(const Py_UNICODE **source,
				2028	char **dest,
				2029	const char *errors,
				2030	const char *details)
				2031	{
				2032	if ((errors == NULL) \|\|
				2033	(strcmp(errors,"strict") == 0)) {
				2034	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2035	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2036	details);
				2037	return -1;
				2038	}
				2039	else if (strcmp(errors,"ignore") == 0) {
				2040	return 0;
				2041	}
				2042	else if (strcmp(errors,"replace") == 0) {
				2043	**dest = '?';
				2044	(*dest)++;
				2045	return 0;
				2046	}
				2047	else {
				2048	PyErr_Format(PyExc_ValueError,
				2049	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2050	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2051	errors);
				2052	return -1;
				2053	}
				2054	}
				2055
				2056	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2057	int size,
				2058	PyObject *mapping,
				2059	const char *errors)
				2060	{
				2061	PyObject *v;
				2062	char *s;
				2063
				2064	/* Default to Latin-1 */
				2065	if (mapping == NULL)
				2066	return PyUnicode_EncodeLatin1(p, size, errors);
				2067
				2068	v = PyString_FromStringAndSize(NULL, size);
				2069	if (v == NULL)
				2070	return NULL;
				2071	s = PyString_AS_STRING(v);
				2072	while (size-- > 0) {
				2073	Py_UNICODE ch = *p++;
				2074	PyObject w, x;
				2075
				2076	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2077	w = PyInt_FromLong((long)ch);
				2078	if (w == NULL)
				2079	goto onError;
				2080	x = PyObject_GetItem(mapping, w);
				2081	Py_DECREF(w);
				2082	if (x == NULL) {
				2083	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2084	/* No mapping found: default to Latin-1 mapping if possible */
				2085	PyErr_Clear();
				2086	if (ch < 256) {
				2087	*s++ = (char)ch;
				2088	continue;
				2089	}
				2090	else if (!charmap_encoding_error(&p, &s, errors,
				2091	"missing character mapping"))
				2092	continue;
				2093	}
				2094	goto onError;
				2095	}
				2096
				2097	/* Apply mapping */
				2098	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2099	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2100	if (value < 0 \|\| value > 255) {
				2101	PyErr_SetString(PyExc_TypeError,
				2102	"character mapping must be in range(256)");
				2103	Py_DECREF(x);
				2104	goto onError;
				2105	}
				2106	*s++ = (char)value;
				2107	}
				2108	else if (x == Py_None) {
				2109	/* undefined mapping */
				2110	if (charmap_encoding_error(&p, &s, errors,
				2111	"character maps to <undefined>")) {
				2112	Py_DECREF(x);
				2113	goto onError;
				2114	}
				2115	}
				2116	else if (PyString_Check(x)) {
				2117	if (PyString_GET_SIZE(x) != 1) {
				2118	/* 1-n mapping */
				2119	PyErr_SetString(PyExc_NotImplementedError,
				2120	"1-n mappings are currently not implemented");
				2121	Py_DECREF(x);
				2122	goto onError;
				2123	}
				2124	s++ = PyString_AS_STRING(x);
				2125	}
				2126	else {
				2127	/* wrong return value */
				2128	PyErr_SetString(PyExc_TypeError,
				2129	"character mapping must return integer, None or unicode");
				2130	Py_DECREF(x);
				2131	goto onError;
				2132	}
				2133	Py_DECREF(x);
				2134	}
				2135	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2136	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2137	goto onError;
				2138	return v;
				2139
				2140	onError:
				2141	Py_DECREF(v);
				2142	return NULL;
				2143	}
				2144
				2145	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2146	PyObject *mapping)
				2147	{
				2148	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2149	PyErr_BadArgument();
				2150	return NULL;
				2151	}
				2152	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2153	PyUnicode_GET_SIZE(unicode),
				2154	mapping,
				2155	NULL);
				2156	}
				2157
				2158	static
				2159	int translate_error(const Py_UNICODE **source,
				2160	Py_UNICODE **dest,
				2161	const char *errors,
				2162	const char *details)
				2163	{
				2164	if ((errors == NULL) \|\|
				2165	(strcmp(errors,"strict") == 0)) {
				2166	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2167	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2168	details);
				2169	return -1;
				2170	}
				2171	else if (strcmp(errors,"ignore") == 0) {
				2172	return 0;
				2173	}
				2174	else if (strcmp(errors,"replace") == 0) {
				2175	**dest = '?';
				2176	(*dest)++;
				2177	return 0;
				2178	}
				2179	else {
				2180	PyErr_Format(PyExc_ValueError,
				2181	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2182	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2183	errors);
				2184	return -1;
				2185	}
				2186	}
				2187
				2188	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2189	int size,
				2190	PyObject *mapping,
				2191	const char *errors)
				2192	{
				2193	PyUnicodeObject *v;
				2194	Py_UNICODE *p;
				2195
				2196	if (mapping == NULL) {
				2197	PyErr_BadArgument();
				2198	return NULL;
				2199	}
				2200
				2201	/* Output will never be longer than input */
				2202	v = _PyUnicode_New(size);
				2203	if (v == NULL)
				2204	goto onError;
				2205	if (size == 0)
				2206	goto done;
				2207	p = PyUnicode_AS_UNICODE(v);
				2208	while (size-- > 0) {
				2209	Py_UNICODE ch = *s++;
				2210	PyObject w, x;
				2211
				2212	/* Get mapping */
				2213	w = PyInt_FromLong(ch);
				2214	if (w == NULL)
				2215	goto onError;
				2216	x = PyObject_GetItem(mapping, w);
				2217	Py_DECREF(w);
				2218	if (x == NULL) {
				2219	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2220	/* No mapping found: default to 1-1 mapping */
				2221	PyErr_Clear();
				2222	*p++ = ch;
				2223	continue;
				2224	}
				2225	goto onError;
				2226	}
				2227
				2228	/* Apply mapping */
				2229	if (PyInt_Check(x))
				2230	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2231	else if (x == Py_None) {
				2232	/* undefined mapping */
				2233	if (translate_error(&s, &p, errors,
				2234	"character maps to <undefined>")) {
				2235	Py_DECREF(x);
				2236	goto onError;
				2237	}
				2238	}
				2239	else if (PyUnicode_Check(x)) {
				2240	if (PyUnicode_GET_SIZE(x) != 1) {
				2241	/* 1-n mapping */
				2242	PyErr_SetString(PyExc_NotImplementedError,
				2243	"1-n mappings are currently not implemented");
				2244	Py_DECREF(x);
				2245	goto onError;
				2246	}
				2247	p++ = PyUnicode_AS_UNICODE(x);
				2248	}
				2249	else {
				2250	/* wrong return value */
				2251	PyErr_SetString(PyExc_TypeError,
				2252	"translate mapping must return integer, None or unicode");
				2253	Py_DECREF(x);
				2254	goto onError;
				2255	}
				2256	Py_DECREF(x);
				2257	}
				2258	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2259	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2260	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2261
				2262	done:
				2263	return (PyObject *)v;
				2264
				2265	onError:
				2266	Py_XDECREF(v);
				2267	return NULL;
				2268	}
				2269
				2270	PyObject PyUnicode_Translate(PyObject str,
				2271	PyObject *mapping,
				2272	const char *errors)
				2273	{
				2274	PyObject *result;
				2275
				2276	str = PyUnicode_FromObject(str);
				2277	if (str == NULL)
				2278	goto onError;
				2279	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2280	PyUnicode_GET_SIZE(str),
				2281	mapping,
				2282	errors);
				2283	Py_DECREF(str);
				2284	return result;
				2285
				2286	onError:
				2287	Py_XDECREF(str);
				2288	return NULL;
				2289	}
				2290
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2291	/* --- Decimal Encoder ---------------------------------------------------- */
				2292
				2293	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2294	int length,
				2295	char *output,
				2296	const char *errors)
				2297	{
				2298	Py_UNICODE p, end;
				2299
				2300	if (output == NULL) {
				2301	PyErr_BadArgument();
				2302	return -1;
				2303	}
				2304
				2305	p = s;
				2306	end = s + length;
				2307	while (p < end) {
				2308	register Py_UNICODE ch = *p++;
				2309	int decimal;
				2310
				2311	if (Py_UNICODE_ISSPACE(ch)) {
				2312	*output++ = ' ';
				2313	continue;
				2314	}
				2315	decimal = Py_UNICODE_TODECIMAL(ch);
				2316	if (decimal >= 0) {
				2317	*output++ = '0' + decimal;
				2318	continue;
				2319	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2320	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2321	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2322	continue;
				2323	}
				2324	/* All other characters are considered invalid */
				2325	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2326	PyErr_SetString(PyExc_ValueError,
				2327	"invalid decimal Unicode string");
				2328	goto onError;
				2329	}
				2330	else if (strcmp(errors, "ignore") == 0)
				2331	continue;
				2332	else if (strcmp(errors, "replace") == 0) {
				2333	*output++ = '?';
				2334	continue;
				2335	}
				2336	}
				2337	/* 0-terminate the output string */
				2338	*output++ = '\0';
				2339	return 0;
				2340
				2341	onError:
				2342	return -1;
				2343	}
				2344
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2345	/* --- Helpers ------------------------------------------------------------ */
				2346
				2347	static
				2348	int count(PyUnicodeObject *self,
				2349	int start,
				2350	int end,
				2351	PyUnicodeObject *substring)
				2352	{
				2353	int count = 0;
				2354
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2355	if (substring->length == 0)
				2356	return (end - start + 1);
				2357
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2358	end -= substring->length;
				2359
				2360	while (start <= end)
				2361	if (Py_UNICODE_MATCH(self, start, substring)) {
				2362	count++;
				2363	start += substring->length;
				2364	} else
				2365	start++;
				2366
				2367	return count;
				2368	}
				2369
				2370	int PyUnicode_Count(PyObject *str,
				2371	PyObject *substr,
				2372	int start,
				2373	int end)
				2374	{
				2375	int result;
				2376
				2377	str = PyUnicode_FromObject(str);
				2378	if (str == NULL)
				2379	return -1;
				2380	substr = PyUnicode_FromObject(substr);
				2381	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2382	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2383	return -1;
				2384	}
				2385
				2386	result = count((PyUnicodeObject *)str,
				2387	start, end,
				2388	(PyUnicodeObject *)substr);
				2389
				2390	Py_DECREF(str);
				2391	Py_DECREF(substr);
				2392	return result;
				2393	}
				2394
				2395	static
				2396	int findstring(PyUnicodeObject *self,
				2397	PyUnicodeObject *substring,
				2398	int start,
				2399	int end,
				2400	int direction)
				2401	{
				2402	if (start < 0)
				2403	start += self->length;
				2404	if (start < 0)
				2405	start = 0;
				2406
				2407	if (substring->length == 0)
				2408	return start;
				2409
				2410	if (end > self->length)
				2411	end = self->length;
				2412	if (end < 0)
				2413	end += self->length;
				2414	if (end < 0)
				2415	end = 0;
				2416
				2417	end -= substring->length;
				2418
				2419	if (direction < 0) {
				2420	for (; end >= start; end--)
				2421	if (Py_UNICODE_MATCH(self, end, substring))
				2422	return end;
				2423	} else {
				2424	for (; start <= end; start++)
				2425	if (Py_UNICODE_MATCH(self, start, substring))
				2426	return start;
				2427	}
				2428
				2429	return -1;
				2430	}
				2431
				2432	int PyUnicode_Find(PyObject *str,
				2433	PyObject *substr,
				2434	int start,
				2435	int end,
				2436	int direction)
				2437	{
				2438	int result;
				2439
				2440	str = PyUnicode_FromObject(str);
				2441	if (str == NULL)
				2442	return -1;
				2443	substr = PyUnicode_FromObject(substr);
				2444	if (substr == NULL) {
				2445	Py_DECREF(substr);
				2446	return -1;
				2447	}
				2448
				2449	result = findstring((PyUnicodeObject *)str,
				2450	(PyUnicodeObject *)substr,
				2451	start, end, direction);
				2452	Py_DECREF(str);
				2453	Py_DECREF(substr);
				2454	return result;
				2455	}
				2456
				2457	static
				2458	int tailmatch(PyUnicodeObject *self,
				2459	PyUnicodeObject *substring,
				2460	int start,
				2461	int end,
				2462	int direction)
				2463	{
				2464	if (start < 0)
				2465	start += self->length;
				2466	if (start < 0)
				2467	start = 0;
				2468
				2469	if (substring->length == 0)
				2470	return 1;
				2471
				2472	if (end > self->length)
				2473	end = self->length;
				2474	if (end < 0)
				2475	end += self->length;
				2476	if (end < 0)
				2477	end = 0;
				2478
				2479	end -= substring->length;
				2480	if (end < start)
				2481	return 0;
				2482
				2483	if (direction > 0) {
				2484	if (Py_UNICODE_MATCH(self, end, substring))
				2485	return 1;
				2486	} else {
				2487	if (Py_UNICODE_MATCH(self, start, substring))
				2488	return 1;
				2489	}
				2490
				2491	return 0;
				2492	}
				2493
				2494	int PyUnicode_Tailmatch(PyObject *str,
				2495	PyObject *substr,
				2496	int start,
				2497	int end,
				2498	int direction)
				2499	{
				2500	int result;
				2501
				2502	str = PyUnicode_FromObject(str);
				2503	if (str == NULL)
				2504	return -1;
				2505	substr = PyUnicode_FromObject(substr);
				2506	if (substr == NULL) {
				2507	Py_DECREF(substr);
				2508	return -1;
				2509	}
				2510
				2511	result = tailmatch((PyUnicodeObject *)str,
				2512	(PyUnicodeObject *)substr,
				2513	start, end, direction);
				2514	Py_DECREF(str);
				2515	Py_DECREF(substr);
				2516	return result;
				2517	}
				2518
				2519	static
				2520	const Py_UNICODE findchar(const Py_UNICODE s,
				2521	int size,
				2522	Py_UNICODE ch)
				2523	{
				2524	/* like wcschr, but doesn't stop at NULL characters */
				2525
				2526	while (size-- > 0) {
				2527	if (*s == ch)
				2528	return s;
				2529	s++;
				2530	}
				2531
				2532	return NULL;
				2533	}
				2534
				2535	/* Apply fixfct filter to the Unicode object self and return a
				2536	reference to the modified object */
				2537
				2538	static
				2539	PyObject fixup(PyUnicodeObject self,
				2540	int (fixfct)(PyUnicodeObject s))
				2541	{
				2542
				2543	PyUnicodeObject *u;
				2544
				2545	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2546	self->length);
				2547	if (u == NULL)
				2548	return NULL;
				2549	if (!fixfct(u)) {
				2550	/* fixfct should return TRUE if it modified the buffer. If
				2551	FALSE, return a reference to the original buffer instead
				2552	(to save space, not time) */
				2553	Py_INCREF(self);
				2554	Py_DECREF(u);
				2555	return (PyObject*) self;
				2556	}
				2557	return (PyObject*) u;
				2558	}
				2559
				2560	static
				2561	int fixupper(PyUnicodeObject *self)
				2562	{
				2563	int len = self->length;
				2564	Py_UNICODE *s = self->str;
				2565	int status = 0;
				2566
				2567	while (len-- > 0) {
				2568	register Py_UNICODE ch;
				2569
				2570	ch = Py_UNICODE_TOUPPER(*s);
				2571	if (ch != *s) {
				2572	status = 1;
				2573	*s = ch;
				2574	}
				2575	s++;
				2576	}
				2577
				2578	return status;
				2579	}
				2580
				2581	static
				2582	int fixlower(PyUnicodeObject *self)
				2583	{
				2584	int len = self->length;
				2585	Py_UNICODE *s = self->str;
				2586	int status = 0;
				2587
				2588	while (len-- > 0) {
				2589	register Py_UNICODE ch;
				2590
				2591	ch = Py_UNICODE_TOLOWER(*s);
				2592	if (ch != *s) {
				2593	status = 1;
				2594	*s = ch;
				2595	}
				2596	s++;
				2597	}
				2598
				2599	return status;
				2600	}
				2601
				2602	static
				2603	int fixswapcase(PyUnicodeObject *self)
				2604	{
				2605	int len = self->length;
				2606	Py_UNICODE *s = self->str;
				2607	int status = 0;
				2608
				2609	while (len-- > 0) {
				2610	if (Py_UNICODE_ISUPPER(*s)) {
				2611	s = Py_UNICODE_TOLOWER(s);
				2612	status = 1;
				2613	} else if (Py_UNICODE_ISLOWER(*s)) {
				2614	s = Py_UNICODE_TOUPPER(s);
				2615	status = 1;
				2616	}
				2617	s++;
				2618	}
				2619
				2620	return status;
				2621	}
				2622
				2623	static
				2624	int fixcapitalize(PyUnicodeObject *self)
				2625	{
				2626	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2627	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2628	return 1;
				2629	}
				2630	return 0;
				2631	}
				2632
				2633	static
				2634	int fixtitle(PyUnicodeObject *self)
				2635	{
				2636	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2637	register Py_UNICODE *e;
				2638	int previous_is_cased;
				2639
				2640	/* Shortcut for single character strings */
				2641	if (PyUnicode_GET_SIZE(self) == 1) {
				2642	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2643	if (*p != ch) {
				2644	*p = ch;
				2645	return 1;
				2646	}
				2647	else
				2648	return 0;
				2649	}
				2650
				2651	e = p + PyUnicode_GET_SIZE(self);
				2652	previous_is_cased = 0;
				2653	for (; p < e; p++) {
				2654	register const Py_UNICODE ch = *p;
				2655
				2656	if (previous_is_cased)
				2657	*p = Py_UNICODE_TOLOWER(ch);
				2658	else
				2659	*p = Py_UNICODE_TOTITLE(ch);
				2660
				2661	if (Py_UNICODE_ISLOWER(ch) \|\|
				2662	Py_UNICODE_ISUPPER(ch) \|\|
				2663	Py_UNICODE_ISTITLE(ch))
				2664	previous_is_cased = 1;
				2665	else
				2666	previous_is_cased = 0;
				2667	}
				2668	return 1;
				2669	}
				2670
				2671	PyObject PyUnicode_Join(PyObject separator,
				2672	PyObject *seq)
				2673	{
				2674	Py_UNICODE *sep;
				2675	int seplen;
				2676	PyUnicodeObject *res = NULL;
				2677	int reslen = 0;
				2678	Py_UNICODE *p;
				2679	int seqlen = 0;
				2680	int sz = 100;
				2681	int i;
				2682
Jeremy Hylton	03657cf	2000-07-12 13:05:33 +0000	[diff] [blame]	2683	seqlen = PySequence_Size(seq);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2684	if (seqlen < 0 && PyErr_Occurred())
				2685	return NULL;
				2686
				2687	if (separator == NULL) {
				2688	Py_UNICODE blank = ' ';
				2689	sep = &blank;
				2690	seplen = 1;
				2691	}
				2692	else {
				2693	separator = PyUnicode_FromObject(separator);
				2694	if (separator == NULL)
				2695	return NULL;
				2696	sep = PyUnicode_AS_UNICODE(separator);
				2697	seplen = PyUnicode_GET_SIZE(separator);
				2698	}
				2699
				2700	res = _PyUnicode_New(sz);
				2701	if (res == NULL)
				2702	goto onError;
				2703	p = PyUnicode_AS_UNICODE(res);
				2704	reslen = 0;
				2705
				2706	for (i = 0; i < seqlen; i++) {
				2707	int itemlen;
				2708	PyObject *item;
				2709
				2710	item = PySequence_GetItem(seq, i);
				2711	if (item == NULL)
				2712	goto onError;
				2713	if (!PyUnicode_Check(item)) {
				2714	PyObject *v;
				2715	v = PyUnicode_FromObject(item);
				2716	Py_DECREF(item);
				2717	item = v;
				2718	if (item == NULL)
				2719	goto onError;
				2720	}
				2721	itemlen = PyUnicode_GET_SIZE(item);
				2722	while (reslen + itemlen + seplen >= sz) {
				2723	if (_PyUnicode_Resize(res, sz*2))
				2724	goto onError;
				2725	sz *= 2;
				2726	p = PyUnicode_AS_UNICODE(res) + reslen;
				2727	}
				2728	if (i > 0) {
				2729	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2730	p += seplen;
				2731	reslen += seplen;
				2732	}
				2733	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2734	p += itemlen;
				2735	reslen += itemlen;
				2736	Py_DECREF(item);
				2737	}
				2738	if (_PyUnicode_Resize(res, reslen))
				2739	goto onError;
				2740
				2741	Py_XDECREF(separator);
				2742	return (PyObject *)res;
				2743
				2744	onError:
				2745	Py_XDECREF(separator);
				2746	Py_DECREF(res);
				2747	return NULL;
				2748	}
				2749
				2750	static
				2751	PyUnicodeObject pad(PyUnicodeObject self,
				2752	int left,
				2753	int right,
				2754	Py_UNICODE fill)
				2755	{
				2756	PyUnicodeObject *u;
				2757
				2758	if (left < 0)
				2759	left = 0;
				2760	if (right < 0)
				2761	right = 0;
				2762
				2763	if (left == 0 && right == 0) {
				2764	Py_INCREF(self);
				2765	return self;
				2766	}
				2767
				2768	u = _PyUnicode_New(left + self->length + right);
				2769	if (u) {
				2770	if (left)
				2771	Py_UNICODE_FILL(u->str, fill, left);
				2772	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2773	if (right)
				2774	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2775	}
				2776
				2777	return u;
				2778	}
				2779
				2780	#define SPLIT_APPEND(data, left, right) \
				2781	str = PyUnicode_FromUnicode(data + left, right - left); \
				2782	if (!str) \
				2783	goto onError; \
				2784	if (PyList_Append(list, str)) { \
				2785	Py_DECREF(str); \
				2786	goto onError; \
				2787	} \
				2788	else \
				2789	Py_DECREF(str);
				2790
				2791	static
				2792	PyObject split_whitespace(PyUnicodeObject self,
				2793	PyObject *list,
				2794	int maxcount)
				2795	{
				2796	register int i;
				2797	register int j;
				2798	int len = self->length;
				2799	PyObject *str;
				2800
				2801	for (i = j = 0; i < len; ) {
				2802	/* find a token */
				2803	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2804	i++;
				2805	j = i;
				2806	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2807	i++;
				2808	if (j < i) {
				2809	if (maxcount-- <= 0)
				2810	break;
				2811	SPLIT_APPEND(self->str, j, i);
				2812	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2813	i++;
				2814	j = i;
				2815	}
				2816	}
				2817	if (j < len) {
				2818	SPLIT_APPEND(self->str, j, len);
				2819	}
				2820	return list;
				2821
				2822	onError:
				2823	Py_DECREF(list);
				2824	return NULL;
				2825	}
				2826
				2827	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2828	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2829	{
				2830	register int i;
				2831	register int j;
				2832	int len;
				2833	PyObject *list;
				2834	PyObject *str;
				2835	Py_UNICODE *data;
				2836
				2837	string = PyUnicode_FromObject(string);
				2838	if (string == NULL)
				2839	return NULL;
				2840	data = PyUnicode_AS_UNICODE(string);
				2841	len = PyUnicode_GET_SIZE(string);
				2842
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2843	list = PyList_New(0);
				2844	if (!list)
				2845	goto onError;
				2846
				2847	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2848	int eol;
				2849
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2850	/* Find a line and append it */
				2851	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2852	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2853
				2854	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2855	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2856	if (i < len) {
				2857	if (data[i] == '\r' && i + 1 < len &&
				2858	data[i+1] == '\n')
				2859	i += 2;
				2860	else
				2861	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2862	if (keepends)
				2863	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2864	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2865	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2866	j = i;
				2867	}
				2868	if (j < len) {
				2869	SPLIT_APPEND(data, j, len);
				2870	}
				2871
				2872	Py_DECREF(string);
				2873	return list;
				2874
				2875	onError:
				2876	Py_DECREF(list);
				2877	Py_DECREF(string);
				2878	return NULL;
				2879	}
				2880
				2881	static
				2882	PyObject split_char(PyUnicodeObject self,
				2883	PyObject *list,
				2884	Py_UNICODE ch,
				2885	int maxcount)
				2886	{
				2887	register int i;
				2888	register int j;
				2889	int len = self->length;
				2890	PyObject *str;
				2891
				2892	for (i = j = 0; i < len; ) {
				2893	if (self->str[i] == ch) {
				2894	if (maxcount-- <= 0)
				2895	break;
				2896	SPLIT_APPEND(self->str, j, i);
				2897	i = j = i + 1;
				2898	} else
				2899	i++;
				2900	}
				2901	if (j <= len) {
				2902	SPLIT_APPEND(self->str, j, len);
				2903	}
				2904	return list;
				2905
				2906	onError:
				2907	Py_DECREF(list);
				2908	return NULL;
				2909	}
				2910
				2911	static
				2912	PyObject split_substring(PyUnicodeObject self,
				2913	PyObject *list,
				2914	PyUnicodeObject *substring,
				2915	int maxcount)
				2916	{
				2917	register int i;
				2918	register int j;
				2919	int len = self->length;
				2920	int sublen = substring->length;
				2921	PyObject *str;
				2922
				2923	for (i = j = 0; i < len - sublen; ) {
				2924	if (Py_UNICODE_MATCH(self, i, substring)) {
				2925	if (maxcount-- <= 0)
				2926	break;
				2927	SPLIT_APPEND(self->str, j, i);
				2928	i = j = i + sublen;
				2929	} else
				2930	i++;
				2931	}
				2932	if (j <= len) {
				2933	SPLIT_APPEND(self->str, j, len);
				2934	}
				2935	return list;
				2936
				2937	onError:
				2938	Py_DECREF(list);
				2939	return NULL;
				2940	}
				2941
				2942	#undef SPLIT_APPEND
				2943
				2944	static
				2945	PyObject split(PyUnicodeObject self,
				2946	PyUnicodeObject *substring,
				2947	int maxcount)
				2948	{
				2949	PyObject *list;
				2950
				2951	if (maxcount < 0)
				2952	maxcount = INT_MAX;
				2953
				2954	list = PyList_New(0);
				2955	if (!list)
				2956	return NULL;
				2957
				2958	if (substring == NULL)
				2959	return split_whitespace(self,list,maxcount);
				2960
				2961	else if (substring->length == 1)
				2962	return split_char(self,list,substring->str[0],maxcount);
				2963
				2964	else if (substring->length == 0) {
				2965	Py_DECREF(list);
				2966	PyErr_SetString(PyExc_ValueError, "empty separator");
				2967	return NULL;
				2968	}
				2969	else
				2970	return split_substring(self,list,substring,maxcount);
				2971	}
				2972
				2973	static
				2974	PyObject strip(PyUnicodeObject self,
				2975	int left,
				2976	int right)
				2977	{
				2978	Py_UNICODE *p = self->str;
				2979	int start = 0;
				2980	int end = self->length;
				2981
				2982	if (left)
				2983	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2984	start++;
				2985
				2986	if (right)
				2987	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2988	end--;
				2989
				2990	if (start == 0 && end == self->length) {
				2991	/* couldn't strip anything off, return original string */
				2992	Py_INCREF(self);
				2993	return (PyObject*) self;
				2994	}
				2995
				2996	return (PyObject*) PyUnicode_FromUnicode(
				2997	self->str + start,
				2998	end - start
				2999	);
				3000	}
				3001
				3002	static
				3003	PyObject replace(PyUnicodeObject self,
				3004	PyUnicodeObject *str1,
				3005	PyUnicodeObject *str2,
				3006	int maxcount)
				3007	{
				3008	PyUnicodeObject *u;
				3009
				3010	if (maxcount < 0)
				3011	maxcount = INT_MAX;
				3012
				3013	if (str1->length == 1 && str2->length == 1) {
				3014	int i;
				3015
				3016	/* replace characters */
				3017	if (!findchar(self->str, self->length, str1->str[0])) {
				3018	/* nothing to replace, return original string */
				3019	Py_INCREF(self);
				3020	u = self;
				3021	} else {
				3022	Py_UNICODE u1 = str1->str[0];
				3023	Py_UNICODE u2 = str2->str[0];
				3024
				3025	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				3026	self->str,
				3027	self->length
				3028	);
				3029	if (u)
				3030	for (i = 0; i < u->length; i++)
				3031	if (u->str[i] == u1) {
				3032	if (--maxcount < 0)
				3033	break;
				3034	u->str[i] = u2;
				3035	}
				3036	}
				3037
				3038	} else {
				3039	int n, i;
				3040	Py_UNICODE *p;
				3041
				3042	/* replace strings */
				3043	n = count(self, 0, self->length, str1);
				3044	if (n > maxcount)
				3045	n = maxcount;
				3046	if (n == 0) {
				3047	/* nothing to replace, return original string */
				3048	Py_INCREF(self);
				3049	u = self;
				3050	} else {
				3051	u = _PyUnicode_New(
				3052	self->length + n * (str2->length - str1->length));
				3053	if (u) {
				3054	i = 0;
				3055	p = u->str;
				3056	while (i <= self->length - str1->length)
				3057	if (Py_UNICODE_MATCH(self, i, str1)) {
				3058	/* replace string segment */
				3059	Py_UNICODE_COPY(p, str2->str, str2->length);
				3060	p += str2->length;
				3061	i += str1->length;
				3062	if (--n <= 0) {
				3063	/* copy remaining part */
				3064	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3065	break;
				3066	}
				3067	} else
				3068	*p++ = self->str[i++];
				3069	}
				3070	}
				3071	}
				3072
				3073	return (PyObject *) u;
				3074	}
				3075
				3076	/* --- Unicode Object Methods --------------------------------------------- */
				3077
				3078	static char title__doc__[] =
				3079	"S.title() -> unicode\n\
				3080	\n\
				3081	Return a titlecased version of S, i.e. words start with title case\n\
				3082	characters, all remaining cased characters have lower case.";
				3083
				3084	static PyObject*
				3085	unicode_title(PyUnicodeObject self, PyObject args)
				3086	{
				3087	if (!PyArg_NoArgs(args))
				3088	return NULL;
				3089	return fixup(self, fixtitle);
				3090	}
				3091
				3092	static char capitalize__doc__[] =
				3093	"S.capitalize() -> unicode\n\
				3094	\n\
				3095	Return a capitalized version of S, i.e. make the first character\n\
				3096	have upper case.";
				3097
				3098	static PyObject*
				3099	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3100	{
				3101	if (!PyArg_NoArgs(args))
				3102	return NULL;
				3103	return fixup(self, fixcapitalize);
				3104	}
				3105
				3106	#if 0
				3107	static char capwords__doc__[] =
				3108	"S.capwords() -> unicode\n\
				3109	\n\
				3110	Apply .capitalize() to all words in S and return the result with\n\
				3111	normalized whitespace (all whitespace strings are replaced by ' ').";
				3112
				3113	static PyObject*
				3114	unicode_capwords(PyUnicodeObject self, PyObject args)
				3115	{
				3116	PyObject *list;
				3117	PyObject *item;
				3118	int i;
				3119
				3120	if (!PyArg_NoArgs(args))
				3121	return NULL;
				3122
				3123	/* Split into words */
				3124	list = split(self, NULL, -1);
				3125	if (!list)
				3126	return NULL;
				3127
				3128	/* Capitalize each word */
				3129	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3130	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3131	fixcapitalize);
				3132	if (item == NULL)
				3133	goto onError;
				3134	Py_DECREF(PyList_GET_ITEM(list, i));
				3135	PyList_SET_ITEM(list, i, item);
				3136	}
				3137
				3138	/* Join the words to form a new string */
				3139	item = PyUnicode_Join(NULL, list);
				3140
				3141	onError:
				3142	Py_DECREF(list);
				3143	return (PyObject *)item;
				3144	}
				3145	#endif
				3146
				3147	static char center__doc__[] =
				3148	"S.center(width) -> unicode\n\
				3149	\n\
				3150	Return S centered in a Unicode string of length width. Padding is done\n\
				3151	using spaces.";
				3152
				3153	static PyObject *
				3154	unicode_center(PyUnicodeObject self, PyObject args)
				3155	{
				3156	int marg, left;
				3157	int width;
				3158
				3159	if (!PyArg_ParseTuple(args, "i:center", &width))
				3160	return NULL;
				3161
				3162	if (self->length >= width) {
				3163	Py_INCREF(self);
				3164	return (PyObject*) self;
				3165	}
				3166
				3167	marg = width - self->length;
				3168	left = marg / 2 + (marg & width & 1);
				3169
				3170	return (PyObject*) pad(self, left, marg - left, ' ');
				3171	}
				3172
/* speedy UTF-16 code point order comparison */
/* gleaned from: */
/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */

/* Fix-up offsets indexed by the top five bits of a 16-bit code unit
   (c >> 11).  Entry 27 (0xD800-0xDFFF, the surrogates) moves units up
   by 0x2000; entries 28-31 (0xE000-0xFFFF) move units down by 0x800.
   After the fix-up, surrogate code units compare greater than every
   other code unit.  The table is never written, hence const. */
static const short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
				3184
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3185	static int
				3186	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3187	{
				3188	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3189
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3190	Py_UNICODE *s1 = str1->str;
				3191	Py_UNICODE *s2 = str2->str;
				3192
				3193	len1 = str1->length;
				3194	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3195
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3196	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3197	Py_UNICODE c1, c2;
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame]	3198	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3199
				3200	c1 = *s1++;
				3201	c2 = *s2++;
				3202	if (c1 > (1<<11) * 26)
				3203	c1 += utf16Fixup[c1>>11];
				3204	if (c2 > (1<<11) * 26)
				3205	c2 += utf16Fixup[c2>>11];
				3206
				3207	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3208	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3209	if (diff)
				3210	return (diff < 0) ? -1 : (diff != 0);
				3211	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3212	}
				3213
				3214	return (len1 < len2) ? -1 : (len1 != len2);
				3215	}
				3216
				3217	int PyUnicode_Compare(PyObject *left,
				3218	PyObject *right)
				3219	{
				3220	PyUnicodeObject u = NULL, v = NULL;
				3221	int result;
				3222
				3223	/* Coerce the two arguments */
				3224	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3225	if (u == NULL)
				3226	goto onError;
				3227	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3228	if (v == NULL)
				3229	goto onError;
				3230
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3231	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3232	if (v == u) {
				3233	Py_DECREF(u);
				3234	Py_DECREF(v);
				3235	return 0;
				3236	}
				3237
				3238	result = unicode_compare(u, v);
				3239
				3240	Py_DECREF(u);
				3241	Py_DECREF(v);
				3242	return result;
				3243
				3244	onError:
				3245	Py_XDECREF(u);
				3246	Py_XDECREF(v);
				3247	return -1;
				3248	}
				3249
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3250	int PyUnicode_Contains(PyObject *container,
				3251	PyObject *element)
				3252	{
				3253	PyUnicodeObject u = NULL, v = NULL;
				3254	int result;
				3255	register const Py_UNICODE p, e;
				3256	register Py_UNICODE ch;
				3257
				3258	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3259	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3260	if (v == NULL) {
				3261	PyErr_SetString(PyExc_TypeError,
				3262	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3263	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3264	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3265	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3266	if (u == NULL) {
				3267	Py_DECREF(v);
				3268	goto onError;
				3269	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3270
				3271	/* Check v in u */
				3272	if (PyUnicode_GET_SIZE(v) != 1) {
				3273	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3274	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3275	goto onError;
				3276	}
				3277	ch = *PyUnicode_AS_UNICODE(v);
				3278	p = PyUnicode_AS_UNICODE(u);
				3279	e = p + PyUnicode_GET_SIZE(u);
				3280	result = 0;
				3281	while (p < e) {
				3282	if (*p++ == ch) {
				3283	result = 1;
				3284	break;
				3285	}
				3286	}
				3287
				3288	Py_DECREF(u);
				3289	Py_DECREF(v);
				3290	return result;
				3291
				3292	onError:
				3293	Py_XDECREF(u);
				3294	Py_XDECREF(v);
				3295	return -1;
				3296	}
				3297
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3298	/* Concat to string or Unicode object giving a new Unicode object. */
				3299
				3300	PyObject PyUnicode_Concat(PyObject left,
				3301	PyObject *right)
				3302	{
				3303	PyUnicodeObject u = NULL, v = NULL, *w;
				3304
				3305	/* Coerce the two arguments */
				3306	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3307	if (u == NULL)
				3308	goto onError;
				3309	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3310	if (v == NULL)
				3311	goto onError;
				3312
				3313	/* Shortcuts */
				3314	if (v == unicode_empty) {
				3315	Py_DECREF(v);
				3316	return (PyObject *)u;
				3317	}
				3318	if (u == unicode_empty) {
				3319	Py_DECREF(u);
				3320	return (PyObject *)v;
				3321	}
				3322
				3323	/* Concat the two Unicode strings */
				3324	w = _PyUnicode_New(u->length + v->length);
				3325	if (w == NULL)
				3326	goto onError;
				3327	Py_UNICODE_COPY(w->str, u->str, u->length);
				3328	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3329
				3330	Py_DECREF(u);
				3331	Py_DECREF(v);
				3332	return (PyObject *)w;
				3333
				3334	onError:
				3335	Py_XDECREF(u);
				3336	Py_XDECREF(v);
				3337	return NULL;
				3338	}
				3339
				3340	static char count__doc__[] =
				3341	"S.count(sub[, start[, end]]) -> int\n\
				3342	\n\
				3343	Return the number of occurrences of substring sub in Unicode string\n\
				3344	S[start:end]. Optional arguments start and end are\n\
				3345	interpreted as in slice notation.";
				3346
				3347	static PyObject *
				3348	unicode_count(PyUnicodeObject self, PyObject args)
				3349	{
				3350	PyUnicodeObject *substring;
				3351	int start = 0;
				3352	int end = INT_MAX;
				3353	PyObject *result;
				3354
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3355	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3356	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3357	return NULL;
				3358
				3359	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3360	(PyObject *)substring);
				3361	if (substring == NULL)
				3362	return NULL;
				3363
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3364	if (start < 0)
				3365	start += self->length;
				3366	if (start < 0)
				3367	start = 0;
				3368	if (end > self->length)
				3369	end = self->length;
				3370	if (end < 0)
				3371	end += self->length;
				3372	if (end < 0)
				3373	end = 0;
				3374
				3375	result = PyInt_FromLong((long) count(self, start, end, substring));
				3376
				3377	Py_DECREF(substring);
				3378	return result;
				3379	}
				3380
				3381	static char encode__doc__[] =
				3382	"S.encode([encoding[,errors]]) -> string\n\
				3383	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3384	Return an encoded string version of S. Default encoding is the current\n\
				3385	default string encoding. errors may be given to set a different error\n\
				3386	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3387	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3388
				3389	static PyObject *
				3390	unicode_encode(PyUnicodeObject self, PyObject args)
				3391	{
				3392	char *encoding = NULL;
				3393	char *errors = NULL;
				3394	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3395	return NULL;
				3396	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3397	}
				3398
				3399	static char expandtabs__doc__[] =
				3400	"S.expandtabs([tabsize]) -> unicode\n\
				3401	\n\
				3402	Return a copy of S where all tab characters are expanded using spaces.\n\
				3403	If tabsize is not given, a tab size of 8 characters is assumed.";
				3404
				3405	static PyObject*
				3406	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3407	{
				3408	Py_UNICODE *e;
				3409	Py_UNICODE *p;
				3410	Py_UNICODE *q;
				3411	int i, j;
				3412	PyUnicodeObject *u;
				3413	int tabsize = 8;
				3414
				3415	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3416	return NULL;
				3417
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3418	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3419	i = j = 0;
				3420	e = self->str + self->length;
				3421	for (p = self->str; p < e; p++)
				3422	if (*p == '\t') {
				3423	if (tabsize > 0)
				3424	j += tabsize - (j % tabsize);
				3425	}
				3426	else {
				3427	j++;
				3428	if (p == '\n' \|\| p == '\r') {
				3429	i += j;
				3430	j = 0;
				3431	}
				3432	}
				3433
				3434	/* Second pass: create output string and fill it */
				3435	u = _PyUnicode_New(i + j);
				3436	if (!u)
				3437	return NULL;
				3438
				3439	j = 0;
				3440	q = u->str;
				3441
				3442	for (p = self->str; p < e; p++)
				3443	if (*p == '\t') {
				3444	if (tabsize > 0) {
				3445	i = tabsize - (j % tabsize);
				3446	j += i;
				3447	while (i--)
				3448	*q++ = ' ';
				3449	}
				3450	}
				3451	else {
				3452	j++;
				3453	q++ = p;
				3454	if (p == '\n' \|\| p == '\r')
				3455	j = 0;
				3456	}
				3457
				3458	return (PyObject*) u;
				3459	}
				3460
				3461	static char find__doc__[] =
				3462	"S.find(sub [,start [,end]]) -> int\n\
				3463	\n\
				3464	Return the lowest index in S where substring sub is found,\n\
				3465	such that sub is contained within s[start,end]. Optional\n\
				3466	arguments start and end are interpreted as in slice notation.\n\
				3467	\n\
				3468	Return -1 on failure.";
				3469
				3470	static PyObject *
				3471	unicode_find(PyUnicodeObject self, PyObject args)
				3472	{
				3473	PyUnicodeObject *substring;
				3474	int start = 0;
				3475	int end = INT_MAX;
				3476	PyObject *result;
				3477
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3478	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3479	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3480	return NULL;
				3481	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3482	(PyObject *)substring);
				3483	if (substring == NULL)
				3484	return NULL;
				3485
				3486	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3487
				3488	Py_DECREF(substring);
				3489	return result;
				3490	}
				3491
				3492	static PyObject *
				3493	unicode_getitem(PyUnicodeObject *self, int index)
				3494	{
				3495	if (index < 0 \|\| index >= self->length) {
				3496	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3497	return NULL;
				3498	}
				3499
				3500	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3501	}
				3502
				3503	static long
				3504	unicode_hash(PyUnicodeObject *self)
				3505	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3506	/* Since Unicode objects compare equal to their ASCII string
				3507	counterparts, they should use the individual character values
				3508	as basis for their hash value. This is needed to assure that
				3509	strings and Unicode objects behave in the same way as
				3510	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3511
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3512	register int len;
				3513	register Py_UNICODE *p;
				3514	register long x;
				3515
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3516	if (self->hash != -1)
				3517	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3518	len = PyUnicode_GET_SIZE(self);
				3519	p = PyUnicode_AS_UNICODE(self);
				3520	x = *p << 7;
				3521	while (--len >= 0)
				3522	x = (1000003x) ^ p++;
				3523	x ^= PyUnicode_GET_SIZE(self);
				3524	if (x == -1)
				3525	x = -2;
				3526	self->hash = x;
				3527	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3528	}
				3529
				3530	static char index__doc__[] =
				3531	"S.index(sub [,start [,end]]) -> int\n\
				3532	\n\
				3533	Like S.find() but raise ValueError when the substring is not found.";
				3534
				3535	static PyObject *
				3536	unicode_index(PyUnicodeObject self, PyObject args)
				3537	{
				3538	int result;
				3539	PyUnicodeObject *substring;
				3540	int start = 0;
				3541	int end = INT_MAX;
				3542
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3543	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3544	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3545	return NULL;
				3546
				3547	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3548	(PyObject *)substring);
				3549	if (substring == NULL)
				3550	return NULL;
				3551
				3552	result = findstring(self, substring, start, end, 1);
				3553
				3554	Py_DECREF(substring);
				3555	if (result < 0) {
				3556	PyErr_SetString(PyExc_ValueError, "substring not found");
				3557	return NULL;
				3558	}
				3559	return PyInt_FromLong(result);
				3560	}
				3561
				3562	static char islower__doc__[] =
				3563	"S.islower() -> int\n\
				3564	\n\
				3565	Return 1 if all cased characters in S are lowercase and there is\n\
				3566	at least one cased character in S, 0 otherwise.";
				3567
				3568	static PyObject*
				3569	unicode_islower(PyUnicodeObject self, PyObject args)
				3570	{
				3571	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3572	register const Py_UNICODE *e;
				3573	int cased;
				3574
				3575	if (!PyArg_NoArgs(args))
				3576	return NULL;
				3577
				3578	/* Shortcut for single character strings */
				3579	if (PyUnicode_GET_SIZE(self) == 1)
				3580	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3581
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3582	/* Special case for empty strings */
				3583	if (PyString_GET_SIZE(self) == 0)
				3584	return PyInt_FromLong(0);
				3585
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3586	e = p + PyUnicode_GET_SIZE(self);
				3587	cased = 0;
				3588	for (; p < e; p++) {
				3589	register const Py_UNICODE ch = *p;
				3590
				3591	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3592	return PyInt_FromLong(0);
				3593	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3594	cased = 1;
				3595	}
				3596	return PyInt_FromLong(cased);
				3597	}
				3598
				3599	static char isupper__doc__[] =
				3600	"S.isupper() -> int\n\
				3601	\n\
				3602	Return 1 if all cased characters in S are uppercase and there is\n\
				3603	at least one cased character in S, 0 otherwise.";
				3604
				3605	static PyObject*
				3606	unicode_isupper(PyUnicodeObject self, PyObject args)
				3607	{
				3608	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3609	register const Py_UNICODE *e;
				3610	int cased;
				3611
				3612	if (!PyArg_NoArgs(args))
				3613	return NULL;
				3614
				3615	/* Shortcut for single character strings */
				3616	if (PyUnicode_GET_SIZE(self) == 1)
				3617	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3618
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3619	/* Special case for empty strings */
				3620	if (PyString_GET_SIZE(self) == 0)
				3621	return PyInt_FromLong(0);
				3622
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3623	e = p + PyUnicode_GET_SIZE(self);
				3624	cased = 0;
				3625	for (; p < e; p++) {
				3626	register const Py_UNICODE ch = *p;
				3627
				3628	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3629	return PyInt_FromLong(0);
				3630	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3631	cased = 1;
				3632	}
				3633	return PyInt_FromLong(cased);
				3634	}
				3635
				3636	static char istitle__doc__[] =
				3637	"S.istitle() -> int\n\
				3638	\n\
				3639	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3640	may only follow uncased characters and lowercase characters only cased\n\
				3641	ones. Return 0 otherwise.";
				3642
				3643	static PyObject*
				3644	unicode_istitle(PyUnicodeObject self, PyObject args)
				3645	{
				3646	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3647	register const Py_UNICODE *e;
				3648	int cased, previous_is_cased;
				3649
				3650	if (!PyArg_NoArgs(args))
				3651	return NULL;
				3652
				3653	/* Shortcut for single character strings */
				3654	if (PyUnicode_GET_SIZE(self) == 1)
				3655	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3656	(Py_UNICODE_ISUPPER(*p) != 0));
				3657
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3658	/* Special case for empty strings */
				3659	if (PyString_GET_SIZE(self) == 0)
				3660	return PyInt_FromLong(0);
				3661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3662	e = p + PyUnicode_GET_SIZE(self);
				3663	cased = 0;
				3664	previous_is_cased = 0;
				3665	for (; p < e; p++) {
				3666	register const Py_UNICODE ch = *p;
				3667
				3668	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3669	if (previous_is_cased)
				3670	return PyInt_FromLong(0);
				3671	previous_is_cased = 1;
				3672	cased = 1;
				3673	}
				3674	else if (Py_UNICODE_ISLOWER(ch)) {
				3675	if (!previous_is_cased)
				3676	return PyInt_FromLong(0);
				3677	previous_is_cased = 1;
				3678	cased = 1;
				3679	}
				3680	else
				3681	previous_is_cased = 0;
				3682	}
				3683	return PyInt_FromLong(cased);
				3684	}
				3685
				3686	static char isspace__doc__[] =
				3687	"S.isspace() -> int\n\
				3688	\n\
				3689	Return 1 if there are only whitespace characters in S,\n\
				3690	0 otherwise.";
				3691
				3692	static PyObject*
				3693	unicode_isspace(PyUnicodeObject self, PyObject args)
				3694	{
				3695	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3696	register const Py_UNICODE *e;
				3697
				3698	if (!PyArg_NoArgs(args))
				3699	return NULL;
				3700
				3701	/* Shortcut for single character strings */
				3702	if (PyUnicode_GET_SIZE(self) == 1 &&
				3703	Py_UNICODE_ISSPACE(*p))
				3704	return PyInt_FromLong(1);
				3705
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3706	/* Special case for empty strings */
				3707	if (PyString_GET_SIZE(self) == 0)
				3708	return PyInt_FromLong(0);
				3709
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3710	e = p + PyUnicode_GET_SIZE(self);
				3711	for (; p < e; p++) {
				3712	if (!Py_UNICODE_ISSPACE(*p))
				3713	return PyInt_FromLong(0);
				3714	}
				3715	return PyInt_FromLong(1);
				3716	}
				3717
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3718	static char isalpha__doc__[] =
				3719	"S.isalpha() -> int\n\
				3720	\n\
				3721	Return 1 if all characters in S are alphabetic\n\
				3722	and there is at least one character in S, 0 otherwise.";
				3723
				3724	static PyObject*
				3725	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3726	{
				3727	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3728	register const Py_UNICODE *e;
				3729
				3730	if (!PyArg_NoArgs(args))
				3731	return NULL;
				3732
				3733	/* Shortcut for single character strings */
				3734	if (PyUnicode_GET_SIZE(self) == 1 &&
				3735	Py_UNICODE_ISALPHA(*p))
				3736	return PyInt_FromLong(1);
				3737
				3738	/* Special case for empty strings */
				3739	if (PyString_GET_SIZE(self) == 0)
				3740	return PyInt_FromLong(0);
				3741
				3742	e = p + PyUnicode_GET_SIZE(self);
				3743	for (; p < e; p++) {
				3744	if (!Py_UNICODE_ISALPHA(*p))
				3745	return PyInt_FromLong(0);
				3746	}
				3747	return PyInt_FromLong(1);
				3748	}
				3749
				3750	static char isalnum__doc__[] =
				3751	"S.isalnum() -> int\n\
				3752	\n\
				3753	Return 1 if all characters in S are alphanumeric\n\
				3754	and there is at least one character in S, 0 otherwise.";
				3755
				3756	static PyObject*
				3757	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3758	{
				3759	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3760	register const Py_UNICODE *e;
				3761
				3762	if (!PyArg_NoArgs(args))
				3763	return NULL;
				3764
				3765	/* Shortcut for single character strings */
				3766	if (PyUnicode_GET_SIZE(self) == 1 &&
				3767	Py_UNICODE_ISALNUM(*p))
				3768	return PyInt_FromLong(1);
				3769
				3770	/* Special case for empty strings */
				3771	if (PyString_GET_SIZE(self) == 0)
				3772	return PyInt_FromLong(0);
				3773
				3774	e = p + PyUnicode_GET_SIZE(self);
				3775	for (; p < e; p++) {
				3776	if (!Py_UNICODE_ISALNUM(*p))
				3777	return PyInt_FromLong(0);
				3778	}
				3779	return PyInt_FromLong(1);
				3780	}
				3781
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3782	static char isdecimal__doc__[] =
				3783	"S.isdecimal() -> int\n\
				3784	\n\
				3785	Return 1 if there are only decimal characters in S,\n\
				3786	0 otherwise.";
				3787
				3788	static PyObject*
				3789	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3790	{
				3791	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3792	register const Py_UNICODE *e;
				3793
				3794	if (!PyArg_NoArgs(args))
				3795	return NULL;
				3796
				3797	/* Shortcut for single character strings */
				3798	if (PyUnicode_GET_SIZE(self) == 1 &&
				3799	Py_UNICODE_ISDECIMAL(*p))
				3800	return PyInt_FromLong(1);
				3801
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3802	/* Special case for empty strings */
				3803	if (PyString_GET_SIZE(self) == 0)
				3804	return PyInt_FromLong(0);
				3805
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3806	e = p + PyUnicode_GET_SIZE(self);
				3807	for (; p < e; p++) {
				3808	if (!Py_UNICODE_ISDECIMAL(*p))
				3809	return PyInt_FromLong(0);
				3810	}
				3811	return PyInt_FromLong(1);
				3812	}
				3813
				3814	static char isdigit__doc__[] =
				3815	"S.isdigit() -> int\n\
				3816	\n\
				3817	Return 1 if there are only digit characters in S,\n\
				3818	0 otherwise.";
				3819
				3820	static PyObject*
				3821	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3822	{
				3823	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3824	register const Py_UNICODE *e;
				3825
				3826	if (!PyArg_NoArgs(args))
				3827	return NULL;
				3828
				3829	/* Shortcut for single character strings */
				3830	if (PyUnicode_GET_SIZE(self) == 1 &&
				3831	Py_UNICODE_ISDIGIT(*p))
				3832	return PyInt_FromLong(1);
				3833
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3834	/* Special case for empty strings */
				3835	if (PyString_GET_SIZE(self) == 0)
				3836	return PyInt_FromLong(0);
				3837
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3838	e = p + PyUnicode_GET_SIZE(self);
				3839	for (; p < e; p++) {
				3840	if (!Py_UNICODE_ISDIGIT(*p))
				3841	return PyInt_FromLong(0);
				3842	}
				3843	return PyInt_FromLong(1);
				3844	}
				3845
				3846	static char isnumeric__doc__[] =
				3847	"S.isnumeric() -> int\n\
				3848	\n\
				3849	Return 1 if there are only numeric characters in S,\n\
				3850	0 otherwise.";
				3851
				3852	static PyObject*
				3853	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3854	{
				3855	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3856	register const Py_UNICODE *e;
				3857
				3858	if (!PyArg_NoArgs(args))
				3859	return NULL;
				3860
				3861	/* Shortcut for single character strings */
				3862	if (PyUnicode_GET_SIZE(self) == 1 &&
				3863	Py_UNICODE_ISNUMERIC(*p))
				3864	return PyInt_FromLong(1);
				3865
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3866	/* Special case for empty strings */
				3867	if (PyString_GET_SIZE(self) == 0)
				3868	return PyInt_FromLong(0);
				3869
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3870	e = p + PyUnicode_GET_SIZE(self);
				3871	for (; p < e; p++) {
				3872	if (!Py_UNICODE_ISNUMERIC(*p))
				3873	return PyInt_FromLong(0);
				3874	}
				3875	return PyInt_FromLong(1);
				3876	}
				3877
				3878	static char join__doc__[] =
				3879	"S.join(sequence) -> unicode\n\
				3880	\n\
				3881	Return a string which is the concatenation of the strings in the\n\
				3882	sequence. The separator between elements is S.";
				3883
				3884	static PyObject*
				3885	unicode_join(PyUnicodeObject self, PyObject args)
				3886	{
				3887	PyObject *data;
				3888	if (!PyArg_ParseTuple(args, "O:join", &data))
				3889	return NULL;
				3890
				3891	return PyUnicode_Join((PyObject *)self, data);
				3892	}
				3893
				3894	static int
				3895	unicode_length(PyUnicodeObject *self)
				3896	{
				3897	return self->length;
				3898	}
				3899
				3900	static char ljust__doc__[] =
				3901	"S.ljust(width) -> unicode\n\
				3902	\n\
				3903	Return S left justified in a Unicode string of length width. Padding is\n\
				3904	done using spaces.";
				3905
				3906	static PyObject *
				3907	unicode_ljust(PyUnicodeObject self, PyObject args)
				3908	{
				3909	int width;
				3910	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3911	return NULL;
				3912
				3913	if (self->length >= width) {
				3914	Py_INCREF(self);
				3915	return (PyObject*) self;
				3916	}
				3917
				3918	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3919	}
				3920
				3921	static char lower__doc__[] =
				3922	"S.lower() -> unicode\n\
				3923	\n\
				3924	Return a copy of the string S converted to lowercase.";
				3925
				3926	static PyObject*
				3927	unicode_lower(PyUnicodeObject self, PyObject args)
				3928	{
				3929	if (!PyArg_NoArgs(args))
				3930	return NULL;
				3931	return fixup(self, fixlower);
				3932	}
				3933
				3934	static char lstrip__doc__[] =
				3935	"S.lstrip() -> unicode\n\
				3936	\n\
				3937	Return a copy of the string S with leading whitespace removed.";
				3938
				3939	static PyObject *
				3940	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3941	{
				3942	if (!PyArg_NoArgs(args))
				3943	return NULL;
				3944	return strip(self, 1, 0);
				3945	}
				3946
				3947	static PyObject*
				3948	unicode_repeat(PyUnicodeObject *str, int len)
				3949	{
				3950	PyUnicodeObject *u;
				3951	Py_UNICODE *p;
				3952
				3953	if (len < 0)
				3954	len = 0;
				3955
				3956	if (len == 1) {
				3957	/* no repeat, return original string */
				3958	Py_INCREF(str);
				3959	return (PyObject*) str;
				3960	}
				3961
				3962	u = _PyUnicode_New(len * str->length);
				3963	if (!u)
				3964	return NULL;
				3965
				3966	p = u->str;
				3967
				3968	while (len-- > 0) {
				3969	Py_UNICODE_COPY(p, str->str, str->length);
				3970	p += str->length;
				3971	}
				3972
				3973	return (PyObject*) u;
				3974	}
				3975
				3976	PyObject PyUnicode_Replace(PyObject obj,
				3977	PyObject *subobj,
				3978	PyObject *replobj,
				3979	int maxcount)
				3980	{
				3981	PyObject *self;
				3982	PyObject *str1;
				3983	PyObject *str2;
				3984	PyObject *result;
				3985
				3986	self = PyUnicode_FromObject(obj);
				3987	if (self == NULL)
				3988	return NULL;
				3989	str1 = PyUnicode_FromObject(subobj);
				3990	if (str1 == NULL) {
				3991	Py_DECREF(self);
				3992	return NULL;
				3993	}
				3994	str2 = PyUnicode_FromObject(replobj);
				3995	if (str2 == NULL) {
				3996	Py_DECREF(self);
				3997	Py_DECREF(str1);
				3998	return NULL;
				3999	}
				4000	result = replace((PyUnicodeObject *)self,
				4001	(PyUnicodeObject *)str1,
				4002	(PyUnicodeObject *)str2,
				4003	maxcount);
				4004	Py_DECREF(self);
				4005	Py_DECREF(str1);
				4006	Py_DECREF(str2);
				4007	return result;
				4008	}
				4009
				4010	static char replace__doc__[] =
				4011	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4012	\n\
				4013	Return a copy of S with all occurrences of substring\n\
				4014	old replaced by new. If the optional argument maxsplit is\n\
				4015	given, only the first maxsplit occurrences are replaced.";
				4016
				4017	static PyObject*
				4018	unicode_replace(PyUnicodeObject self, PyObject args)
				4019	{
				4020	PyUnicodeObject *str1;
				4021	PyUnicodeObject *str2;
				4022	int maxcount = -1;
				4023	PyObject *result;
				4024
				4025	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4026	return NULL;
				4027	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4028	if (str1 == NULL)
				4029	return NULL;
				4030	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4031	if (str2 == NULL)
				4032	return NULL;
				4033
				4034	result = replace(self, str1, str2, maxcount);
				4035
				4036	Py_DECREF(str1);
				4037	Py_DECREF(str2);
				4038	return result;
				4039	}
				4040
				4041	static
				4042	PyObject unicode_repr(PyObject unicode)
				4043	{
				4044	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4045	PyUnicode_GET_SIZE(unicode),
				4046	1);
				4047	}
				4048
				4049	static char rfind__doc__[] =
				4050	"S.rfind(sub [,start [,end]]) -> int\n\
				4051	\n\
				4052	Return the highest index in S where substring sub is found,\n\
				4053	such that sub is contained within s[start,end]. Optional\n\
				4054	arguments start and end are interpreted as in slice notation.\n\
				4055	\n\
				4056	Return -1 on failure.";
				4057
				4058	static PyObject *
				4059	unicode_rfind(PyUnicodeObject self, PyObject args)
				4060	{
				4061	PyUnicodeObject *substring;
				4062	int start = 0;
				4063	int end = INT_MAX;
				4064	PyObject *result;
				4065
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4066	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4067	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4068	return NULL;
				4069	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4070	(PyObject *)substring);
				4071	if (substring == NULL)
				4072	return NULL;
				4073
				4074	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4075
				4076	Py_DECREF(substring);
				4077	return result;
				4078	}
				4079
				4080	static char rindex__doc__[] =
				4081	"S.rindex(sub [,start [,end]]) -> int\n\
				4082	\n\
				4083	Like S.rfind() but raise ValueError when the substring is not found.";
				4084
				4085	static PyObject *
				4086	unicode_rindex(PyUnicodeObject self, PyObject args)
				4087	{
				4088	int result;
				4089	PyUnicodeObject *substring;
				4090	int start = 0;
				4091	int end = INT_MAX;
				4092
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4093	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4094	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4095	return NULL;
				4096	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4097	(PyObject *)substring);
				4098	if (substring == NULL)
				4099	return NULL;
				4100
				4101	result = findstring(self, substring, start, end, -1);
				4102
				4103	Py_DECREF(substring);
				4104	if (result < 0) {
				4105	PyErr_SetString(PyExc_ValueError, "substring not found");
				4106	return NULL;
				4107	}
				4108	return PyInt_FromLong(result);
				4109	}
				4110
				4111	static char rjust__doc__[] =
				4112	"S.rjust(width) -> unicode\n\
				4113	\n\
				4114	Return S right justified in a Unicode string of length width. Padding is\n\
				4115	done using spaces.";
				4116
				4117	static PyObject *
				4118	unicode_rjust(PyUnicodeObject self, PyObject args)
				4119	{
				4120	int width;
				4121	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4122	return NULL;
				4123
				4124	if (self->length >= width) {
				4125	Py_INCREF(self);
				4126	return (PyObject*) self;
				4127	}
				4128
				4129	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4130	}
				4131
				4132	static char rstrip__doc__[] =
				4133	"S.rstrip() -> unicode\n\
				4134	\n\
				4135	Return a copy of the string S with trailing whitespace removed.";
				4136
				4137	static PyObject *
				4138	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4139	{
				4140	if (!PyArg_NoArgs(args))
				4141	return NULL;
				4142	return strip(self, 0, 1);
				4143	}
				4144
				4145	static PyObject*
				4146	unicode_slice(PyUnicodeObject *self, int start, int end)
				4147	{
				4148	/* standard clamping */
				4149	if (start < 0)
				4150	start = 0;
				4151	if (end < 0)
				4152	end = 0;
				4153	if (end > self->length)
				4154	end = self->length;
				4155	if (start == 0 && end == self->length) {
				4156	/* full slice, return original string */
				4157	Py_INCREF(self);
				4158	return (PyObject*) self;
				4159	}
				4160	if (start > end)
				4161	start = end;
				4162	/* copy slice */
				4163	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4164	end - start);
				4165	}
				4166
				4167	PyObject PyUnicode_Split(PyObject s,
				4168	PyObject *sep,
				4169	int maxsplit)
				4170	{
				4171	PyObject *result;
				4172
				4173	s = PyUnicode_FromObject(s);
				4174	if (s == NULL)
				4175	return NULL;
				4176	if (sep != NULL) {
				4177	sep = PyUnicode_FromObject(sep);
				4178	if (sep == NULL) {
				4179	Py_DECREF(s);
				4180	return NULL;
				4181	}
				4182	}
				4183
				4184	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4185
				4186	Py_DECREF(s);
				4187	Py_XDECREF(sep);
				4188	return result;
				4189	}
				4190
				4191	static char split__doc__[] =
				4192	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4193	\n\
				4194	Return a list of the words in S, using sep as the\n\
				4195	delimiter string. If maxsplit is given, at most maxsplit\n\
				4196	splits are done. If sep is not specified, any whitespace string\n\
				4197	is a separator.";
				4198
				4199	static PyObject*
				4200	unicode_split(PyUnicodeObject self, PyObject args)
				4201	{
				4202	PyObject *substring = Py_None;
				4203	int maxcount = -1;
				4204
				4205	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4206	return NULL;
				4207
				4208	if (substring == Py_None)
				4209	return split(self, NULL, maxcount);
				4210	else if (PyUnicode_Check(substring))
				4211	return split(self, (PyUnicodeObject *)substring, maxcount);
				4212	else
				4213	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4214	}
				4215
				4216	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4217	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4218	\n\
				4219	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4220	Line breaks are not included in the resulting list unless keepends\n\
				4221	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4222
				4223	static PyObject*
				4224	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4225	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4226	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4227
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4228	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4229	return NULL;
				4230
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4231	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4232	}
				4233
				4234	static
				4235	PyObject unicode_str(PyUnicodeObject self)
				4236	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4237	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4238	}
				4239
				4240	static char strip__doc__[] =
				4241	"S.strip() -> unicode\n\
				4242	\n\
				4243	Return a copy of S with leading and trailing whitespace removed.";
				4244
				4245	static PyObject *
				4246	unicode_strip(PyUnicodeObject self, PyObject args)
				4247	{
				4248	if (!PyArg_NoArgs(args))
				4249	return NULL;
				4250	return strip(self, 1, 1);
				4251	}
				4252
				4253	static char swapcase__doc__[] =
				4254	"S.swapcase() -> unicode\n\
				4255	\n\
				4256	Return a copy of S with uppercase characters converted to lowercase\n\
				4257	and vice versa.";
				4258
				4259	static PyObject*
				4260	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4261	{
				4262	if (!PyArg_NoArgs(args))
				4263	return NULL;
				4264	return fixup(self, fixswapcase);
				4265	}
				4266
				4267	static char translate__doc__[] =
				4268	"S.translate(table) -> unicode\n\
				4269	\n\
				4270	Return a copy of the string S, where all characters have been mapped\n\
				4271	through the given translation table, which must be a mapping of\n\
				4272	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4273	are left untouched. Characters mapped to None are deleted.";
				4274
				4275	static PyObject*
				4276	unicode_translate(PyUnicodeObject self, PyObject args)
				4277	{
				4278	PyObject *table;
				4279
				4280	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4281	return NULL;
				4282	return PyUnicode_TranslateCharmap(self->str,
				4283	self->length,
				4284	table,
				4285	"ignore");
				4286	}
				4287
				4288	static char upper__doc__[] =
				4289	"S.upper() -> unicode\n\
				4290	\n\
				4291	Return a copy of S converted to uppercase.";
				4292
				4293	static PyObject*
				4294	unicode_upper(PyUnicodeObject self, PyObject args)
				4295	{
				4296	if (!PyArg_NoArgs(args))
				4297	return NULL;
				4298	return fixup(self, fixupper);
				4299	}
				4300
				4301	#if 0
				4302	static char zfill__doc__[] =
				4303	"S.zfill(width) -> unicode\n\
				4304	\n\
				4305	Pad a numeric string x with zeros on the left, to fill a field\n\
				4306	of the specified width. The string x is never truncated.";
				4307
				4308	static PyObject *
				4309	unicode_zfill(PyUnicodeObject self, PyObject args)
				4310	{
				4311	int fill;
				4312	PyUnicodeObject *u;
				4313
				4314	int width;
				4315	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4316	return NULL;
				4317
				4318	if (self->length >= width) {
				4319	Py_INCREF(self);
				4320	return (PyObject*) self;
				4321	}
				4322
				4323	fill = width - self->length;
				4324
				4325	u = pad(self, fill, 0, '0');
				4326
				4327	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4328	/* move sign to beginning of string */
				4329	u->str[0] = u->str[fill];
				4330	u->str[fill] = '0';
				4331	}
				4332
				4333	return (PyObject*) u;
				4334	}
				4335	#endif
				4336
				4337	#if 0
				4338	static PyObject*
				4339	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4340	{
				4341	if (!PyArg_NoArgs(args))
				4342	return NULL;
				4343	return PyInt_FromLong(unicode_freelist_size);
				4344	}
				4345	#endif
				4346
				4347	static char startswith__doc__[] =
				4348	"S.startswith(prefix[, start[, end]]) -> int\n\
				4349	\n\
				4350	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4351	optional start, test S beginning at that position. With optional end, stop\n\
				4352	comparing S at that position.";
				4353
				4354	static PyObject *
				4355	unicode_startswith(PyUnicodeObject *self,
				4356	PyObject *args)
				4357	{
				4358	PyUnicodeObject *substring;
				4359	int start = 0;
				4360	int end = INT_MAX;
				4361	PyObject *result;
				4362
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4363	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4364	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4365	return NULL;
				4366	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4367	(PyObject *)substring);
				4368	if (substring == NULL)
				4369	return NULL;
				4370
				4371	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4372
				4373	Py_DECREF(substring);
				4374	return result;
				4375	}
				4376
				4377
				4378	static char endswith__doc__[] =
				4379	"S.endswith(suffix[, start[, end]]) -> int\n\
				4380	\n\
				4381	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4382	optional start, test S beginning at that position. With optional end, stop\n\
				4383	comparing S at that position.";
				4384
				4385	static PyObject *
				4386	unicode_endswith(PyUnicodeObject *self,
				4387	PyObject *args)
				4388	{
				4389	PyUnicodeObject *substring;
				4390	int start = 0;
				4391	int end = INT_MAX;
				4392	PyObject *result;
				4393
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4394	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4395	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4396	return NULL;
				4397	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4398	(PyObject *)substring);
				4399	if (substring == NULL)
				4400	return NULL;
				4401
				4402	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4403
				4404	Py_DECREF(substring);
				4405	return result;
				4406	}
				4407
				4408
				4409	static PyMethodDef unicode_methods[] = {
				4410
				4411	/* Order is according to common usage: often used methods should
				4412	appear first, since lookup is done sequentially. */
				4413
				4414	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4415	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4416	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4417	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4418	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4419	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4420	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4421	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4422	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4423	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4424	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4425	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4426	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4427	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4428	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4429	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4430	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4431	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4432	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4433	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4434	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4435	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4436	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4437	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4438	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4439	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4440	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4441	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4442	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4443	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4444	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4445	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4446	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4447	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4448	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4449	#if 0
				4450	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4451	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4452	#endif
				4453
				4454	#if 0
				4455	/* This one is just used for debugging the implementation. */
				4456	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4457	#endif
				4458
				4459	{NULL, NULL}
				4460	};
				4461
				4462	static PyObject *
				4463	unicode_getattr(PyUnicodeObject self, char name)
				4464	{
				4465	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4466	}
				4467
				4468	static PySequenceMethods unicode_as_sequence = {
				4469	(inquiry) unicode_length, /* sq_length */
				4470	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4471	(intargfunc) unicode_repeat, /* sq_repeat */
				4472	(intargfunc) unicode_getitem, /* sq_item */
				4473	(intintargfunc) unicode_slice, /* sq_slice */
				4474	0, /* sq_ass_item */
				4475	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4476	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4477	};
				4478
				4479	static int
				4480	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4481	int index,
				4482	const void **ptr)
				4483	{
				4484	if (index != 0) {
				4485	PyErr_SetString(PyExc_SystemError,
				4486	"accessing non-existent unicode segment");
				4487	return -1;
				4488	}
				4489	ptr = (void ) self->str;
				4490	return PyUnicode_GET_DATA_SIZE(self);
				4491	}
				4492
				4493	static int
				4494	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4495	const void **ptr)
				4496	{
				4497	PyErr_SetString(PyExc_TypeError,
				4498	"cannot use unicode as modifyable buffer");
				4499	return -1;
				4500	}
				4501
				4502	static int
				4503	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4504	int *lenp)
				4505	{
				4506	if (lenp)
				4507	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4508	return 1;
				4509	}
				4510
				4511	static int
				4512	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4513	int index,
				4514	const void **ptr)
				4515	{
				4516	PyObject *str;
				4517
				4518	if (index != 0) {
				4519	PyErr_SetString(PyExc_SystemError,
				4520	"accessing non-existent unicode segment");
				4521	return -1;
				4522	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4523	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4524	if (str == NULL)
				4525	return -1;
				4526	ptr = (void ) PyString_AS_STRING(str);
				4527	return PyString_GET_SIZE(str);
				4528	}
				4529
				4530	/* Helpers for PyUnicode_Format() */
				4531
				4532	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame^]	4533	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4534	{
				4535	int argidx = *p_argidx;
				4536	if (argidx < arglen) {
				4537	(*p_argidx)++;
				4538	if (arglen < 0)
				4539	return args;
				4540	else
				4541	return PyTuple_GetItem(args, argidx);
				4542	}
				4543	PyErr_SetString(PyExc_TypeError,
				4544	"not enough arguments for format string");
				4545	return NULL;
				4546	}
				4547
				4548	#define F_LJUST (1<<0)
				4549	#define F_SIGN (1<<1)
				4550	#define F_BLANK (1<<2)
				4551	#define F_ALT (1<<3)
				4552	#define F_ZERO (1<<4)
				4553
				4554	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4555	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4556	{
				4557	register int i;
				4558	int len;
				4559	va_list va;
				4560	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4561	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4562
				4563	/* First, format the string as char array, then expand to Py_UNICODE
				4564	array. */
				4565	charbuffer = (char *)buffer;
				4566	len = vsprintf(charbuffer, format, va);
				4567	for (i = len - 1; i >= 0; i--)
				4568	buffer[i] = (Py_UNICODE) charbuffer[i];
				4569
				4570	va_end(va);
				4571	return len;
				4572	}
				4573
				4574	static int
				4575	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4576	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4577	int flags,
				4578	int prec,
				4579	int type,
				4580	PyObject *v)
				4581	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4582	/* fmt = '%#.' + `prec` + `type`
				4583	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4584	char fmt[20];
				4585	double x;
				4586
				4587	x = PyFloat_AsDouble(v);
				4588	if (x == -1.0 && PyErr_Occurred())
				4589	return -1;
				4590	if (prec < 0)
				4591	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4592	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4593	type = 'g';
				4594	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4595	/* worst case length calc to ensure no buffer overrun:
				4596	fmt = %#.<prec>g
				4597	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4598	for any double rep.)
				4599	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4600	If prec=0 the effective precision is 1 (the leading digit is
				4601	always given), therefore increase by one to 10+prec. */
				4602	if (buflen <= (size_t)10 + (size_t)prec) {
				4603	PyErr_SetString(PyExc_OverflowError,
				4604	"formatted float is too long (precision too long?)");
				4605	return -1;
				4606	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4607	return usprintf(buf, fmt, x);
				4608	}
				4609
				4610	static int
				4611	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4612	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4613	int flags,
				4614	int prec,
				4615	int type,
				4616	PyObject *v)
				4617	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4618	/* fmt = '%#.' + `prec` + 'l' + `type`
				4619	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4620	char fmt[20];
				4621	long x;
				4622
				4623	x = PyInt_AsLong(v);
				4624	if (x == -1 && PyErr_Occurred())
				4625	return -1;
				4626	if (prec < 0)
				4627	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4628	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4629	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4630	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4631	PyErr_SetString(PyExc_OverflowError,
				4632	"formatted integer is too long (precision too long?)");
				4633	return -1;
				4634	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4635	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4636	return usprintf(buf, fmt, x);
				4637	}
				4638
				4639	static int
				4640	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4641	size_t buflen,
				4642	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4643	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4644	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4645	if (PyUnicode_Check(v)) {
				4646	if (PyUnicode_GET_SIZE(v) != 1)
				4647	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4648	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4649	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4650
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4651	else if (PyString_Check(v)) {
				4652	if (PyString_GET_SIZE(v) != 1)
				4653	goto onError;
				4654	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4655	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4656
				4657	else {
				4658	/* Integer input truncated to a character */
				4659	long x;
				4660	x = PyInt_AsLong(v);
				4661	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4662	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4663	buf[0] = (char) x;
				4664	}
				4665	buf[1] = '\0';
				4666	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4667
				4668	onError:
				4669	PyErr_SetString(PyExc_TypeError,
				4670	"%c requires int or char");
				4671	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4672	}
				4673
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (e.g. after sizeof or next to a postfix
   operator).
*/
#define FORMATBUFLEN ((size_t)120)
				4683
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4684	PyObject PyUnicode_Format(PyObject format,
				4685	PyObject *args)
				4686	{
				4687	Py_UNICODE fmt, res;
				4688	int fmtcnt, rescnt, reslen, arglen, argidx;
				4689	int args_owned = 0;
				4690	PyUnicodeObject *result = NULL;
				4691	PyObject *dict = NULL;
				4692	PyObject *uformat;
				4693
				4694	if (format == NULL \|\| args == NULL) {
				4695	PyErr_BadInternalCall();
				4696	return NULL;
				4697	}
				4698	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4699	if (uformat == NULL)
				4700	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4701	fmt = PyUnicode_AS_UNICODE(uformat);
				4702	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4703
				4704	reslen = rescnt = fmtcnt + 100;
				4705	result = _PyUnicode_New(reslen);
				4706	if (result == NULL)
				4707	goto onError;
				4708	res = PyUnicode_AS_UNICODE(result);
				4709
				4710	if (PyTuple_Check(args)) {
				4711	arglen = PyTuple_Size(args);
				4712	argidx = 0;
				4713	}
				4714	else {
				4715	arglen = -1;
				4716	argidx = -2;
				4717	}
				4718	if (args->ob_type->tp_as_mapping)
				4719	dict = args;
				4720
				4721	while (--fmtcnt >= 0) {
				4722	if (*fmt != '%') {
				4723	if (--rescnt < 0) {
				4724	rescnt = fmtcnt + 100;
				4725	reslen += rescnt;
				4726	if (_PyUnicode_Resize(result, reslen) < 0)
				4727	return NULL;
				4728	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4729	--rescnt;
				4730	}
				4731	res++ = fmt++;
				4732	}
				4733	else {
				4734	/* Got a format specifier */
				4735	int flags = 0;
				4736	int width = -1;
				4737	int prec = -1;
				4738	int size = 0;
				4739	Py_UNICODE c = '\0';
				4740	Py_UNICODE fill;
				4741	PyObject *v = NULL;
				4742	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4743	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4744	Py_UNICODE sign;
				4745	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4746	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4747
				4748	fmt++;
				4749	if (*fmt == '(') {
				4750	Py_UNICODE *keystart;
				4751	int keylen;
				4752	PyObject *key;
				4753	int pcount = 1;
				4754
				4755	if (dict == NULL) {
				4756	PyErr_SetString(PyExc_TypeError,
				4757	"format requires a mapping");
				4758	goto onError;
				4759	}
				4760	++fmt;
				4761	--fmtcnt;
				4762	keystart = fmt;
				4763	/* Skip over balanced parentheses */
				4764	while (pcount > 0 && --fmtcnt >= 0) {
				4765	if (*fmt == ')')
				4766	--pcount;
				4767	else if (*fmt == '(')
				4768	++pcount;
				4769	fmt++;
				4770	}
				4771	keylen = fmt - keystart - 1;
				4772	if (fmtcnt < 0 \|\| pcount > 0) {
				4773	PyErr_SetString(PyExc_ValueError,
				4774	"incomplete format key");
				4775	goto onError;
				4776	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4777	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4778	then looked up since Python uses strings to hold
				4779	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4780	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4781	key = PyUnicode_EncodeUTF8(keystart,
				4782	keylen,
				4783	NULL);
				4784	if (key == NULL)
				4785	goto onError;
				4786	if (args_owned) {
				4787	Py_DECREF(args);
				4788	args_owned = 0;
				4789	}
				4790	args = PyObject_GetItem(dict, key);
				4791	Py_DECREF(key);
				4792	if (args == NULL) {
				4793	goto onError;
				4794	}
				4795	args_owned = 1;
				4796	arglen = -1;
				4797	argidx = -2;
				4798	}
				4799	while (--fmtcnt >= 0) {
				4800	switch (c = *fmt++) {
				4801	case '-': flags \|= F_LJUST; continue;
				4802	case '+': flags \|= F_SIGN; continue;
				4803	case ' ': flags \|= F_BLANK; continue;
				4804	case '#': flags \|= F_ALT; continue;
				4805	case '0': flags \|= F_ZERO; continue;
				4806	}
				4807	break;
				4808	}
				4809	if (c == '*') {
				4810	v = getnextarg(args, arglen, &argidx);
				4811	if (v == NULL)
				4812	goto onError;
				4813	if (!PyInt_Check(v)) {
				4814	PyErr_SetString(PyExc_TypeError,
				4815	"* wants int");
				4816	goto onError;
				4817	}
				4818	width = PyInt_AsLong(v);
				4819	if (width < 0) {
				4820	flags \|= F_LJUST;
				4821	width = -width;
				4822	}
				4823	if (--fmtcnt >= 0)
				4824	c = *fmt++;
				4825	}
				4826	else if (c >= '0' && c <= '9') {
				4827	width = c - '0';
				4828	while (--fmtcnt >= 0) {
				4829	c = *fmt++;
				4830	if (c < '0' \|\| c > '9')
				4831	break;
				4832	if ((width*10) / 10 != width) {
				4833	PyErr_SetString(PyExc_ValueError,
				4834	"width too big");
				4835	goto onError;
				4836	}
				4837	width = width*10 + (c - '0');
				4838	}
				4839	}
				4840	if (c == '.') {
				4841	prec = 0;
				4842	if (--fmtcnt >= 0)
				4843	c = *fmt++;
				4844	if (c == '*') {
				4845	v = getnextarg(args, arglen, &argidx);
				4846	if (v == NULL)
				4847	goto onError;
				4848	if (!PyInt_Check(v)) {
				4849	PyErr_SetString(PyExc_TypeError,
				4850	"* wants int");
				4851	goto onError;
				4852	}
				4853	prec = PyInt_AsLong(v);
				4854	if (prec < 0)
				4855	prec = 0;
				4856	if (--fmtcnt >= 0)
				4857	c = *fmt++;
				4858	}
				4859	else if (c >= '0' && c <= '9') {
				4860	prec = c - '0';
				4861	while (--fmtcnt >= 0) {
				4862	c = Py_CHARMASK(*fmt++);
				4863	if (c < '0' \|\| c > '9')
				4864	break;
				4865	if ((prec*10) / 10 != prec) {
				4866	PyErr_SetString(PyExc_ValueError,
				4867	"prec too big");
				4868	goto onError;
				4869	}
				4870	prec = prec*10 + (c - '0');
				4871	}
				4872	}
				4873	} /* prec */
				4874	if (fmtcnt >= 0) {
				4875	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4876	size = c;
				4877	if (--fmtcnt >= 0)
				4878	c = *fmt++;
				4879	}
				4880	}
				4881	if (fmtcnt < 0) {
				4882	PyErr_SetString(PyExc_ValueError,
				4883	"incomplete format");
				4884	goto onError;
				4885	}
				4886	if (c != '%') {
				4887	v = getnextarg(args, arglen, &argidx);
				4888	if (v == NULL)
				4889	goto onError;
				4890	}
				4891	sign = 0;
				4892	fill = ' ';
				4893	switch (c) {
				4894
				4895	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4896	pbuf = formatbuf;
				4897	/* presume that buffer length is at least 1 */
				4898	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4899	len = 1;
				4900	break;
				4901
				4902	case 's':
				4903	case 'r':
				4904	if (PyUnicode_Check(v) && c == 's') {
				4905	temp = v;
				4906	Py_INCREF(temp);
				4907	}
				4908	else {
				4909	PyObject *unicode;
				4910	if (c == 's')
				4911	temp = PyObject_Str(v);
				4912	else
				4913	temp = PyObject_Repr(v);
				4914	if (temp == NULL)
				4915	goto onError;
				4916	if (!PyString_Check(temp)) {
				4917	/* XXX Note: this should never happen, since
				4918	PyObject_Repr() and PyObject_Str() assure
				4919	this */
				4920	Py_DECREF(temp);
				4921	PyErr_SetString(PyExc_TypeError,
				4922	"%s argument has non-string str()");
				4923	goto onError;
				4924	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4925	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4926	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4927	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4928	"strict");
				4929	Py_DECREF(temp);
				4930	temp = unicode;
				4931	if (temp == NULL)
				4932	goto onError;
				4933	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4934	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4935	len = PyUnicode_GET_SIZE(temp);
				4936	if (prec >= 0 && len > prec)
				4937	len = prec;
				4938	break;
				4939
				4940	case 'i':
				4941	case 'd':
				4942	case 'u':
				4943	case 'o':
				4944	case 'x':
				4945	case 'X':
				4946	if (c == 'i')
				4947	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4948	pbuf = formatbuf;
				4949	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4950	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4951	if (len < 0)
				4952	goto onError;
				4953	sign = (c == 'd');
				4954	if (flags & F_ZERO) {
				4955	fill = '0';
				4956	if ((flags&F_ALT) &&
				4957	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4958	pbuf[0] == '0' && pbuf[1] == c) {
				4959	res++ = pbuf++;
				4960	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4961	rescnt -= 2;
				4962	len -= 2;
				4963	width -= 2;
				4964	if (width < 0)
				4965	width = 0;
				4966	}
				4967	}
				4968	break;
				4969
				4970	case 'e':
				4971	case 'E':
				4972	case 'f':
				4973	case 'g':
				4974	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4975	pbuf = formatbuf;
				4976	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4977	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4978	if (len < 0)
				4979	goto onError;
				4980	sign = 1;
				4981	if (flags&F_ZERO)
				4982	fill = '0';
				4983	break;
				4984
				4985	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4986	pbuf = formatbuf;
				4987	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4988	if (len < 0)
				4989	goto onError;
				4990	break;
				4991
				4992	default:
				4993	PyErr_Format(PyExc_ValueError,
				4994	"unsupported format character '%c' (0x%x)",
				4995	c, c);
				4996	goto onError;
				4997	}
				4998	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4999	if (pbuf == '-' \|\| pbuf == '+') {
				5000	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5001	len--;
				5002	}
				5003	else if (flags & F_SIGN)
				5004	sign = '+';
				5005	else if (flags & F_BLANK)
				5006	sign = ' ';
				5007	else
				5008	sign = 0;
				5009	}
				5010	if (width < len)
				5011	width = len;
				5012	if (rescnt < width + (sign != 0)) {
				5013	reslen -= rescnt;
				5014	rescnt = width + fmtcnt + 100;
				5015	reslen += rescnt;
				5016	if (_PyUnicode_Resize(result, reslen) < 0)
				5017	return NULL;
				5018	res = PyUnicode_AS_UNICODE(result)
				5019	+ reslen - rescnt;
				5020	}
				5021	if (sign) {
				5022	if (fill != ' ')
				5023	*res++ = sign;
				5024	rescnt--;
				5025	if (width > len)
				5026	width--;
				5027	}
				5028	if (width > len && !(flags & F_LJUST)) {
				5029	do {
				5030	--rescnt;
				5031	*res++ = fill;
				5032	} while (--width > len);
				5033	}
				5034	if (sign && fill == ' ')
				5035	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5036	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5037	res += len;
				5038	rescnt -= len;
				5039	while (--width >= len) {
				5040	--rescnt;
				5041	*res++ = ' ';
				5042	}
				5043	if (dict && (argidx < arglen) && c != '%') {
				5044	PyErr_SetString(PyExc_TypeError,
				5045	"not all arguments converted");
				5046	goto onError;
				5047	}
				5048	Py_XDECREF(temp);
				5049	} /* '%' */
				5050	} /* until end */
				5051	if (argidx < arglen && !dict) {
				5052	PyErr_SetString(PyExc_TypeError,
				5053	"not all arguments converted");
				5054	goto onError;
				5055	}
				5056
				5057	if (args_owned) {
				5058	Py_DECREF(args);
				5059	}
				5060	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5061	if (_PyUnicode_Resize(result, reslen - rescnt))
				5062	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5063	return (PyObject *)result;
				5064
				5065	onError:
				5066	Py_XDECREF(result);
				5067	Py_DECREF(uformat);
				5068	if (args_owned) {
				5069	Py_DECREF(args);
				5070	}
				5071	return NULL;
				5072	}
				5073
				5074	static PyBufferProcs unicode_as_buffer = {
				5075	(getreadbufferproc) unicode_buffer_getreadbuf,
				5076	(getwritebufferproc) unicode_buffer_getwritebuf,
				5077	(getsegcountproc) unicode_buffer_getsegcount,
				5078	(getcharbufferproc) unicode_buffer_getcharbuf,
				5079	};
				5080
				5081	PyTypeObject PyUnicode_Type = {
				5082	PyObject_HEAD_INIT(&PyType_Type)
				5083	0, /* ob_size */
				5084	"unicode", /* tp_name */
				5085	sizeof(PyUnicodeObject), /* tp_size */
				5086	0, /* tp_itemsize */
				5087	/* Slots */
				5088	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5089	0, /* tp_print */
				5090	(getattrfunc)unicode_getattr, /* tp_getattr */
				5091	0, /* tp_setattr */
				5092	(cmpfunc) unicode_compare, /* tp_compare */
				5093	(reprfunc) unicode_repr, /* tp_repr */
				5094	0, /* tp_as_number */
				5095	&unicode_as_sequence, /* tp_as_sequence */
				5096	0, /* tp_as_mapping */
				5097	(hashfunc) unicode_hash, /* tp_hash*/
				5098	0, /* tp_call*/
				5099	(reprfunc) unicode_str, /* tp_str */
				5100	(getattrofunc) NULL, /* tp_getattro */
				5101	(setattrofunc) NULL, /* tp_setattro */
				5102	&unicode_as_buffer, /* tp_as_buffer */
				5103	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5104	};
				5105
				5106	/* Initialize the Unicode implementation */
				5107
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame^]	5108	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5109	{
				5110	/* Doublecheck the configuration... */
				5111	if (sizeof(Py_UNICODE) != 2)
				5112	Py_FatalError("Unicode configuration error: "
				5113	"sizeof(Py_UNICODE) != 2 bytes");
				5114
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5115	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5116	unicode_freelist = NULL;
				5117	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5118	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5119	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5120	}
				5121
				5122	/* Finalize the Unicode implementation */
				5123
				5124	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame^]	5125	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5126	{
				5127	PyUnicodeObject *u = unicode_freelist;
				5128
				5129	while (u != NULL) {
				5130	PyUnicodeObject *v = u;
				5131	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5132	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5133	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5134	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5135	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5136	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5137	unicode_freelist = NULL;
				5138	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5139	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5140	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5141	}