Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: fa7c5ea2e61fe201d4b394ec9c29fbfd2935f9f2 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
/* Limit for the Unicode object free list: _PyUnicode_Free() only parks
   an object on the list while fewer than this many are already there. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   _PyUnicode_Free() compares this limit against the object's length
   field.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				110
/* --- Globals ------------------------------------------------------------ */

/* The empty Unicode object.  _PyUnicode_New(0) hands out new references
   to it once it is non-NULL, and _PyUnicode_Resize() refuses to resize
   it. */
static PyUnicodeObject *unicode_empty = NULL;

/* Free list for Unicode objects: head pointer and the number of objects
   currently parked on the list (bounded by MAX_UNICODE_FREELIST_SIZE). */
static PyUnicodeObject *unicode_freelist = NULL;
static int unicode_freelist_size = 0;

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/

static char unicode_default_encoding[100];
				129
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	130	/* --- Unicode Object ----------------------------------------------------- */
				131
				132	static
				133	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				134	int length)
				135	{
				136	void *oldstr;
				137
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	138	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	140	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	141
				142	/* Resizing unicode_empty is not allowed. */
				143	if (unicode == unicode_empty) {
				144	PyErr_SetString(PyExc_SystemError,
				145	"can't resize empty unicode object");
				146	return -1;
				147	}
				148
				149	/* We allocate one more byte to make sure the string is
				150	Ux0000 terminated -- XXX is this needed ? */
				151	oldstr = unicode->str;
				152	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				153	if (!unicode->str) {
				154	unicode->str = oldstr;
				155	PyErr_NoMemory();
				156	return -1;
				157	}
				158	unicode->str[length] = 0;
				159	unicode->length = length;
				160
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	161	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	162	/* Reset the object caches */
				163	if (unicode->utf8str) {
				164	Py_DECREF(unicode->utf8str);
				165	unicode->utf8str = NULL;
				166	}
				167	unicode->hash = -1;
				168
				169	return 0;
				170	}
				171
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	172	int PyUnicode_Resize(PyObject **unicode,
				173	int length)
				174	{
				175	PyUnicodeObject *v;
				176
				177	if (unicode == NULL) {
				178	PyErr_BadInternalCall();
				179	return -1;
				180	}
				181	v = (PyUnicodeObject )unicode;
				182	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	return _PyUnicode_Resize(v, length);
				187	}
				188
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	/* We allocate one more byte to make sure the string is
				190	Ux0000 terminated -- XXX is this needed ?
				191
				192	XXX This allocator could further be enhanced by assuring that the
				193	free list never reduces its size below 1.
				194
				195	*/
				196
				197	static
				198	PyUnicodeObject *_PyUnicode_New(int length)
				199	{
				200	register PyUnicodeObject *unicode;
				201
				202	/* Optimization for empty strings */
				203	if (length == 0 && unicode_empty != NULL) {
				204	Py_INCREF(unicode_empty);
				205	return unicode_empty;
				206	}
				207
				208	/* Unicode freelist & memory allocation */
				209	if (unicode_freelist) {
				210	unicode = unicode_freelist;
				211	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				212	unicode_freelist_size--;
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	213	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	215	/* Keep-Alive optimization: we only upsize the buffer,
				216	never downsize it. */
				217	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	219	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	220	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	}
				222	}
				223	else
				224	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				225	}
				226	else {
				227	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				228	if (unicode == NULL)
				229	return NULL;
				230	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				231	}
				232
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	233	if (!unicode->str) {
				234	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	235	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	236	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str[length] = 0;
				238	unicode->length = length;
				239	unicode->hash = -1;
				240	unicode->utf8str = NULL;
				241	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242
				243	onError:
				244	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	245	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	246	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	247	}
				248
				249	static
				250	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				251	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	253	/* Keep-Alive optimization */
				254	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	255	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	256	unicode->str = NULL;
				257	unicode->length = 0;
				258	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	259	if (unicode->utf8str) {
				260	Py_DECREF(unicode->utf8str);
				261	unicode->utf8str = NULL;
				262	}
				263	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	264	(PyUnicodeObject *)unicode = unicode_freelist;
				265	unicode_freelist = unicode;
				266	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	267	}
				268	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	269	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	270	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	271	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	}
				274
				275	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				276	int size)
				277	{
				278	PyUnicodeObject *unicode;
				279
				280	unicode = _PyUnicode_New(size);
				281	if (!unicode)
				282	return NULL;
				283
				284	/* Copy the Unicode data into the new object */
				285	if (u != NULL)
				286	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				287
				288	return (PyObject *)unicode;
				289	}
				290
				291	#ifdef HAVE_WCHAR_H
				292
				293	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				294	int size)
				295	{
				296	PyUnicodeObject *unicode;
				297
				298	if (w == NULL) {
				299	PyErr_BadInternalCall();
				300	return NULL;
				301	}
				302
				303	unicode = _PyUnicode_New(size);
				304	if (!unicode)
				305	return NULL;
				306
				307	/* Copy the wchar_t data into the new object */
				308	#ifdef HAVE_USABLE_WCHAR_T
				309	memcpy(unicode->str, w, size * sizeof(wchar_t));
				310	#else
				311	{
				312	register Py_UNICODE *u;
				313	register int i;
				314	u = PyUnicode_AS_UNICODE(unicode);
				315	for (i = size; i >= 0; i--)
				316	u++ = w++;
				317	}
				318	#endif
				319
				320	return (PyObject *)unicode;
				321	}
				322
				323	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				324	register wchar_t *w,
				325	int size)
				326	{
				327	if (unicode == NULL) {
				328	PyErr_BadInternalCall();
				329	return -1;
				330	}
				331	if (size > PyUnicode_GET_SIZE(unicode))
				332	size = PyUnicode_GET_SIZE(unicode);
				333	#ifdef HAVE_USABLE_WCHAR_T
				334	memcpy(w, unicode->str, size * sizeof(wchar_t));
				335	#else
				336	{
				337	register Py_UNICODE *u;
				338	register int i;
				339	u = PyUnicode_AS_UNICODE(unicode);
				340	for (i = size; i >= 0; i--)
				341	w++ = u++;
				342	}
				343	#endif
				344
				345	return size;
				346	}
				347
				348	#endif
				349
				350	PyObject PyUnicode_FromObject(register PyObject obj)
				351	{
				352	const char *s;
				353	int len;
				354
				355	if (obj == NULL) {
				356	PyErr_BadInternalCall();
				357	return NULL;
				358	}
				359	else if (PyUnicode_Check(obj)) {
				360	Py_INCREF(obj);
				361	return obj;
				362	}
				363	else if (PyString_Check(obj)) {
				364	s = PyString_AS_STRING(obj);
				365	len = PyString_GET_SIZE(obj);
				366	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	367	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				368	/* Overwrite the error message with something more useful in
				369	case of a TypeError. */
				370	if (PyErr_ExceptionMatches(PyExc_TypeError))
				371	PyErr_SetString(PyExc_TypeError,
				372	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	373	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	374	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	375	if (len == 0) {
				376	Py_INCREF(unicode_empty);
				377	return (PyObject *)unicode_empty;
				378	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	379	return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	380	}
				381
				382	PyObject PyUnicode_Decode(const char s,
				383	int size,
				384	const char *encoding,
				385	const char *errors)
				386	{
				387	PyObject buffer = NULL, unicode;
				388
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	389	if (encoding == NULL)
				390	encoding = PyUnicode_GetDefaultEncoding();
				391
				392	/* Shortcuts for common default encodings */
				393	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	394	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	395	else if (strcmp(encoding, "latin-1") == 0)
				396	return PyUnicode_DecodeLatin1(s, size, errors);
				397	else if (strcmp(encoding, "ascii") == 0)
				398	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399
				400	/* Decode via the codec registry */
				401	buffer = PyBuffer_FromMemory((void *)s, size);
				402	if (buffer == NULL)
				403	goto onError;
				404	unicode = PyCodec_Decode(buffer, encoding, errors);
				405	if (unicode == NULL)
				406	goto onError;
				407	if (!PyUnicode_Check(unicode)) {
				408	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	409	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	410	unicode->ob_type->tp_name);
				411	Py_DECREF(unicode);
				412	goto onError;
				413	}
				414	Py_DECREF(buffer);
				415	return unicode;
				416
				417	onError:
				418	Py_XDECREF(buffer);
				419	return NULL;
				420	}
				421
				422	PyObject PyUnicode_Encode(const Py_UNICODE s,
				423	int size,
				424	const char *encoding,
				425	const char *errors)
				426	{
				427	PyObject v, unicode;
				428
				429	unicode = PyUnicode_FromUnicode(s, size);
				430	if (unicode == NULL)
				431	return NULL;
				432	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				433	Py_DECREF(unicode);
				434	return v;
				435	}
				436
				437	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				438	const char *encoding,
				439	const char *errors)
				440	{
				441	PyObject *v;
				442
				443	if (!PyUnicode_Check(unicode)) {
				444	PyErr_BadArgument();
				445	goto onError;
				446	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	447
				448	if (encoding == NULL)
				449	encoding = PyUnicode_GetDefaultEncoding();
				450
				451	/* Shortcuts for common default encodings */
				452	if (errors == NULL) {
				453	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	454	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	455	else if (strcmp(encoding, "latin-1") == 0)
				456	return PyUnicode_AsLatin1String(unicode);
				457	else if (strcmp(encoding, "ascii") == 0)
				458	return PyUnicode_AsASCIIString(unicode);
				459	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	460
				461	/* Encode via the codec registry */
				462	v = PyCodec_Encode(unicode, encoding, errors);
				463	if (v == NULL)
				464	goto onError;
				465	/* XXX Should we really enforce this ? */
				466	if (!PyString_Check(v)) {
				467	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	468	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	469	v->ob_type->tp_name);
				470	Py_DECREF(v);
				471	goto onError;
				472	}
				473	return v;
				474
				475	onError:
				476	return NULL;
				477	}
				478
				479	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				480	{
				481	if (!PyUnicode_Check(unicode)) {
				482	PyErr_BadArgument();
				483	goto onError;
				484	}
				485	return PyUnicode_AS_UNICODE(unicode);
				486
				487	onError:
				488	return NULL;
				489	}
				490
				491	int PyUnicode_GetSize(PyObject *unicode)
				492	{
				493	if (!PyUnicode_Check(unicode)) {
				494	PyErr_BadArgument();
				495	goto onError;
				496	}
				497	return PyUnicode_GET_SIZE(unicode);
				498
				499	onError:
				500	return -1;
				501	}
				502
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	503	const char *PyUnicode_GetDefaultEncoding()
				504	{
				505	return unicode_default_encoding;
				506	}
				507
				508	int PyUnicode_SetDefaultEncoding(const char *encoding)
				509	{
				510	PyObject *v;
				511
				512	/* Make sure the encoding is valid. As side effect, this also
				513	loads the encoding into the codec registry cache. */
				514	v = _PyCodec_Lookup(encoding);
				515	if (v == NULL)
				516	goto onError;
				517	Py_DECREF(v);
				518	strncpy(unicode_default_encoding,
				519	encoding,
				520	sizeof(unicode_default_encoding));
				521	return 0;
				522
				523	onError:
				524	return -1;
				525	}
				526
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	527	/* --- UTF-8 Codec -------------------------------------------------------- */
				528
				529	static
				530	char utf8_code_length[256] = {
				531	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				532	illegal prefix. see RFC 2279 for details */
				533	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				534	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				535	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				536	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				537	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				538	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				539	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				540	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				541	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				542	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				543	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				544	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				545	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				546	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				547	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				548	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				549	};
				550
				551	static
				552	int utf8_decoding_error(const char **source,
				553	Py_UNICODE **dest,
				554	const char *errors,
				555	const char *details)
				556	{
				557	if ((errors == NULL) \|\|
				558	(strcmp(errors,"strict") == 0)) {
				559	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	560	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	561	details);
				562	return -1;
				563	}
				564	else if (strcmp(errors,"ignore") == 0) {
				565	(*source)++;
				566	return 0;
				567	}
				568	else if (strcmp(errors,"replace") == 0) {
				569	(*source)++;
				570	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				571	(*dest)++;
				572	return 0;
				573	}
				574	else {
				575	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	576	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	577	errors);
				578	return -1;
				579	}
				580	}
				581
				582	#define UTF8_ERROR(details) do { \
				583	if (utf8_decoding_error(&s, &p, errors, details)) \
				584	goto onError; \
				585	continue; \
				586	} while (0)
				587
				588	PyObject PyUnicode_DecodeUTF8(const char s,
				589	int size,
				590	const char *errors)
				591	{
				592	int n;
				593	const char *e;
				594	PyUnicodeObject *unicode;
				595	Py_UNICODE *p;
				596
				597	/* Note: size will always be longer than the resulting Unicode
				598	character count */
				599	unicode = _PyUnicode_New(size);
				600	if (!unicode)
				601	return NULL;
				602	if (size == 0)
				603	return (PyObject *)unicode;
				604
				605	/* Unpack UTF-8 encoded data */
				606	p = unicode->str;
				607	e = s + size;
				608
				609	while (s < e) {
				610	register Py_UNICODE ch = (unsigned char)*s;
				611
				612	if (ch < 0x80) {
				613	*p++ = ch;
				614	s++;
				615	continue;
				616	}
				617
				618	n = utf8_code_length[ch];
				619
				620	if (s + n > e)
				621	UTF8_ERROR("unexpected end of data");
				622
				623	switch (n) {
				624
				625	case 0:
				626	UTF8_ERROR("unexpected code byte");
				627	break;
				628
				629	case 1:
				630	UTF8_ERROR("internal error");
				631	break;
				632
				633	case 2:
				634	if ((s[1] & 0xc0) != 0x80)
				635	UTF8_ERROR("invalid data");
				636	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				637	if (ch < 0x80)
				638	UTF8_ERROR("illegal encoding");
				639	else
				640	*p++ = ch;
				641	break;
				642
				643	case 3:
				644	if ((s[1] & 0xc0) != 0x80 \|\|
				645	(s[2] & 0xc0) != 0x80)
				646	UTF8_ERROR("invalid data");
				647	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				648	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				649	UTF8_ERROR("illegal encoding");
				650	else
				651	*p++ = ch;
				652	break;
				653
				654	default:
				655	/* Other sizes are only needed for UCS-4 */
				656	UTF8_ERROR("unsupported Unicode code range");
				657	}
				658	s += n;
				659	}
				660
				661	/* Adjust length */
				662	if (_PyUnicode_Resize(unicode, p - unicode->str))
				663	goto onError;
				664
				665	return (PyObject *)unicode;
				666
				667	onError:
				668	Py_DECREF(unicode);
				669	return NULL;
				670	}
				671
				672	#undef UTF8_ERROR
				673
				674	static
				675	int utf8_encoding_error(const Py_UNICODE **source,
				676	char **dest,
				677	const char *errors,
				678	const char *details)
				679	{
				680	if ((errors == NULL) \|\|
				681	(strcmp(errors,"strict") == 0)) {
				682	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	683	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	684	details);
				685	return -1;
				686	}
				687	else if (strcmp(errors,"ignore") == 0) {
				688	return 0;
				689	}
				690	else if (strcmp(errors,"replace") == 0) {
				691	**dest = '?';
				692	(*dest)++;
				693	return 0;
				694	}
				695	else {
				696	PyErr_Format(PyExc_ValueError,
				697	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	698	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	errors);
				700	return -1;
				701	}
				702	}
				703
				704	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				705	int size,
				706	const char *errors)
				707	{
				708	PyObject *v;
				709	char *p;
				710	char *q;
				711
				712	v = PyString_FromStringAndSize(NULL, 3 * size);
				713	if (v == NULL)
				714	return NULL;
				715	if (size == 0)
				716	goto done;
				717
				718	p = q = PyString_AS_STRING(v);
				719	while (size-- > 0) {
				720	Py_UNICODE ch = *s++;
				721	if (ch < 0x80)
				722	*p++ = (char) ch;
				723	else if (ch < 0x0800) {
				724	*p++ = 0xc0 \| (ch >> 6);
				725	*p++ = 0x80 \| (ch & 0x3f);
				726	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				727	/* These byte ranges are reserved for UTF-16 surrogate
				728	bytes which the Python implementation currently does
				729	not support. */
				730	printf("code range problem: U+%04x\n", ch);
				731	if (utf8_encoding_error(&s, &p, errors,
				732	"unsupported code range"))
				733	goto onError;
				734	} else {
				735	*p++ = 0xe0 \| (ch >> 12);
				736	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				737	*p++ = 0x80 \| (ch & 0x3f);
				738	}
				739	}
				740	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	741	if (_PyString_Resize(&v, p - q))
				742	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	743
				744	done:
				745	return v;
				746
				747	onError:
				748	Py_DECREF(v);
				749	return NULL;
				750	}
				751
				752	/* Return a Python string holding the UTF-8 encoded value of the
				753	Unicode object.
				754
				755	The resulting string is cached in the Unicode object for subsequent
				756	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	757	the character buffer interface and will live (at least) as long as
				758	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	759
				760	The refcount of the string is not incremented.
				761
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	762	* Exported for internal use by the interpreter only !!! *
				763
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	764	*/
				765
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	766	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	767	const char *errors)
				768	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	769	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	770
				771	if (v)
				772	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	773	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				774	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775	errors);
				776	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	777	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	778	return v;
				779	}
				780
				781	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				782	{
				783	PyObject *str;
				784
				785	if (!PyUnicode_Check(unicode)) {
				786	PyErr_BadArgument();
				787	return NULL;
				788	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	789	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	790	if (str == NULL)
				791	return NULL;
				792	Py_INCREF(str);
				793	return str;
				794	}
				795
				796	/* --- UTF-16 Codec ------------------------------------------------------- */
				797
				798	static
				799	int utf16_decoding_error(const Py_UNICODE **source,
				800	Py_UNICODE **dest,
				801	const char *errors,
				802	const char *details)
				803	{
				804	if ((errors == NULL) \|\|
				805	(strcmp(errors,"strict") == 0)) {
				806	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	807	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	808	details);
				809	return -1;
				810	}
				811	else if (strcmp(errors,"ignore") == 0) {
				812	return 0;
				813	}
				814	else if (strcmp(errors,"replace") == 0) {
				815	if (dest) {
				816	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				817	(*dest)++;
				818	}
				819	return 0;
				820	}
				821	else {
				822	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	823	"UTF-16 decoding error; "
				824	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	825	errors);
				826	return -1;
				827	}
				828	}
				829
				830	#define UTF16_ERROR(details) do { \
				831	if (utf16_decoding_error(&q, &p, errors, details)) \
				832	goto onError; \
				833	continue; \
				834	} while(0)
				835
				836	PyObject PyUnicode_DecodeUTF16(const char s,
				837	int size,
				838	const char *errors,
				839	int *byteorder)
				840	{
				841	PyUnicodeObject *unicode;
				842	Py_UNICODE *p;
				843	const Py_UNICODE q, e;
				844	int bo = 0;
				845
				846	/* size should be an even number */
				847	if (size % sizeof(Py_UNICODE) != 0) {
				848	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				849	return NULL;
				850	/* The remaining input chars are ignored if we fall through
				851	here... */
				852	}
				853
				854	/* Note: size will always be longer than the resulting Unicode
				855	character count */
				856	unicode = _PyUnicode_New(size);
				857	if (!unicode)
				858	return NULL;
				859	if (size == 0)
				860	return (PyObject *)unicode;
				861
				862	/* Unpack UTF-16 encoded data */
				863	p = unicode->str;
				864	q = (Py_UNICODE *)s;
				865	e = q + (size / sizeof(Py_UNICODE));
				866
				867	if (byteorder)
				868	bo = *byteorder;
				869
				870	while (q < e) {
				871	register Py_UNICODE ch = *q++;
				872
				873	/* Check for BOM marks (U+FEFF) in the input and adjust
				874	current byte order setting accordingly. Swap input
				875	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				876	!) */
				877	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				878	if (ch == 0xFEFF) {
				879	bo = -1;
				880	continue;
				881	} else if (ch == 0xFFFE) {
				882	bo = 1;
				883	continue;
				884	}
				885	if (bo == 1)
				886	ch = (ch >> 8) \| (ch << 8);
				887	#else
				888	if (ch == 0xFEFF) {
				889	bo = 1;
				890	continue;
				891	} else if (ch == 0xFFFE) {
				892	bo = -1;
				893	continue;
				894	}
				895	if (bo == -1)
				896	ch = (ch >> 8) \| (ch << 8);
				897	#endif
				898	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				899	*p++ = ch;
				900	continue;
				901	}
				902
				903	/* UTF-16 code pair: */
				904	if (q >= e)
				905	UTF16_ERROR("unexpected end of data");
				906	if (0xDC00 <= q && q <= 0xDFFF) {
				907	q++;
				908	if (0xD800 <= q && q <= 0xDBFF)
				909	/* This is valid data (a UTF-16 surrogate pair), but
				910	we are not able to store this information since our
				911	Py_UNICODE type only has 16 bits... this might
				912	change someday, even though it's unlikely. */
				913	UTF16_ERROR("code pairs are not supported");
				914	else
				915	continue;
				916	}
				917	UTF16_ERROR("illegal encoding");
				918	}
				919
				920	if (byteorder)
				921	*byteorder = bo;
				922
				923	/* Adjust length */
				924	if (_PyUnicode_Resize(unicode, p - unicode->str))
				925	goto onError;
				926
				927	return (PyObject *)unicode;
				928
				929	onError:
				930	Py_DECREF(unicode);
				931	return NULL;
				932	}
				933
				934	#undef UTF16_ERROR
				935
				936	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				937	int size,
				938	const char *errors,
				939	int byteorder)
				940	{
				941	PyObject *v;
				942	Py_UNICODE *p;
				943	char *q;
				944
				945	/* We don't create UTF-16 pairs... */
				946	v = PyString_FromStringAndSize(NULL,
				947	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				948	if (v == NULL)
				949	return NULL;
				950	if (size == 0)
				951	goto done;
				952
				953	q = PyString_AS_STRING(v);
				954	p = (Py_UNICODE *)q;
				955
				956	if (byteorder == 0)
				957	*p++ = 0xFEFF;
				958	if (byteorder == 0 \|\|
				959	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				960	byteorder == -1
				961	#else
				962	byteorder == 1
				963	#endif
				964	)
				965	memcpy(p, s, size * sizeof(Py_UNICODE));
				966	else
				967	while (size-- > 0) {
				968	Py_UNICODE ch = *s++;
				969	*p++ = (ch >> 8) \| (ch << 8);
				970	}
				971	done:
				972	return v;
				973	}
				974
				975	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				976	{
				977	if (!PyUnicode_Check(unicode)) {
				978	PyErr_BadArgument();
				979	return NULL;
				980	}
				981	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				982	PyUnicode_GET_SIZE(unicode),
				983	NULL,
				984	0);
				985	}
				986
				987	/* --- Unicode Escape Codec ----------------------------------------------- */
				988
				989	static
				990	int unicodeescape_decoding_error(const char **source,
				991	unsigned int *x,
				992	const char *errors,
				993	const char *details)
				994	{
				995	if ((errors == NULL) \|\|
				996	(strcmp(errors,"strict") == 0)) {
				997	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	998	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	details);
				1000	return -1;
				1001	}
				1002	else if (strcmp(errors,"ignore") == 0) {
				1003	return 0;
				1004	}
				1005	else if (strcmp(errors,"replace") == 0) {
				1006	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				1007	return 0;
				1008	}
				1009	else {
				1010	PyErr_Format(PyExc_ValueError,
				1011	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1012	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1013	errors);
				1014	return -1;
				1015	}
				1016	}
				1017
				1018	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1019	int size,
				1020	const char *errors)
				1021	{
				1022	PyUnicodeObject *v;
				1023	Py_UNICODE p = NULL, buf = NULL;
				1024	const char *end;
				1025
				1026	/* Escaped strings will always be longer than the resulting
				1027	Unicode string, so we start with size here and then reduce the
				1028	length after conversion to the true value. */
				1029	v = _PyUnicode_New(size);
				1030	if (v == NULL)
				1031	goto onError;
				1032	if (size == 0)
				1033	return (PyObject *)v;
				1034	p = buf = PyUnicode_AS_UNICODE(v);
				1035	end = s + size;
				1036	while (s < end) {
				1037	unsigned char c;
				1038	unsigned int x;
				1039	int i;
				1040
				1041	/* Non-escape characters are interpreted as Unicode ordinals */
				1042	if (*s != '\\') {
				1043	p++ = (unsigned char)s++;
				1044	continue;
				1045	}
				1046
				1047	/* \ - Escapes */
				1048	s++;
				1049	switch (*s++) {
				1050
				1051	/* \x escapes */
				1052	case '\n': break;
				1053	case '\\': *p++ = '\\'; break;
				1054	case '\'': *p++ = '\''; break;
				1055	case '\"': *p++ = '\"'; break;
				1056	case 'b': *p++ = '\b'; break;
				1057	case 'f': p++ = '\014'; break; / FF */
				1058	case 't': *p++ = '\t'; break;
				1059	case 'n': *p++ = '\n'; break;
				1060	case 'r': *p++ = '\r'; break;
				1061	case 'v': p++ = '\013'; break; / VT */
				1062	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1063
				1064	/* \OOO (octal) escapes */
				1065	case '0': case '1': case '2': case '3':
				1066	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1067	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1068	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1069	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1070	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1071	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1072	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1073	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1074	break;
				1075
				1076	/* \xXXXX escape with 0-4 hex digits */
				1077	case 'x':
				1078	x = 0;
				1079	c = (unsigned char)*s;
				1080	if (isxdigit(c)) {
				1081	do {
				1082	x = (x<<4) & ~0xF;
				1083	if ('0' <= c && c <= '9')
				1084	x += c - '0';
				1085	else if ('a' <= c && c <= 'f')
				1086	x += 10 + c - 'a';
				1087	else
				1088	x += 10 + c - 'A';
				1089	c = (unsigned char)*++s;
				1090	} while (isxdigit(c));
				1091	*p++ = x;
				1092	} else {
				1093	*p++ = '\\';
				1094	*p++ = (unsigned char)s[-1];
				1095	}
				1096	break;
				1097
				1098	/* \uXXXX with 4 hex digits */
				1099	case 'u':
				1100	for (x = 0, i = 0; i < 4; i++) {
				1101	c = (unsigned char)s[i];
				1102	if (!isxdigit(c)) {
				1103	if (unicodeescape_decoding_error(&s, &x, errors,
				1104	"truncated \\uXXXX"))
				1105	goto onError;
				1106	i++;
				1107	break;
				1108	}
				1109	x = (x<<4) & ~0xF;
				1110	if (c >= '0' && c <= '9')
				1111	x += c - '0';
				1112	else if (c >= 'a' && c <= 'f')
				1113	x += 10 + c - 'a';
				1114	else
				1115	x += 10 + c - 'A';
				1116	}
				1117	s += i;
				1118	*p++ = x;
				1119	break;
				1120
				1121	default:
				1122	*p++ = '\\';
				1123	*p++ = (unsigned char)s[-1];
				1124	break;
				1125	}
				1126	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1127	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1128	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1129	return (PyObject *)v;
				1130
				1131	onError:
				1132	Py_XDECREF(v);
				1133	return NULL;
				1134	}
				1135
				1136	/* Return a Unicode-Escape string version of the Unicode object.
				1137
				1138	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1139	appropriate.
				1140
				1141	*/
				1142
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1143	static const Py_UNICODE findchar(const Py_UNICODE s,
				1144	int size,
				1145	Py_UNICODE ch);
				1146
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1147	static
				1148	PyObject unicodeescape_string(const Py_UNICODE s,
				1149	int size,
				1150	int quotes)
				1151	{
				1152	PyObject *repr;
				1153	char *p;
				1154	char *q;
				1155
				1156	static const char *hexdigit = "0123456789ABCDEF";
				1157
				1158	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1159	if (repr == NULL)
				1160	return NULL;
				1161
				1162	p = q = PyString_AS_STRING(repr);
				1163
				1164	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1165	*p++ = 'u';
				1166	*p++ = (findchar(s, size, '\'') &&
				1167	!findchar(s, size, '"')) ? '"' : '\'';
				1168	}
				1169	while (size-- > 0) {
				1170	Py_UNICODE ch = *s++;
				1171	/* Escape quotes */
				1172	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1173	*p++ = '\\';
				1174	*p++ = (char) ch;
				1175	}
				1176	/* Map 16-bit characters to '\uxxxx' */
				1177	else if (ch >= 256) {
				1178	*p++ = '\\';
				1179	*p++ = 'u';
				1180	*p++ = hexdigit[(ch >> 12) & 0xf];
				1181	*p++ = hexdigit[(ch >> 8) & 0xf];
				1182	*p++ = hexdigit[(ch >> 4) & 0xf];
				1183	*p++ = hexdigit[ch & 15];
				1184	}
				1185	/* Map non-printable US ASCII to '\ooo' */
				1186	else if (ch < ' ' \|\| ch >= 128) {
				1187	*p++ = '\\';
				1188	*p++ = hexdigit[(ch >> 6) & 7];
				1189	*p++ = hexdigit[(ch >> 3) & 7];
				1190	*p++ = hexdigit[ch & 7];
				1191	}
				1192	/* Copy everything else as-is */
				1193	else
				1194	*p++ = (char) ch;
				1195	}
				1196	if (quotes)
				1197	*p++ = q[1];
				1198
				1199	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1200	if (_PyString_Resize(&repr, p - q))
				1201	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1202
				1203	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1204
				1205	onError:
				1206	Py_DECREF(repr);
				1207	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1208	}
				1209
				1210	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1211	int size)
				1212	{
				1213	return unicodeescape_string(s, size, 0);
				1214	}
				1215
				1216	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1217	{
				1218	if (!PyUnicode_Check(unicode)) {
				1219	PyErr_BadArgument();
				1220	return NULL;
				1221	}
				1222	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1223	PyUnicode_GET_SIZE(unicode));
				1224	}
				1225
				1226	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1227
				1228	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1229	int size,
				1230	const char *errors)
				1231	{
				1232	PyUnicodeObject *v;
				1233	Py_UNICODE p, buf;
				1234	const char *end;
				1235	const char *bs;
				1236
				1237	/* Escaped strings will always be longer than the resulting
				1238	Unicode string, so we start with size here and then reduce the
				1239	length after conversion to the true value. */
				1240	v = _PyUnicode_New(size);
				1241	if (v == NULL)
				1242	goto onError;
				1243	if (size == 0)
				1244	return (PyObject *)v;
				1245	p = buf = PyUnicode_AS_UNICODE(v);
				1246	end = s + size;
				1247	while (s < end) {
				1248	unsigned char c;
				1249	unsigned int x;
				1250	int i;
				1251
				1252	/* Non-escape characters are interpreted as Unicode ordinals */
				1253	if (*s != '\\') {
				1254	p++ = (unsigned char)s++;
				1255	continue;
				1256	}
				1257
				1258	/* \u-escapes are only interpreted iff the number of leading
				1259	backslashes if odd */
				1260	bs = s;
				1261	for (;s < end;) {
				1262	if (*s != '\\')
				1263	break;
				1264	p++ = (unsigned char)s++;
				1265	}
				1266	if (((s - bs) & 1) == 0 \|\|
				1267	s >= end \|\|
				1268	*s != 'u') {
				1269	continue;
				1270	}
				1271	p--;
				1272	s++;
				1273
				1274	/* \uXXXX with 4 hex digits */
				1275	for (x = 0, i = 0; i < 4; i++) {
				1276	c = (unsigned char)s[i];
				1277	if (!isxdigit(c)) {
				1278	if (unicodeescape_decoding_error(&s, &x, errors,
				1279	"truncated \\uXXXX"))
				1280	goto onError;
				1281	i++;
				1282	break;
				1283	}
				1284	x = (x<<4) & ~0xF;
				1285	if (c >= '0' && c <= '9')
				1286	x += c - '0';
				1287	else if (c >= 'a' && c <= 'f')
				1288	x += 10 + c - 'a';
				1289	else
				1290	x += 10 + c - 'A';
				1291	}
				1292	s += i;
				1293	*p++ = x;
				1294	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1295	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1296	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1297	return (PyObject *)v;
				1298
				1299	onError:
				1300	Py_XDECREF(v);
				1301	return NULL;
				1302	}
				1303
				1304	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1305	int size)
				1306	{
				1307	PyObject *repr;
				1308	char *p;
				1309	char *q;
				1310
				1311	static const char *hexdigit = "0123456789ABCDEF";
				1312
				1313	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1314	if (repr == NULL)
				1315	return NULL;
				1316
				1317	p = q = PyString_AS_STRING(repr);
				1318	while (size-- > 0) {
				1319	Py_UNICODE ch = *s++;
				1320	/* Map 16-bit characters to '\uxxxx' */
				1321	if (ch >= 256) {
				1322	*p++ = '\\';
				1323	*p++ = 'u';
				1324	*p++ = hexdigit[(ch >> 12) & 0xf];
				1325	*p++ = hexdigit[(ch >> 8) & 0xf];
				1326	*p++ = hexdigit[(ch >> 4) & 0xf];
				1327	*p++ = hexdigit[ch & 15];
				1328	}
				1329	/* Copy everything else as-is */
				1330	else
				1331	*p++ = (char) ch;
				1332	}
				1333	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1334	if (_PyString_Resize(&repr, p - q))
				1335	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1336
				1337	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1338
				1339	onError:
				1340	Py_DECREF(repr);
				1341	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1342	}
				1343
				1344	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1345	{
				1346	if (!PyUnicode_Check(unicode)) {
				1347	PyErr_BadArgument();
				1348	return NULL;
				1349	}
				1350	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1351	PyUnicode_GET_SIZE(unicode));
				1352	}
				1353
				1354	/* --- Latin-1 Codec ------------------------------------------------------ */
				1355
				1356	PyObject PyUnicode_DecodeLatin1(const char s,
				1357	int size,
				1358	const char *errors)
				1359	{
				1360	PyUnicodeObject *v;
				1361	Py_UNICODE *p;
				1362
				1363	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1364	v = _PyUnicode_New(size);
				1365	if (v == NULL)
				1366	goto onError;
				1367	if (size == 0)
				1368	return (PyObject *)v;
				1369	p = PyUnicode_AS_UNICODE(v);
				1370	while (size-- > 0)
				1371	p++ = (unsigned char)s++;
				1372	return (PyObject *)v;
				1373
				1374	onError:
				1375	Py_XDECREF(v);
				1376	return NULL;
				1377	}
				1378
				1379	static
				1380	int latin1_encoding_error(const Py_UNICODE **source,
				1381	char **dest,
				1382	const char *errors,
				1383	const char *details)
				1384	{
				1385	if ((errors == NULL) \|\|
				1386	(strcmp(errors,"strict") == 0)) {
				1387	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1388	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1389	details);
				1390	return -1;
				1391	}
				1392	else if (strcmp(errors,"ignore") == 0) {
				1393	return 0;
				1394	}
				1395	else if (strcmp(errors,"replace") == 0) {
				1396	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1397	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1398	return 0;
				1399	}
				1400	else {
				1401	PyErr_Format(PyExc_ValueError,
				1402	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1403	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1404	errors);
				1405	return -1;
				1406	}
				1407	}
				1408
				1409	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1410	int size,
				1411	const char *errors)
				1412	{
				1413	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1414	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	repr = PyString_FromStringAndSize(NULL, size);
				1416	if (repr == NULL)
				1417	return NULL;
				1418
				1419	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1420	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1421	while (size-- > 0) {
				1422	Py_UNICODE ch = *p++;
				1423	if (ch >= 256) {
				1424	if (latin1_encoding_error(&p, &s, errors,
				1425	"ordinal not in range(256)"))
				1426	goto onError;
				1427	}
				1428	else
				1429	*s++ = (char)ch;
				1430	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1431	/* Resize if error handling skipped some characters */
				1432	if (s - start < PyString_GET_SIZE(repr))
				1433	if (_PyString_Resize(&repr, s - start))
				1434	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1435	return repr;
				1436
				1437	onError:
				1438	Py_DECREF(repr);
				1439	return NULL;
				1440	}
				1441
				1442	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1443	{
				1444	if (!PyUnicode_Check(unicode)) {
				1445	PyErr_BadArgument();
				1446	return NULL;
				1447	}
				1448	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1449	PyUnicode_GET_SIZE(unicode),
				1450	NULL);
				1451	}
				1452
				1453	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1454
				1455	static
				1456	int ascii_decoding_error(const char **source,
				1457	Py_UNICODE **dest,
				1458	const char *errors,
				1459	const char *details)
				1460	{
				1461	if ((errors == NULL) \|\|
				1462	(strcmp(errors,"strict") == 0)) {
				1463	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1464	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1465	details);
				1466	return -1;
				1467	}
				1468	else if (strcmp(errors,"ignore") == 0) {
				1469	return 0;
				1470	}
				1471	else if (strcmp(errors,"replace") == 0) {
				1472	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1473	(*dest)++;
				1474	return 0;
				1475	}
				1476	else {
				1477	PyErr_Format(PyExc_ValueError,
				1478	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1479	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1480	errors);
				1481	return -1;
				1482	}
				1483	}
				1484
				1485	PyObject PyUnicode_DecodeASCII(const char s,
				1486	int size,
				1487	const char *errors)
				1488	{
				1489	PyUnicodeObject *v;
				1490	Py_UNICODE *p;
				1491
				1492	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1493	v = _PyUnicode_New(size);
				1494	if (v == NULL)
				1495	goto onError;
				1496	if (size == 0)
				1497	return (PyObject *)v;
				1498	p = PyUnicode_AS_UNICODE(v);
				1499	while (size-- > 0) {
				1500	register unsigned char c;
				1501
				1502	c = (unsigned char)*s++;
				1503	if (c < 128)
				1504	*p++ = c;
				1505	else if (ascii_decoding_error(&s, &p, errors,
				1506	"ordinal not in range(128)"))
				1507	goto onError;
				1508	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1509	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1510	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1511	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1512	return (PyObject *)v;
				1513
				1514	onError:
				1515	Py_XDECREF(v);
				1516	return NULL;
				1517	}
				1518
				1519	static
				1520	int ascii_encoding_error(const Py_UNICODE **source,
				1521	char **dest,
				1522	const char *errors,
				1523	const char *details)
				1524	{
				1525	if ((errors == NULL) \|\|
				1526	(strcmp(errors,"strict") == 0)) {
				1527	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1528	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1529	details);
				1530	return -1;
				1531	}
				1532	else if (strcmp(errors,"ignore") == 0) {
				1533	return 0;
				1534	}
				1535	else if (strcmp(errors,"replace") == 0) {
				1536	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1537	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1538	return 0;
				1539	}
				1540	else {
				1541	PyErr_Format(PyExc_ValueError,
				1542	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1543	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1544	errors);
				1545	return -1;
				1546	}
				1547	}
				1548
				1549	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1550	int size,
				1551	const char *errors)
				1552	{
				1553	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1554	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1555	repr = PyString_FromStringAndSize(NULL, size);
				1556	if (repr == NULL)
				1557	return NULL;
				1558
				1559	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1560	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1561	while (size-- > 0) {
				1562	Py_UNICODE ch = *p++;
				1563	if (ch >= 128) {
				1564	if (ascii_encoding_error(&p, &s, errors,
				1565	"ordinal not in range(128)"))
				1566	goto onError;
				1567	}
				1568	else
				1569	*s++ = (char)ch;
				1570	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1571	/* Resize if error handling skipped some characters */
				1572	if (s - start < PyString_GET_SIZE(repr))
				1573	if (_PyString_Resize(&repr, s - start))
				1574	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1575	return repr;
				1576
				1577	onError:
				1578	Py_DECREF(repr);
				1579	return NULL;
				1580	}
				1581
				1582	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1583	{
				1584	if (!PyUnicode_Check(unicode)) {
				1585	PyErr_BadArgument();
				1586	return NULL;
				1587	}
				1588	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1589	PyUnicode_GET_SIZE(unicode),
				1590	NULL);
				1591	}
				1592
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1593	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1594
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1595	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1596
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1597	PyObject PyUnicode_DecodeMBCS(const char s,
				1598	int size,
				1599	const char *errors)
				1600	{
				1601	PyUnicodeObject *v;
				1602	Py_UNICODE *p;
				1603
				1604	/* First get the size of the result */
				1605	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1606	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1607	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1608
				1609	v = _PyUnicode_New(usize);
				1610	if (v == NULL)
				1611	return NULL;
				1612	if (usize == 0)
				1613	return (PyObject *)v;
				1614	p = PyUnicode_AS_UNICODE(v);
				1615	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1616	Py_DECREF(v);
				1617	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1618	}
				1619
				1620	return (PyObject *)v;
				1621	}
				1622
				1623	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1624	int size,
				1625	const char *errors)
				1626	{
				1627	PyObject *repr;
				1628	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1629	DWORD mbcssize;
				1630
				1631	/* If there are no characters, bail now! */
				1632	if (size==0)
				1633	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1634
				1635	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1636	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1637	if (mbcssize==0)
				1638	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1639
				1640	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1641	if (repr == NULL)
				1642	return NULL;
				1643	if (mbcssize==0)
				1644	return repr;
				1645
				1646	/* Do the conversion */
				1647	s = PyString_AS_STRING(repr);
				1648	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1649	Py_DECREF(repr);
				1650	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1651	}
				1652	return repr;
				1653	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1654
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1655	#endif /* MS_WIN32 */
				1656
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1657	/* --- Character Mapping Codec -------------------------------------------- */
				1658
				1659	static
				1660	int charmap_decoding_error(const char **source,
				1661	Py_UNICODE **dest,
				1662	const char *errors,
				1663	const char *details)
				1664	{
				1665	if ((errors == NULL) \|\|
				1666	(strcmp(errors,"strict") == 0)) {
				1667	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1668	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1669	details);
				1670	return -1;
				1671	}
				1672	else if (strcmp(errors,"ignore") == 0) {
				1673	return 0;
				1674	}
				1675	else if (strcmp(errors,"replace") == 0) {
				1676	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1677	(*dest)++;
				1678	return 0;
				1679	}
				1680	else {
				1681	PyErr_Format(PyExc_ValueError,
				1682	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1683	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1684	errors);
				1685	return -1;
				1686	}
				1687	}
				1688
				1689	PyObject PyUnicode_DecodeCharmap(const char s,
				1690	int size,
				1691	PyObject *mapping,
				1692	const char *errors)
				1693	{
				1694	PyUnicodeObject *v;
				1695	Py_UNICODE *p;
				1696
				1697	/* Default to Latin-1 */
				1698	if (mapping == NULL)
				1699	return PyUnicode_DecodeLatin1(s, size, errors);
				1700
				1701	v = _PyUnicode_New(size);
				1702	if (v == NULL)
				1703	goto onError;
				1704	if (size == 0)
				1705	return (PyObject *)v;
				1706	p = PyUnicode_AS_UNICODE(v);
				1707	while (size-- > 0) {
				1708	unsigned char ch = *s++;
				1709	PyObject w, x;
				1710
				1711	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1712	w = PyInt_FromLong((long)ch);
				1713	if (w == NULL)
				1714	goto onError;
				1715	x = PyObject_GetItem(mapping, w);
				1716	Py_DECREF(w);
				1717	if (x == NULL) {
				1718	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1719	/* No mapping found: default to Latin-1 mapping */
				1720	PyErr_Clear();
				1721	*p++ = (Py_UNICODE)ch;
				1722	continue;
				1723	}
				1724	goto onError;
				1725	}
				1726
				1727	/* Apply mapping */
				1728	if (PyInt_Check(x)) {
				1729	int value = PyInt_AS_LONG(x);
				1730	if (value < 0 \|\| value > 65535) {
				1731	PyErr_SetString(PyExc_TypeError,
				1732	"character mapping must be in range(65336)");
				1733	Py_DECREF(x);
				1734	goto onError;
				1735	}
				1736	*p++ = (Py_UNICODE)value;
				1737	}
				1738	else if (x == Py_None) {
				1739	/* undefined mapping */
				1740	if (charmap_decoding_error(&s, &p, errors,
				1741	"character maps to <undefined>")) {
				1742	Py_DECREF(x);
				1743	goto onError;
				1744	}
				1745	}
				1746	else if (PyUnicode_Check(x)) {
				1747	if (PyUnicode_GET_SIZE(x) != 1) {
				1748	/* 1-n mapping */
				1749	PyErr_SetString(PyExc_NotImplementedError,
				1750	"1-n mappings are currently not implemented");
				1751	Py_DECREF(x);
				1752	goto onError;
				1753	}
				1754	p++ = PyUnicode_AS_UNICODE(x);
				1755	}
				1756	else {
				1757	/* wrong return value */
				1758	PyErr_SetString(PyExc_TypeError,
				1759	"character mapping must return integer, None or unicode");
				1760	Py_DECREF(x);
				1761	goto onError;
				1762	}
				1763	Py_DECREF(x);
				1764	}
				1765	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1766	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1767	goto onError;
				1768	return (PyObject *)v;
				1769
				1770	onError:
				1771	Py_XDECREF(v);
				1772	return NULL;
				1773	}
				1774
				1775	static
				1776	int charmap_encoding_error(const Py_UNICODE **source,
				1777	char **dest,
				1778	const char *errors,
				1779	const char *details)
				1780	{
				1781	if ((errors == NULL) \|\|
				1782	(strcmp(errors,"strict") == 0)) {
				1783	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1784	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1785	details);
				1786	return -1;
				1787	}
				1788	else if (strcmp(errors,"ignore") == 0) {
				1789	return 0;
				1790	}
				1791	else if (strcmp(errors,"replace") == 0) {
				1792	**dest = '?';
				1793	(*dest)++;
				1794	return 0;
				1795	}
				1796	else {
				1797	PyErr_Format(PyExc_ValueError,
				1798	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1799	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1800	errors);
				1801	return -1;
				1802	}
				1803	}
				1804
				1805	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1806	int size,
				1807	PyObject *mapping,
				1808	const char *errors)
				1809	{
				1810	PyObject *v;
				1811	char *s;
				1812
				1813	/* Default to Latin-1 */
				1814	if (mapping == NULL)
				1815	return PyUnicode_EncodeLatin1(p, size, errors);
				1816
				1817	v = PyString_FromStringAndSize(NULL, size);
				1818	if (v == NULL)
				1819	return NULL;
				1820	s = PyString_AS_STRING(v);
				1821	while (size-- > 0) {
				1822	Py_UNICODE ch = *p++;
				1823	PyObject w, x;
				1824
				1825	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1826	w = PyInt_FromLong((long)ch);
				1827	if (w == NULL)
				1828	goto onError;
				1829	x = PyObject_GetItem(mapping, w);
				1830	Py_DECREF(w);
				1831	if (x == NULL) {
				1832	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1833	/* No mapping found: default to Latin-1 mapping if possible */
				1834	PyErr_Clear();
				1835	if (ch < 256) {
				1836	*s++ = (char)ch;
				1837	continue;
				1838	}
				1839	else if (!charmap_encoding_error(&p, &s, errors,
				1840	"missing character mapping"))
				1841	continue;
				1842	}
				1843	goto onError;
				1844	}
				1845
				1846	/* Apply mapping */
				1847	if (PyInt_Check(x)) {
				1848	int value = PyInt_AS_LONG(x);
				1849	if (value < 0 \|\| value > 255) {
				1850	PyErr_SetString(PyExc_TypeError,
				1851	"character mapping must be in range(256)");
				1852	Py_DECREF(x);
				1853	goto onError;
				1854	}
				1855	*s++ = (char)value;
				1856	}
				1857	else if (x == Py_None) {
				1858	/* undefined mapping */
				1859	if (charmap_encoding_error(&p, &s, errors,
				1860	"character maps to <undefined>")) {
				1861	Py_DECREF(x);
				1862	goto onError;
				1863	}
				1864	}
				1865	else if (PyString_Check(x)) {
				1866	if (PyString_GET_SIZE(x) != 1) {
				1867	/* 1-n mapping */
				1868	PyErr_SetString(PyExc_NotImplementedError,
				1869	"1-n mappings are currently not implemented");
				1870	Py_DECREF(x);
				1871	goto onError;
				1872	}
				1873	s++ = PyString_AS_STRING(x);
				1874	}
				1875	else {
				1876	/* wrong return value */
				1877	PyErr_SetString(PyExc_TypeError,
				1878	"character mapping must return integer, None or unicode");
				1879	Py_DECREF(x);
				1880	goto onError;
				1881	}
				1882	Py_DECREF(x);
				1883	}
				1884	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1885	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1886	goto onError;
				1887	return v;
				1888
				1889	onError:
				1890	Py_DECREF(v);
				1891	return NULL;
				1892	}
				1893
				1894	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1895	PyObject *mapping)
				1896	{
				1897	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1898	PyErr_BadArgument();
				1899	return NULL;
				1900	}
				1901	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1902	PyUnicode_GET_SIZE(unicode),
				1903	mapping,
				1904	NULL);
				1905	}
				1906
				1907	static
				1908	int translate_error(const Py_UNICODE **source,
				1909	Py_UNICODE **dest,
				1910	const char *errors,
				1911	const char *details)
				1912	{
				1913	if ((errors == NULL) \|\|
				1914	(strcmp(errors,"strict") == 0)) {
				1915	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1916	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1917	details);
				1918	return -1;
				1919	}
				1920	else if (strcmp(errors,"ignore") == 0) {
				1921	return 0;
				1922	}
				1923	else if (strcmp(errors,"replace") == 0) {
				1924	**dest = '?';
				1925	(*dest)++;
				1926	return 0;
				1927	}
				1928	else {
				1929	PyErr_Format(PyExc_ValueError,
				1930	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1931	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1932	errors);
				1933	return -1;
				1934	}
				1935	}
				1936
				1937	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1938	int size,
				1939	PyObject *mapping,
				1940	const char *errors)
				1941	{
				1942	PyUnicodeObject *v;
				1943	Py_UNICODE *p;
				1944
				1945	if (mapping == NULL) {
				1946	PyErr_BadArgument();
				1947	return NULL;
				1948	}
				1949
				1950	/* Output will never be longer than input */
				1951	v = _PyUnicode_New(size);
				1952	if (v == NULL)
				1953	goto onError;
				1954	if (size == 0)
				1955	goto done;
				1956	p = PyUnicode_AS_UNICODE(v);
				1957	while (size-- > 0) {
				1958	Py_UNICODE ch = *s++;
				1959	PyObject w, x;
				1960
				1961	/* Get mapping */
				1962	w = PyInt_FromLong(ch);
				1963	if (w == NULL)
				1964	goto onError;
				1965	x = PyObject_GetItem(mapping, w);
				1966	Py_DECREF(w);
				1967	if (x == NULL) {
				1968	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1969	/* No mapping found: default to 1-1 mapping */
				1970	PyErr_Clear();
				1971	*p++ = ch;
				1972	continue;
				1973	}
				1974	goto onError;
				1975	}
				1976
				1977	/* Apply mapping */
				1978	if (PyInt_Check(x))
				1979	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1980	else if (x == Py_None) {
				1981	/* undefined mapping */
				1982	if (translate_error(&s, &p, errors,
				1983	"character maps to <undefined>")) {
				1984	Py_DECREF(x);
				1985	goto onError;
				1986	}
				1987	}
				1988	else if (PyUnicode_Check(x)) {
				1989	if (PyUnicode_GET_SIZE(x) != 1) {
				1990	/* 1-n mapping */
				1991	PyErr_SetString(PyExc_NotImplementedError,
				1992	"1-n mappings are currently not implemented");
				1993	Py_DECREF(x);
				1994	goto onError;
				1995	}
				1996	p++ = PyUnicode_AS_UNICODE(x);
				1997	}
				1998	else {
				1999	/* wrong return value */
				2000	PyErr_SetString(PyExc_TypeError,
				2001	"translate mapping must return integer, None or unicode");
				2002	Py_DECREF(x);
				2003	goto onError;
				2004	}
				2005	Py_DECREF(x);
				2006	}
				2007	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2008	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2009	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2010
				2011	done:
				2012	return (PyObject *)v;
				2013
				2014	onError:
				2015	Py_XDECREF(v);
				2016	return NULL;
				2017	}
				2018
				2019	PyObject PyUnicode_Translate(PyObject str,
				2020	PyObject *mapping,
				2021	const char *errors)
				2022	{
				2023	PyObject *result;
				2024
				2025	str = PyUnicode_FromObject(str);
				2026	if (str == NULL)
				2027	goto onError;
				2028	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2029	PyUnicode_GET_SIZE(str),
				2030	mapping,
				2031	errors);
				2032	Py_DECREF(str);
				2033	return result;
				2034
				2035	onError:
				2036	Py_XDECREF(str);
				2037	return NULL;
				2038	}
				2039
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2040	/* --- Decimal Encoder ---------------------------------------------------- */
				2041
				2042	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2043	int length,
				2044	char *output,
				2045	const char *errors)
				2046	{
				2047	Py_UNICODE p, end;
				2048
				2049	if (output == NULL) {
				2050	PyErr_BadArgument();
				2051	return -1;
				2052	}
				2053
				2054	p = s;
				2055	end = s + length;
				2056	while (p < end) {
				2057	register Py_UNICODE ch = *p++;
				2058	int decimal;
				2059
				2060	if (Py_UNICODE_ISSPACE(ch)) {
				2061	*output++ = ' ';
				2062	continue;
				2063	}
				2064	decimal = Py_UNICODE_TODECIMAL(ch);
				2065	if (decimal >= 0) {
				2066	*output++ = '0' + decimal;
				2067	continue;
				2068	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2069	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2070	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2071	continue;
				2072	}
				2073	/* All other characters are considered invalid */
				2074	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2075	PyErr_SetString(PyExc_ValueError,
				2076	"invalid decimal Unicode string");
				2077	goto onError;
				2078	}
				2079	else if (strcmp(errors, "ignore") == 0)
				2080	continue;
				2081	else if (strcmp(errors, "replace") == 0) {
				2082	*output++ = '?';
				2083	continue;
				2084	}
				2085	}
				2086	/* 0-terminate the output string */
				2087	*output++ = '\0';
				2088	return 0;
				2089
				2090	onError:
				2091	return -1;
				2092	}
				2093
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2094	/* --- Helpers ------------------------------------------------------------ */
				2095
				2096	static
				2097	int count(PyUnicodeObject *self,
				2098	int start,
				2099	int end,
				2100	PyUnicodeObject *substring)
				2101	{
				2102	int count = 0;
				2103
				2104	end -= substring->length;
				2105
				2106	while (start <= end)
				2107	if (Py_UNICODE_MATCH(self, start, substring)) {
				2108	count++;
				2109	start += substring->length;
				2110	} else
				2111	start++;
				2112
				2113	return count;
				2114	}
				2115
				2116	int PyUnicode_Count(PyObject *str,
				2117	PyObject *substr,
				2118	int start,
				2119	int end)
				2120	{
				2121	int result;
				2122
				2123	str = PyUnicode_FromObject(str);
				2124	if (str == NULL)
				2125	return -1;
				2126	substr = PyUnicode_FromObject(substr);
				2127	if (substr == NULL) {
				2128	Py_DECREF(substr);
				2129	return -1;
				2130	}
				2131
				2132	result = count((PyUnicodeObject *)str,
				2133	start, end,
				2134	(PyUnicodeObject *)substr);
				2135
				2136	Py_DECREF(str);
				2137	Py_DECREF(substr);
				2138	return result;
				2139	}
				2140
				2141	static
				2142	int findstring(PyUnicodeObject *self,
				2143	PyUnicodeObject *substring,
				2144	int start,
				2145	int end,
				2146	int direction)
				2147	{
				2148	if (start < 0)
				2149	start += self->length;
				2150	if (start < 0)
				2151	start = 0;
				2152
				2153	if (substring->length == 0)
				2154	return start;
				2155
				2156	if (end > self->length)
				2157	end = self->length;
				2158	if (end < 0)
				2159	end += self->length;
				2160	if (end < 0)
				2161	end = 0;
				2162
				2163	end -= substring->length;
				2164
				2165	if (direction < 0) {
				2166	for (; end >= start; end--)
				2167	if (Py_UNICODE_MATCH(self, end, substring))
				2168	return end;
				2169	} else {
				2170	for (; start <= end; start++)
				2171	if (Py_UNICODE_MATCH(self, start, substring))
				2172	return start;
				2173	}
				2174
				2175	return -1;
				2176	}
				2177
				2178	int PyUnicode_Find(PyObject *str,
				2179	PyObject *substr,
				2180	int start,
				2181	int end,
				2182	int direction)
				2183	{
				2184	int result;
				2185
				2186	str = PyUnicode_FromObject(str);
				2187	if (str == NULL)
				2188	return -1;
				2189	substr = PyUnicode_FromObject(substr);
				2190	if (substr == NULL) {
				2191	Py_DECREF(substr);
				2192	return -1;
				2193	}
				2194
				2195	result = findstring((PyUnicodeObject *)str,
				2196	(PyUnicodeObject *)substr,
				2197	start, end, direction);
				2198	Py_DECREF(str);
				2199	Py_DECREF(substr);
				2200	return result;
				2201	}
				2202
				2203	static
				2204	int tailmatch(PyUnicodeObject *self,
				2205	PyUnicodeObject *substring,
				2206	int start,
				2207	int end,
				2208	int direction)
				2209	{
				2210	if (start < 0)
				2211	start += self->length;
				2212	if (start < 0)
				2213	start = 0;
				2214
				2215	if (substring->length == 0)
				2216	return 1;
				2217
				2218	if (end > self->length)
				2219	end = self->length;
				2220	if (end < 0)
				2221	end += self->length;
				2222	if (end < 0)
				2223	end = 0;
				2224
				2225	end -= substring->length;
				2226	if (end < start)
				2227	return 0;
				2228
				2229	if (direction > 0) {
				2230	if (Py_UNICODE_MATCH(self, end, substring))
				2231	return 1;
				2232	} else {
				2233	if (Py_UNICODE_MATCH(self, start, substring))
				2234	return 1;
				2235	}
				2236
				2237	return 0;
				2238	}
				2239
				2240	int PyUnicode_Tailmatch(PyObject *str,
				2241	PyObject *substr,
				2242	int start,
				2243	int end,
				2244	int direction)
				2245	{
				2246	int result;
				2247
				2248	str = PyUnicode_FromObject(str);
				2249	if (str == NULL)
				2250	return -1;
				2251	substr = PyUnicode_FromObject(substr);
				2252	if (substr == NULL) {
				2253	Py_DECREF(substr);
				2254	return -1;
				2255	}
				2256
				2257	result = tailmatch((PyUnicodeObject *)str,
				2258	(PyUnicodeObject *)substr,
				2259	start, end, direction);
				2260	Py_DECREF(str);
				2261	Py_DECREF(substr);
				2262	return result;
				2263	}
				2264
				2265	static
				2266	const Py_UNICODE findchar(const Py_UNICODE s,
				2267	int size,
				2268	Py_UNICODE ch)
				2269	{
				2270	/* like wcschr, but doesn't stop at NULL characters */
				2271
				2272	while (size-- > 0) {
				2273	if (*s == ch)
				2274	return s;
				2275	s++;
				2276	}
				2277
				2278	return NULL;
				2279	}
				2280
				2281	/* Apply fixfct filter to the Unicode object self and return a
				2282	reference to the modified object */
				2283
				2284	static
				2285	PyObject fixup(PyUnicodeObject self,
				2286	int (fixfct)(PyUnicodeObject s))
				2287	{
				2288
				2289	PyUnicodeObject *u;
				2290
				2291	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2292	self->length);
				2293	if (u == NULL)
				2294	return NULL;
				2295	if (!fixfct(u)) {
				2296	/* fixfct should return TRUE if it modified the buffer. If
				2297	FALSE, return a reference to the original buffer instead
				2298	(to save space, not time) */
				2299	Py_INCREF(self);
				2300	Py_DECREF(u);
				2301	return (PyObject*) self;
				2302	}
				2303	return (PyObject*) u;
				2304	}
				2305
				2306	static
				2307	int fixupper(PyUnicodeObject *self)
				2308	{
				2309	int len = self->length;
				2310	Py_UNICODE *s = self->str;
				2311	int status = 0;
				2312
				2313	while (len-- > 0) {
				2314	register Py_UNICODE ch;
				2315
				2316	ch = Py_UNICODE_TOUPPER(*s);
				2317	if (ch != *s) {
				2318	status = 1;
				2319	*s = ch;
				2320	}
				2321	s++;
				2322	}
				2323
				2324	return status;
				2325	}
				2326
				2327	static
				2328	int fixlower(PyUnicodeObject *self)
				2329	{
				2330	int len = self->length;
				2331	Py_UNICODE *s = self->str;
				2332	int status = 0;
				2333
				2334	while (len-- > 0) {
				2335	register Py_UNICODE ch;
				2336
				2337	ch = Py_UNICODE_TOLOWER(*s);
				2338	if (ch != *s) {
				2339	status = 1;
				2340	*s = ch;
				2341	}
				2342	s++;
				2343	}
				2344
				2345	return status;
				2346	}
				2347
				2348	static
				2349	int fixswapcase(PyUnicodeObject *self)
				2350	{
				2351	int len = self->length;
				2352	Py_UNICODE *s = self->str;
				2353	int status = 0;
				2354
				2355	while (len-- > 0) {
				2356	if (Py_UNICODE_ISUPPER(*s)) {
				2357	s = Py_UNICODE_TOLOWER(s);
				2358	status = 1;
				2359	} else if (Py_UNICODE_ISLOWER(*s)) {
				2360	s = Py_UNICODE_TOUPPER(s);
				2361	status = 1;
				2362	}
				2363	s++;
				2364	}
				2365
				2366	return status;
				2367	}
				2368
				2369	static
				2370	int fixcapitalize(PyUnicodeObject *self)
				2371	{
				2372	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2373	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2374	return 1;
				2375	}
				2376	return 0;
				2377	}
				2378
				2379	static
				2380	int fixtitle(PyUnicodeObject *self)
				2381	{
				2382	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2383	register Py_UNICODE *e;
				2384	int previous_is_cased;
				2385
				2386	/* Shortcut for single character strings */
				2387	if (PyUnicode_GET_SIZE(self) == 1) {
				2388	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2389	if (*p != ch) {
				2390	*p = ch;
				2391	return 1;
				2392	}
				2393	else
				2394	return 0;
				2395	}
				2396
				2397	e = p + PyUnicode_GET_SIZE(self);
				2398	previous_is_cased = 0;
				2399	for (; p < e; p++) {
				2400	register const Py_UNICODE ch = *p;
				2401
				2402	if (previous_is_cased)
				2403	*p = Py_UNICODE_TOLOWER(ch);
				2404	else
				2405	*p = Py_UNICODE_TOTITLE(ch);
				2406
				2407	if (Py_UNICODE_ISLOWER(ch) \|\|
				2408	Py_UNICODE_ISUPPER(ch) \|\|
				2409	Py_UNICODE_ISTITLE(ch))
				2410	previous_is_cased = 1;
				2411	else
				2412	previous_is_cased = 0;
				2413	}
				2414	return 1;
				2415	}
				2416
				2417	PyObject PyUnicode_Join(PyObject separator,
				2418	PyObject *seq)
				2419	{
				2420	Py_UNICODE *sep;
				2421	int seplen;
				2422	PyUnicodeObject *res = NULL;
				2423	int reslen = 0;
				2424	Py_UNICODE *p;
				2425	int seqlen = 0;
				2426	int sz = 100;
				2427	int i;
				2428
				2429	seqlen = PySequence_Length(seq);
				2430	if (seqlen < 0 && PyErr_Occurred())
				2431	return NULL;
				2432
				2433	if (separator == NULL) {
				2434	Py_UNICODE blank = ' ';
				2435	sep = &blank;
				2436	seplen = 1;
				2437	}
				2438	else {
				2439	separator = PyUnicode_FromObject(separator);
				2440	if (separator == NULL)
				2441	return NULL;
				2442	sep = PyUnicode_AS_UNICODE(separator);
				2443	seplen = PyUnicode_GET_SIZE(separator);
				2444	}
				2445
				2446	res = _PyUnicode_New(sz);
				2447	if (res == NULL)
				2448	goto onError;
				2449	p = PyUnicode_AS_UNICODE(res);
				2450	reslen = 0;
				2451
				2452	for (i = 0; i < seqlen; i++) {
				2453	int itemlen;
				2454	PyObject *item;
				2455
				2456	item = PySequence_GetItem(seq, i);
				2457	if (item == NULL)
				2458	goto onError;
				2459	if (!PyUnicode_Check(item)) {
				2460	PyObject *v;
				2461	v = PyUnicode_FromObject(item);
				2462	Py_DECREF(item);
				2463	item = v;
				2464	if (item == NULL)
				2465	goto onError;
				2466	}
				2467	itemlen = PyUnicode_GET_SIZE(item);
				2468	while (reslen + itemlen + seplen >= sz) {
				2469	if (_PyUnicode_Resize(res, sz*2))
				2470	goto onError;
				2471	sz *= 2;
				2472	p = PyUnicode_AS_UNICODE(res) + reslen;
				2473	}
				2474	if (i > 0) {
				2475	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2476	p += seplen;
				2477	reslen += seplen;
				2478	}
				2479	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2480	p += itemlen;
				2481	reslen += itemlen;
				2482	Py_DECREF(item);
				2483	}
				2484	if (_PyUnicode_Resize(res, reslen))
				2485	goto onError;
				2486
				2487	Py_XDECREF(separator);
				2488	return (PyObject *)res;
				2489
				2490	onError:
				2491	Py_XDECREF(separator);
				2492	Py_DECREF(res);
				2493	return NULL;
				2494	}
				2495
				2496	static
				2497	PyUnicodeObject pad(PyUnicodeObject self,
				2498	int left,
				2499	int right,
				2500	Py_UNICODE fill)
				2501	{
				2502	PyUnicodeObject *u;
				2503
				2504	if (left < 0)
				2505	left = 0;
				2506	if (right < 0)
				2507	right = 0;
				2508
				2509	if (left == 0 && right == 0) {
				2510	Py_INCREF(self);
				2511	return self;
				2512	}
				2513
				2514	u = _PyUnicode_New(left + self->length + right);
				2515	if (u) {
				2516	if (left)
				2517	Py_UNICODE_FILL(u->str, fill, left);
				2518	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2519	if (right)
				2520	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2521	}
				2522
				2523	return u;
				2524	}
				2525
				2526	#define SPLIT_APPEND(data, left, right) \
				2527	str = PyUnicode_FromUnicode(data + left, right - left); \
				2528	if (!str) \
				2529	goto onError; \
				2530	if (PyList_Append(list, str)) { \
				2531	Py_DECREF(str); \
				2532	goto onError; \
				2533	} \
				2534	else \
				2535	Py_DECREF(str);
				2536
				2537	static
				2538	PyObject split_whitespace(PyUnicodeObject self,
				2539	PyObject *list,
				2540	int maxcount)
				2541	{
				2542	register int i;
				2543	register int j;
				2544	int len = self->length;
				2545	PyObject *str;
				2546
				2547	for (i = j = 0; i < len; ) {
				2548	/* find a token */
				2549	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2550	i++;
				2551	j = i;
				2552	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2553	i++;
				2554	if (j < i) {
				2555	if (maxcount-- <= 0)
				2556	break;
				2557	SPLIT_APPEND(self->str, j, i);
				2558	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2559	i++;
				2560	j = i;
				2561	}
				2562	}
				2563	if (j < len) {
				2564	SPLIT_APPEND(self->str, j, len);
				2565	}
				2566	return list;
				2567
				2568	onError:
				2569	Py_DECREF(list);
				2570	return NULL;
				2571	}
				2572
				2573	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2574	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2575	{
				2576	register int i;
				2577	register int j;
				2578	int len;
				2579	PyObject *list;
				2580	PyObject *str;
				2581	Py_UNICODE *data;
				2582
				2583	string = PyUnicode_FromObject(string);
				2584	if (string == NULL)
				2585	return NULL;
				2586	data = PyUnicode_AS_UNICODE(string);
				2587	len = PyUnicode_GET_SIZE(string);
				2588
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2589	list = PyList_New(0);
				2590	if (!list)
				2591	goto onError;
				2592
				2593	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2594	int eol;
				2595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2596	/* Find a line and append it */
				2597	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2598	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2599
				2600	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2601	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2602	if (i < len) {
				2603	if (data[i] == '\r' && i + 1 < len &&
				2604	data[i+1] == '\n')
				2605	i += 2;
				2606	else
				2607	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2608	if (keepends)
				2609	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2610	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2611	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2612	j = i;
				2613	}
				2614	if (j < len) {
				2615	SPLIT_APPEND(data, j, len);
				2616	}
				2617
				2618	Py_DECREF(string);
				2619	return list;
				2620
				2621	onError:
				2622	Py_DECREF(list);
				2623	Py_DECREF(string);
				2624	return NULL;
				2625	}
				2626
				2627	static
				2628	PyObject split_char(PyUnicodeObject self,
				2629	PyObject *list,
				2630	Py_UNICODE ch,
				2631	int maxcount)
				2632	{
				2633	register int i;
				2634	register int j;
				2635	int len = self->length;
				2636	PyObject *str;
				2637
				2638	for (i = j = 0; i < len; ) {
				2639	if (self->str[i] == ch) {
				2640	if (maxcount-- <= 0)
				2641	break;
				2642	SPLIT_APPEND(self->str, j, i);
				2643	i = j = i + 1;
				2644	} else
				2645	i++;
				2646	}
				2647	if (j <= len) {
				2648	SPLIT_APPEND(self->str, j, len);
				2649	}
				2650	return list;
				2651
				2652	onError:
				2653	Py_DECREF(list);
				2654	return NULL;
				2655	}
				2656
				2657	static
				2658	PyObject split_substring(PyUnicodeObject self,
				2659	PyObject *list,
				2660	PyUnicodeObject *substring,
				2661	int maxcount)
				2662	{
				2663	register int i;
				2664	register int j;
				2665	int len = self->length;
				2666	int sublen = substring->length;
				2667	PyObject *str;
				2668
				2669	for (i = j = 0; i < len - sublen; ) {
				2670	if (Py_UNICODE_MATCH(self, i, substring)) {
				2671	if (maxcount-- <= 0)
				2672	break;
				2673	SPLIT_APPEND(self->str, j, i);
				2674	i = j = i + sublen;
				2675	} else
				2676	i++;
				2677	}
				2678	if (j <= len) {
				2679	SPLIT_APPEND(self->str, j, len);
				2680	}
				2681	return list;
				2682
				2683	onError:
				2684	Py_DECREF(list);
				2685	return NULL;
				2686	}
				2687
				2688	#undef SPLIT_APPEND
				2689
				2690	static
				2691	PyObject split(PyUnicodeObject self,
				2692	PyUnicodeObject *substring,
				2693	int maxcount)
				2694	{
				2695	PyObject *list;
				2696
				2697	if (maxcount < 0)
				2698	maxcount = INT_MAX;
				2699
				2700	list = PyList_New(0);
				2701	if (!list)
				2702	return NULL;
				2703
				2704	if (substring == NULL)
				2705	return split_whitespace(self,list,maxcount);
				2706
				2707	else if (substring->length == 1)
				2708	return split_char(self,list,substring->str[0],maxcount);
				2709
				2710	else if (substring->length == 0) {
				2711	Py_DECREF(list);
				2712	PyErr_SetString(PyExc_ValueError, "empty separator");
				2713	return NULL;
				2714	}
				2715	else
				2716	return split_substring(self,list,substring,maxcount);
				2717	}
				2718
				2719	static
				2720	PyObject strip(PyUnicodeObject self,
				2721	int left,
				2722	int right)
				2723	{
				2724	Py_UNICODE *p = self->str;
				2725	int start = 0;
				2726	int end = self->length;
				2727
				2728	if (left)
				2729	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2730	start++;
				2731
				2732	if (right)
				2733	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2734	end--;
				2735
				2736	if (start == 0 && end == self->length) {
				2737	/* couldn't strip anything off, return original string */
				2738	Py_INCREF(self);
				2739	return (PyObject*) self;
				2740	}
				2741
				2742	return (PyObject*) PyUnicode_FromUnicode(
				2743	self->str + start,
				2744	end - start
				2745	);
				2746	}
				2747
				2748	static
				2749	PyObject replace(PyUnicodeObject self,
				2750	PyUnicodeObject *str1,
				2751	PyUnicodeObject *str2,
				2752	int maxcount)
				2753	{
				2754	PyUnicodeObject *u;
				2755
				2756	if (maxcount < 0)
				2757	maxcount = INT_MAX;
				2758
				2759	if (str1->length == 1 && str2->length == 1) {
				2760	int i;
				2761
				2762	/* replace characters */
				2763	if (!findchar(self->str, self->length, str1->str[0])) {
				2764	/* nothing to replace, return original string */
				2765	Py_INCREF(self);
				2766	u = self;
				2767	} else {
				2768	Py_UNICODE u1 = str1->str[0];
				2769	Py_UNICODE u2 = str2->str[0];
				2770
				2771	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2772	self->str,
				2773	self->length
				2774	);
				2775	if (u)
				2776	for (i = 0; i < u->length; i++)
				2777	if (u->str[i] == u1) {
				2778	if (--maxcount < 0)
				2779	break;
				2780	u->str[i] = u2;
				2781	}
				2782	}
				2783
				2784	} else {
				2785	int n, i;
				2786	Py_UNICODE *p;
				2787
				2788	/* replace strings */
				2789	n = count(self, 0, self->length, str1);
				2790	if (n > maxcount)
				2791	n = maxcount;
				2792	if (n == 0) {
				2793	/* nothing to replace, return original string */
				2794	Py_INCREF(self);
				2795	u = self;
				2796	} else {
				2797	u = _PyUnicode_New(
				2798	self->length + n * (str2->length - str1->length));
				2799	if (u) {
				2800	i = 0;
				2801	p = u->str;
				2802	while (i <= self->length - str1->length)
				2803	if (Py_UNICODE_MATCH(self, i, str1)) {
				2804	/* replace string segment */
				2805	Py_UNICODE_COPY(p, str2->str, str2->length);
				2806	p += str2->length;
				2807	i += str1->length;
				2808	if (--n <= 0) {
				2809	/* copy remaining part */
				2810	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2811	break;
				2812	}
				2813	} else
				2814	*p++ = self->str[i++];
				2815	}
				2816	}
				2817	}
				2818
				2819	return (PyObject *) u;
				2820	}
				2821
				2822	/* --- Unicode Object Methods --------------------------------------------- */
				2823
				2824	static char title__doc__[] =
				2825	"S.title() -> unicode\n\
				2826	\n\
				2827	Return a titlecased version of S, i.e. words start with title case\n\
				2828	characters, all remaining cased characters have lower case.";
				2829
				2830	static PyObject*
				2831	unicode_title(PyUnicodeObject self, PyObject args)
				2832	{
				2833	if (!PyArg_NoArgs(args))
				2834	return NULL;
				2835	return fixup(self, fixtitle);
				2836	}
				2837
				2838	static char capitalize__doc__[] =
				2839	"S.capitalize() -> unicode\n\
				2840	\n\
				2841	Return a capitalized version of S, i.e. make the first character\n\
				2842	have upper case.";
				2843
				2844	static PyObject*
				2845	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2846	{
				2847	if (!PyArg_NoArgs(args))
				2848	return NULL;
				2849	return fixup(self, fixcapitalize);
				2850	}
				2851
				2852	#if 0
				2853	static char capwords__doc__[] =
				2854	"S.capwords() -> unicode\n\
				2855	\n\
				2856	Apply .capitalize() to all words in S and return the result with\n\
				2857	normalized whitespace (all whitespace strings are replaced by ' ').";
				2858
				2859	static PyObject*
				2860	unicode_capwords(PyUnicodeObject self, PyObject args)
				2861	{
				2862	PyObject *list;
				2863	PyObject *item;
				2864	int i;
				2865
				2866	if (!PyArg_NoArgs(args))
				2867	return NULL;
				2868
				2869	/* Split into words */
				2870	list = split(self, NULL, -1);
				2871	if (!list)
				2872	return NULL;
				2873
				2874	/* Capitalize each word */
				2875	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2876	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2877	fixcapitalize);
				2878	if (item == NULL)
				2879	goto onError;
				2880	Py_DECREF(PyList_GET_ITEM(list, i));
				2881	PyList_SET_ITEM(list, i, item);
				2882	}
				2883
				2884	/* Join the words to form a new string */
				2885	item = PyUnicode_Join(NULL, list);
				2886
				2887	onError:
				2888	Py_DECREF(list);
				2889	return (PyObject *)item;
				2890	}
				2891	#endif
				2892
				2893	static char center__doc__[] =
				2894	"S.center(width) -> unicode\n\
				2895	\n\
				2896	Return S centered in a Unicode string of length width. Padding is done\n\
				2897	using spaces.";
				2898
				2899	static PyObject *
				2900	unicode_center(PyUnicodeObject self, PyObject args)
				2901	{
				2902	int marg, left;
				2903	int width;
				2904
				2905	if (!PyArg_ParseTuple(args, "i:center", &width))
				2906	return NULL;
				2907
				2908	if (self->length >= width) {
				2909	Py_INCREF(self);
				2910	return (PyObject*) self;
				2911	}
				2912
				2913	marg = width - self->length;
				2914	left = marg / 2 + (marg & width & 1);
				2915
				2916	return (PyObject*) pad(self, left, marg - left, ' ');
				2917	}
				2918
				2919	static int
				2920	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2921	{
				2922	int len1, len2;
				2923	Py_UNICODE *s1 = str1->str;
				2924	Py_UNICODE *s2 = str2->str;
				2925
				2926	len1 = str1->length;
				2927	len2 = str2->length;
				2928
				2929	while (len1 > 0 && len2 > 0) {
				2930	int cmp = (s1++) - (s2++);
				2931	if (cmp)
				2932	/* This should make Christian happy! */
				2933	return (cmp < 0) ? -1 : (cmp != 0);
				2934	len1--, len2--;
				2935	}
				2936
				2937	return (len1 < len2) ? -1 : (len1 != len2);
				2938	}
				2939
				2940	int PyUnicode_Compare(PyObject *left,
				2941	PyObject *right)
				2942	{
				2943	PyUnicodeObject u = NULL, v = NULL;
				2944	int result;
				2945
				2946	/* Coerce the two arguments */
				2947	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2948	if (u == NULL)
				2949	goto onError;
				2950	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2951	if (v == NULL)
				2952	goto onError;
				2953
				2954	/* Shortcut for emtpy or interned objects */
				2955	if (v == u) {
				2956	Py_DECREF(u);
				2957	Py_DECREF(v);
				2958	return 0;
				2959	}
				2960
				2961	result = unicode_compare(u, v);
				2962
				2963	Py_DECREF(u);
				2964	Py_DECREF(v);
				2965	return result;
				2966
				2967	onError:
				2968	Py_XDECREF(u);
				2969	Py_XDECREF(v);
				2970	return -1;
				2971	}
				2972
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2973	int PyUnicode_Contains(PyObject *container,
				2974	PyObject *element)
				2975	{
				2976	PyUnicodeObject u = NULL, v = NULL;
				2977	int result;
				2978	register const Py_UNICODE p, e;
				2979	register Py_UNICODE ch;
				2980
				2981	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2982	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2983	if (v == NULL)
				2984	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2985	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2986	if (u == NULL) {
				2987	Py_DECREF(v);
				2988	goto onError;
				2989	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2990
				2991	/* Check v in u */
				2992	if (PyUnicode_GET_SIZE(v) != 1) {
				2993	PyErr_SetString(PyExc_TypeError,
				2994	"string member test needs char left operand");
				2995	goto onError;
				2996	}
				2997	ch = *PyUnicode_AS_UNICODE(v);
				2998	p = PyUnicode_AS_UNICODE(u);
				2999	e = p + PyUnicode_GET_SIZE(u);
				3000	result = 0;
				3001	while (p < e) {
				3002	if (*p++ == ch) {
				3003	result = 1;
				3004	break;
				3005	}
				3006	}
				3007
				3008	Py_DECREF(u);
				3009	Py_DECREF(v);
				3010	return result;
				3011
				3012	onError:
				3013	Py_XDECREF(u);
				3014	Py_XDECREF(v);
				3015	return -1;
				3016	}
				3017
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3018	/* Concat to string or Unicode object giving a new Unicode object. */
				3019
				3020	PyObject PyUnicode_Concat(PyObject left,
				3021	PyObject *right)
				3022	{
				3023	PyUnicodeObject u = NULL, v = NULL, *w;
				3024
				3025	/* Coerce the two arguments */
				3026	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3027	if (u == NULL)
				3028	goto onError;
				3029	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3030	if (v == NULL)
				3031	goto onError;
				3032
				3033	/* Shortcuts */
				3034	if (v == unicode_empty) {
				3035	Py_DECREF(v);
				3036	return (PyObject *)u;
				3037	}
				3038	if (u == unicode_empty) {
				3039	Py_DECREF(u);
				3040	return (PyObject *)v;
				3041	}
				3042
				3043	/* Concat the two Unicode strings */
				3044	w = _PyUnicode_New(u->length + v->length);
				3045	if (w == NULL)
				3046	goto onError;
				3047	Py_UNICODE_COPY(w->str, u->str, u->length);
				3048	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3049
				3050	Py_DECREF(u);
				3051	Py_DECREF(v);
				3052	return (PyObject *)w;
				3053
				3054	onError:
				3055	Py_XDECREF(u);
				3056	Py_XDECREF(v);
				3057	return NULL;
				3058	}
				3059
				3060	static char count__doc__[] =
				3061	"S.count(sub[, start[, end]]) -> int\n\
				3062	\n\
				3063	Return the number of occurrences of substring sub in Unicode string\n\
				3064	S[start:end]. Optional arguments start and end are\n\
				3065	interpreted as in slice notation.";
				3066
				3067	static PyObject *
				3068	unicode_count(PyUnicodeObject self, PyObject args)
				3069	{
				3070	PyUnicodeObject *substring;
				3071	int start = 0;
				3072	int end = INT_MAX;
				3073	PyObject *result;
				3074
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3075	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3076	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3077	return NULL;
				3078
				3079	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3080	(PyObject *)substring);
				3081	if (substring == NULL)
				3082	return NULL;
				3083
				3084	if (substring->length == 0) {
				3085	Py_DECREF(substring);
				3086	return PyInt_FromLong((long) 0);
				3087	}
				3088
				3089	if (start < 0)
				3090	start += self->length;
				3091	if (start < 0)
				3092	start = 0;
				3093	if (end > self->length)
				3094	end = self->length;
				3095	if (end < 0)
				3096	end += self->length;
				3097	if (end < 0)
				3098	end = 0;
				3099
				3100	result = PyInt_FromLong((long) count(self, start, end, substring));
				3101
				3102	Py_DECREF(substring);
				3103	return result;
				3104	}
				3105
				3106	static char encode__doc__[] =
				3107	"S.encode([encoding[,errors]]) -> string\n\
				3108	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3109	Return an encoded string version of S. Default encoding is the current\n\
				3110	default string encoding. errors may be given to set a different error\n\
				3111	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3112	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3113
				3114	static PyObject *
				3115	unicode_encode(PyUnicodeObject self, PyObject args)
				3116	{
				3117	char *encoding = NULL;
				3118	char *errors = NULL;
				3119	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3120	return NULL;
				3121	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3122	}
				3123
				3124	static char expandtabs__doc__[] =
				3125	"S.expandtabs([tabsize]) -> unicode\n\
				3126	\n\
				3127	Return a copy of S where all tab characters are expanded using spaces.\n\
				3128	If tabsize is not given, a tab size of 8 characters is assumed.";
				3129
				3130	static PyObject*
				3131	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3132	{
				3133	Py_UNICODE *e;
				3134	Py_UNICODE *p;
				3135	Py_UNICODE *q;
				3136	int i, j;
				3137	PyUnicodeObject *u;
				3138	int tabsize = 8;
				3139
				3140	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3141	return NULL;
				3142
				3143	/* First pass: determine size of ouput string */
				3144	i = j = 0;
				3145	e = self->str + self->length;
				3146	for (p = self->str; p < e; p++)
				3147	if (*p == '\t') {
				3148	if (tabsize > 0)
				3149	j += tabsize - (j % tabsize);
				3150	}
				3151	else {
				3152	j++;
				3153	if (p == '\n' \|\| p == '\r') {
				3154	i += j;
				3155	j = 0;
				3156	}
				3157	}
				3158
				3159	/* Second pass: create output string and fill it */
				3160	u = _PyUnicode_New(i + j);
				3161	if (!u)
				3162	return NULL;
				3163
				3164	j = 0;
				3165	q = u->str;
				3166
				3167	for (p = self->str; p < e; p++)
				3168	if (*p == '\t') {
				3169	if (tabsize > 0) {
				3170	i = tabsize - (j % tabsize);
				3171	j += i;
				3172	while (i--)
				3173	*q++ = ' ';
				3174	}
				3175	}
				3176	else {
				3177	j++;
				3178	q++ = p;
				3179	if (p == '\n' \|\| p == '\r')
				3180	j = 0;
				3181	}
				3182
				3183	return (PyObject*) u;
				3184	}
				3185
				3186	static char find__doc__[] =
				3187	"S.find(sub [,start [,end]]) -> int\n\
				3188	\n\
				3189	Return the lowest index in S where substring sub is found,\n\
				3190	such that sub is contained within s[start,end]. Optional\n\
				3191	arguments start and end are interpreted as in slice notation.\n\
				3192	\n\
				3193	Return -1 on failure.";
				3194
				3195	static PyObject *
				3196	unicode_find(PyUnicodeObject self, PyObject args)
				3197	{
				3198	PyUnicodeObject *substring;
				3199	int start = 0;
				3200	int end = INT_MAX;
				3201	PyObject *result;
				3202
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3203	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3204	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3205	return NULL;
				3206	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3207	(PyObject *)substring);
				3208	if (substring == NULL)
				3209	return NULL;
				3210
				3211	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3212
				3213	Py_DECREF(substring);
				3214	return result;
				3215	}
				3216
				3217	static PyObject *
				3218	unicode_getitem(PyUnicodeObject *self, int index)
				3219	{
				3220	if (index < 0 \|\| index >= self->length) {
				3221	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3222	return NULL;
				3223	}
				3224
				3225	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3226	}
				3227
				3228	static long
				3229	unicode_hash(PyUnicodeObject *self)
				3230	{
				3231	long hash;
				3232	PyObject *utf8;
				3233
				3234	/* Since Unicode objects compare equal to their UTF-8 string
				3235	counterparts, they should also use the UTF-8 strings as basis
				3236	for their hash value. This is needed to assure that strings and
				3237	Unicode objects behave in the same way as dictionary
				3238	keys. Unfortunately, this costs some performance and also some
				3239	memory if the cached UTF-8 representation is not used later
				3240	on. */
				3241	if (self->hash != -1)
				3242	return self->hash;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	3243	utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3244	if (utf8 == NULL)
				3245	return -1;
				3246	hash = PyObject_Hash(utf8);
				3247	if (hash == -1)
				3248	return -1;
				3249	self->hash = hash;
				3250	return hash;
				3251	}
				3252
				3253	static char index__doc__[] =
				3254	"S.index(sub [,start [,end]]) -> int\n\
				3255	\n\
				3256	Like S.find() but raise ValueError when the substring is not found.";
				3257
				3258	static PyObject *
				3259	unicode_index(PyUnicodeObject self, PyObject args)
				3260	{
				3261	int result;
				3262	PyUnicodeObject *substring;
				3263	int start = 0;
				3264	int end = INT_MAX;
				3265
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3266	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3267	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3268	return NULL;
				3269
				3270	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3271	(PyObject *)substring);
				3272	if (substring == NULL)
				3273	return NULL;
				3274
				3275	result = findstring(self, substring, start, end, 1);
				3276
				3277	Py_DECREF(substring);
				3278	if (result < 0) {
				3279	PyErr_SetString(PyExc_ValueError, "substring not found");
				3280	return NULL;
				3281	}
				3282	return PyInt_FromLong(result);
				3283	}
				3284
				3285	static char islower__doc__[] =
				3286	"S.islower() -> int\n\
				3287	\n\
				3288	Return 1 if all cased characters in S are lowercase and there is\n\
				3289	at least one cased character in S, 0 otherwise.";
				3290
				3291	static PyObject*
				3292	unicode_islower(PyUnicodeObject self, PyObject args)
				3293	{
				3294	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3295	register const Py_UNICODE *e;
				3296	int cased;
				3297
				3298	if (!PyArg_NoArgs(args))
				3299	return NULL;
				3300
				3301	/* Shortcut for single character strings */
				3302	if (PyUnicode_GET_SIZE(self) == 1)
				3303	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3304
				3305	e = p + PyUnicode_GET_SIZE(self);
				3306	cased = 0;
				3307	for (; p < e; p++) {
				3308	register const Py_UNICODE ch = *p;
				3309
				3310	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3311	return PyInt_FromLong(0);
				3312	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3313	cased = 1;
				3314	}
				3315	return PyInt_FromLong(cased);
				3316	}
				3317
				3318	static char isupper__doc__[] =
				3319	"S.isupper() -> int\n\
				3320	\n\
				3321	Return 1 if all cased characters in S are uppercase and there is\n\
				3322	at least one cased character in S, 0 otherwise.";
				3323
				3324	static PyObject*
				3325	unicode_isupper(PyUnicodeObject self, PyObject args)
				3326	{
				3327	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3328	register const Py_UNICODE *e;
				3329	int cased;
				3330
				3331	if (!PyArg_NoArgs(args))
				3332	return NULL;
				3333
				3334	/* Shortcut for single character strings */
				3335	if (PyUnicode_GET_SIZE(self) == 1)
				3336	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3337
				3338	e = p + PyUnicode_GET_SIZE(self);
				3339	cased = 0;
				3340	for (; p < e; p++) {
				3341	register const Py_UNICODE ch = *p;
				3342
				3343	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3344	return PyInt_FromLong(0);
				3345	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3346	cased = 1;
				3347	}
				3348	return PyInt_FromLong(cased);
				3349	}
				3350
				3351	static char istitle__doc__[] =
				3352	"S.istitle() -> int\n\
				3353	\n\
				3354	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3355	may only follow uncased characters and lowercase characters only cased\n\
				3356	ones. Return 0 otherwise.";
				3357
				3358	static PyObject*
				3359	unicode_istitle(PyUnicodeObject self, PyObject args)
				3360	{
				3361	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3362	register const Py_UNICODE *e;
				3363	int cased, previous_is_cased;
				3364
				3365	if (!PyArg_NoArgs(args))
				3366	return NULL;
				3367
				3368	/* Shortcut for single character strings */
				3369	if (PyUnicode_GET_SIZE(self) == 1)
				3370	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3371	(Py_UNICODE_ISUPPER(*p) != 0));
				3372
				3373	e = p + PyUnicode_GET_SIZE(self);
				3374	cased = 0;
				3375	previous_is_cased = 0;
				3376	for (; p < e; p++) {
				3377	register const Py_UNICODE ch = *p;
				3378
				3379	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3380	if (previous_is_cased)
				3381	return PyInt_FromLong(0);
				3382	previous_is_cased = 1;
				3383	cased = 1;
				3384	}
				3385	else if (Py_UNICODE_ISLOWER(ch)) {
				3386	if (!previous_is_cased)
				3387	return PyInt_FromLong(0);
				3388	previous_is_cased = 1;
				3389	cased = 1;
				3390	}
				3391	else
				3392	previous_is_cased = 0;
				3393	}
				3394	return PyInt_FromLong(cased);
				3395	}
				3396
				3397	static char isspace__doc__[] =
				3398	"S.isspace() -> int\n\
				3399	\n\
				3400	Return 1 if there are only whitespace characters in S,\n\
				3401	0 otherwise.";
				3402
				3403	static PyObject*
				3404	unicode_isspace(PyUnicodeObject self, PyObject args)
				3405	{
				3406	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3407	register const Py_UNICODE *e;
				3408
				3409	if (!PyArg_NoArgs(args))
				3410	return NULL;
				3411
				3412	/* Shortcut for single character strings */
				3413	if (PyUnicode_GET_SIZE(self) == 1 &&
				3414	Py_UNICODE_ISSPACE(*p))
				3415	return PyInt_FromLong(1);
				3416
				3417	e = p + PyUnicode_GET_SIZE(self);
				3418	for (; p < e; p++) {
				3419	if (!Py_UNICODE_ISSPACE(*p))
				3420	return PyInt_FromLong(0);
				3421	}
				3422	return PyInt_FromLong(1);
				3423	}
				3424
				3425	static char isdecimal__doc__[] =
				3426	"S.isdecimal() -> int\n\
				3427	\n\
				3428	Return 1 if there are only decimal characters in S,\n\
				3429	0 otherwise.";
				3430
				3431	static PyObject*
				3432	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3433	{
				3434	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3435	register const Py_UNICODE *e;
				3436
				3437	if (!PyArg_NoArgs(args))
				3438	return NULL;
				3439
				3440	/* Shortcut for single character strings */
				3441	if (PyUnicode_GET_SIZE(self) == 1 &&
				3442	Py_UNICODE_ISDECIMAL(*p))
				3443	return PyInt_FromLong(1);
				3444
				3445	e = p + PyUnicode_GET_SIZE(self);
				3446	for (; p < e; p++) {
				3447	if (!Py_UNICODE_ISDECIMAL(*p))
				3448	return PyInt_FromLong(0);
				3449	}
				3450	return PyInt_FromLong(1);
				3451	}
				3452
				3453	static char isdigit__doc__[] =
				3454	"S.isdigit() -> int\n\
				3455	\n\
				3456	Return 1 if there are only digit characters in S,\n\
				3457	0 otherwise.";
				3458
				3459	static PyObject*
				3460	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3461	{
				3462	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3463	register const Py_UNICODE *e;
				3464
				3465	if (!PyArg_NoArgs(args))
				3466	return NULL;
				3467
				3468	/* Shortcut for single character strings */
				3469	if (PyUnicode_GET_SIZE(self) == 1 &&
				3470	Py_UNICODE_ISDIGIT(*p))
				3471	return PyInt_FromLong(1);
				3472
				3473	e = p + PyUnicode_GET_SIZE(self);
				3474	for (; p < e; p++) {
				3475	if (!Py_UNICODE_ISDIGIT(*p))
				3476	return PyInt_FromLong(0);
				3477	}
				3478	return PyInt_FromLong(1);
				3479	}
				3480
				3481	static char isnumeric__doc__[] =
				3482	"S.isnumeric() -> int\n\
				3483	\n\
				3484	Return 1 if there are only numeric characters in S,\n\
				3485	0 otherwise.";
				3486
				3487	static PyObject*
				3488	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3489	{
				3490	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3491	register const Py_UNICODE *e;
				3492
				3493	if (!PyArg_NoArgs(args))
				3494	return NULL;
				3495
				3496	/* Shortcut for single character strings */
				3497	if (PyUnicode_GET_SIZE(self) == 1 &&
				3498	Py_UNICODE_ISNUMERIC(*p))
				3499	return PyInt_FromLong(1);
				3500
				3501	e = p + PyUnicode_GET_SIZE(self);
				3502	for (; p < e; p++) {
				3503	if (!Py_UNICODE_ISNUMERIC(*p))
				3504	return PyInt_FromLong(0);
				3505	}
				3506	return PyInt_FromLong(1);
				3507	}
				3508
				3509	static char join__doc__[] =
				3510	"S.join(sequence) -> unicode\n\
				3511	\n\
				3512	Return a string which is the concatenation of the strings in the\n\
				3513	sequence. The separator between elements is S.";
				3514
				3515	static PyObject*
				3516	unicode_join(PyUnicodeObject self, PyObject args)
				3517	{
				3518	PyObject *data;
				3519	if (!PyArg_ParseTuple(args, "O:join", &data))
				3520	return NULL;
				3521
				3522	return PyUnicode_Join((PyObject *)self, data);
				3523	}
				3524
				3525	static int
				3526	unicode_length(PyUnicodeObject *self)
				3527	{
				3528	return self->length;
				3529	}
				3530
				3531	static char ljust__doc__[] =
				3532	"S.ljust(width) -> unicode\n\
				3533	\n\
				3534	Return S left justified in a Unicode string of length width. Padding is\n\
				3535	done using spaces.";
				3536
				3537	static PyObject *
				3538	unicode_ljust(PyUnicodeObject self, PyObject args)
				3539	{
				3540	int width;
				3541	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3542	return NULL;
				3543
				3544	if (self->length >= width) {
				3545	Py_INCREF(self);
				3546	return (PyObject*) self;
				3547	}
				3548
				3549	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3550	}
				3551
				3552	static char lower__doc__[] =
				3553	"S.lower() -> unicode\n\
				3554	\n\
				3555	Return a copy of the string S converted to lowercase.";
				3556
				3557	static PyObject*
				3558	unicode_lower(PyUnicodeObject self, PyObject args)
				3559	{
				3560	if (!PyArg_NoArgs(args))
				3561	return NULL;
				3562	return fixup(self, fixlower);
				3563	}
				3564
				3565	static char lstrip__doc__[] =
				3566	"S.lstrip() -> unicode\n\
				3567	\n\
				3568	Return a copy of the string S with leading whitespace removed.";
				3569
				3570	static PyObject *
				3571	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3572	{
				3573	if (!PyArg_NoArgs(args))
				3574	return NULL;
				3575	return strip(self, 1, 0);
				3576	}
				3577
				3578	static PyObject*
				3579	unicode_repeat(PyUnicodeObject *str, int len)
				3580	{
				3581	PyUnicodeObject *u;
				3582	Py_UNICODE *p;
				3583
				3584	if (len < 0)
				3585	len = 0;
				3586
				3587	if (len == 1) {
				3588	/* no repeat, return original string */
				3589	Py_INCREF(str);
				3590	return (PyObject*) str;
				3591	}
				3592
				3593	u = _PyUnicode_New(len * str->length);
				3594	if (!u)
				3595	return NULL;
				3596
				3597	p = u->str;
				3598
				3599	while (len-- > 0) {
				3600	Py_UNICODE_COPY(p, str->str, str->length);
				3601	p += str->length;
				3602	}
				3603
				3604	return (PyObject*) u;
				3605	}
				3606
				3607	PyObject PyUnicode_Replace(PyObject obj,
				3608	PyObject *subobj,
				3609	PyObject *replobj,
				3610	int maxcount)
				3611	{
				3612	PyObject *self;
				3613	PyObject *str1;
				3614	PyObject *str2;
				3615	PyObject *result;
				3616
				3617	self = PyUnicode_FromObject(obj);
				3618	if (self == NULL)
				3619	return NULL;
				3620	str1 = PyUnicode_FromObject(subobj);
				3621	if (str1 == NULL) {
				3622	Py_DECREF(self);
				3623	return NULL;
				3624	}
				3625	str2 = PyUnicode_FromObject(replobj);
				3626	if (str2 == NULL) {
				3627	Py_DECREF(self);
				3628	Py_DECREF(str1);
				3629	return NULL;
				3630	}
				3631	result = replace((PyUnicodeObject *)self,
				3632	(PyUnicodeObject *)str1,
				3633	(PyUnicodeObject *)str2,
				3634	maxcount);
				3635	Py_DECREF(self);
				3636	Py_DECREF(str1);
				3637	Py_DECREF(str2);
				3638	return result;
				3639	}
				3640
				3641	static char replace__doc__[] =
				3642	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3643	\n\
				3644	Return a copy of S with all occurrences of substring\n\
				3645	old replaced by new. If the optional argument maxsplit is\n\
				3646	given, only the first maxsplit occurrences are replaced.";
				3647
				3648	static PyObject*
				3649	unicode_replace(PyUnicodeObject self, PyObject args)
				3650	{
				3651	PyUnicodeObject *str1;
				3652	PyUnicodeObject *str2;
				3653	int maxcount = -1;
				3654	PyObject *result;
				3655
				3656	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3657	return NULL;
				3658	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3659	if (str1 == NULL)
				3660	return NULL;
				3661	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3662	if (str2 == NULL)
				3663	return NULL;
				3664
				3665	result = replace(self, str1, str2, maxcount);
				3666
				3667	Py_DECREF(str1);
				3668	Py_DECREF(str2);
				3669	return result;
				3670	}
				3671
				3672	static
				3673	PyObject unicode_repr(PyObject unicode)
				3674	{
				3675	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3676	PyUnicode_GET_SIZE(unicode),
				3677	1);
				3678	}
				3679
				3680	static char rfind__doc__[] =
				3681	"S.rfind(sub [,start [,end]]) -> int\n\
				3682	\n\
				3683	Return the highest index in S where substring sub is found,\n\
				3684	such that sub is contained within s[start,end]. Optional\n\
				3685	arguments start and end are interpreted as in slice notation.\n\
				3686	\n\
				3687	Return -1 on failure.";
				3688
				3689	static PyObject *
				3690	unicode_rfind(PyUnicodeObject self, PyObject args)
				3691	{
				3692	PyUnicodeObject *substring;
				3693	int start = 0;
				3694	int end = INT_MAX;
				3695	PyObject *result;
				3696
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3697	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				3698	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3699	return NULL;
				3700	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3701	(PyObject *)substring);
				3702	if (substring == NULL)
				3703	return NULL;
				3704
				3705	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3706
				3707	Py_DECREF(substring);
				3708	return result;
				3709	}
				3710
				3711	static char rindex__doc__[] =
				3712	"S.rindex(sub [,start [,end]]) -> int\n\
				3713	\n\
				3714	Like S.rfind() but raise ValueError when the substring is not found.";
				3715
				3716	static PyObject *
				3717	unicode_rindex(PyUnicodeObject self, PyObject args)
				3718	{
				3719	int result;
				3720	PyUnicodeObject *substring;
				3721	int start = 0;
				3722	int end = INT_MAX;
				3723
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3724	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				3725	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3726	return NULL;
				3727	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3728	(PyObject *)substring);
				3729	if (substring == NULL)
				3730	return NULL;
				3731
				3732	result = findstring(self, substring, start, end, -1);
				3733
				3734	Py_DECREF(substring);
				3735	if (result < 0) {
				3736	PyErr_SetString(PyExc_ValueError, "substring not found");
				3737	return NULL;
				3738	}
				3739	return PyInt_FromLong(result);
				3740	}
				3741
				3742	static char rjust__doc__[] =
				3743	"S.rjust(width) -> unicode\n\
				3744	\n\
				3745	Return S right justified in a Unicode string of length width. Padding is\n\
				3746	done using spaces.";
				3747
				3748	static PyObject *
				3749	unicode_rjust(PyUnicodeObject self, PyObject args)
				3750	{
				3751	int width;
				3752	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3753	return NULL;
				3754
				3755	if (self->length >= width) {
				3756	Py_INCREF(self);
				3757	return (PyObject*) self;
				3758	}
				3759
				3760	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3761	}
				3762
				3763	static char rstrip__doc__[] =
				3764	"S.rstrip() -> unicode\n\
				3765	\n\
				3766	Return a copy of the string S with trailing whitespace removed.";
				3767
				3768	static PyObject *
				3769	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3770	{
				3771	if (!PyArg_NoArgs(args))
				3772	return NULL;
				3773	return strip(self, 0, 1);
				3774	}
				3775
				3776	static PyObject*
				3777	unicode_slice(PyUnicodeObject *self, int start, int end)
				3778	{
				3779	/* standard clamping */
				3780	if (start < 0)
				3781	start = 0;
				3782	if (end < 0)
				3783	end = 0;
				3784	if (end > self->length)
				3785	end = self->length;
				3786	if (start == 0 && end == self->length) {
				3787	/* full slice, return original string */
				3788	Py_INCREF(self);
				3789	return (PyObject*) self;
				3790	}
				3791	if (start > end)
				3792	start = end;
				3793	/* copy slice */
				3794	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3795	end - start);
				3796	}
				3797
				3798	PyObject PyUnicode_Split(PyObject s,
				3799	PyObject *sep,
				3800	int maxsplit)
				3801	{
				3802	PyObject *result;
				3803
				3804	s = PyUnicode_FromObject(s);
				3805	if (s == NULL)
				3806	return NULL;
				3807	if (sep != NULL) {
				3808	sep = PyUnicode_FromObject(sep);
				3809	if (sep == NULL) {
				3810	Py_DECREF(s);
				3811	return NULL;
				3812	}
				3813	}
				3814
				3815	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3816
				3817	Py_DECREF(s);
				3818	Py_XDECREF(sep);
				3819	return result;
				3820	}
				3821
				3822	static char split__doc__[] =
				3823	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3824	\n\
				3825	Return a list of the words in S, using sep as the\n\
				3826	delimiter string. If maxsplit is given, at most maxsplit\n\
				3827	splits are done. If sep is not specified, any whitespace string\n\
				3828	is a separator.";
				3829
				3830	static PyObject*
				3831	unicode_split(PyUnicodeObject self, PyObject args)
				3832	{
				3833	PyObject *substring = Py_None;
				3834	int maxcount = -1;
				3835
				3836	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3837	return NULL;
				3838
				3839	if (substring == Py_None)
				3840	return split(self, NULL, maxcount);
				3841	else if (PyUnicode_Check(substring))
				3842	return split(self, (PyUnicodeObject *)substring, maxcount);
				3843	else
				3844	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3845	}
				3846
				3847	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3848	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3849	\n\
				3850	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3851	Line breaks are not included in the resulting list unless keepends\n\
				3852	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3853
				3854	static PyObject*
				3855	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3856	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3857	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3858
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3859	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3860	return NULL;
				3861
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3862	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3863	}
				3864
				3865	static
				3866	PyObject unicode_str(PyUnicodeObject self)
				3867	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3868	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3869	}
				3870
				3871	static char strip__doc__[] =
				3872	"S.strip() -> unicode\n\
				3873	\n\
				3874	Return a copy of S with leading and trailing whitespace removed.";
				3875
				3876	static PyObject *
				3877	unicode_strip(PyUnicodeObject self, PyObject args)
				3878	{
				3879	if (!PyArg_NoArgs(args))
				3880	return NULL;
				3881	return strip(self, 1, 1);
				3882	}
				3883
				3884	static char swapcase__doc__[] =
				3885	"S.swapcase() -> unicode\n\
				3886	\n\
				3887	Return a copy of S with uppercase characters converted to lowercase\n\
				3888	and vice versa.";
				3889
				3890	static PyObject*
				3891	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3892	{
				3893	if (!PyArg_NoArgs(args))
				3894	return NULL;
				3895	return fixup(self, fixswapcase);
				3896	}
				3897
				3898	static char translate__doc__[] =
				3899	"S.translate(table) -> unicode\n\
				3900	\n\
				3901	Return a copy of the string S, where all characters have been mapped\n\
				3902	through the given translation table, which must be a mapping of\n\
				3903	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3904	are left untouched. Characters mapped to None are deleted.";
				3905
				3906	static PyObject*
				3907	unicode_translate(PyUnicodeObject self, PyObject args)
				3908	{
				3909	PyObject *table;
				3910
				3911	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3912	return NULL;
				3913	return PyUnicode_TranslateCharmap(self->str,
				3914	self->length,
				3915	table,
				3916	"ignore");
				3917	}
				3918
				3919	static char upper__doc__[] =
				3920	"S.upper() -> unicode\n\
				3921	\n\
				3922	Return a copy of S converted to uppercase.";
				3923
				3924	static PyObject*
				3925	unicode_upper(PyUnicodeObject self, PyObject args)
				3926	{
				3927	if (!PyArg_NoArgs(args))
				3928	return NULL;
				3929	return fixup(self, fixupper);
				3930	}
				3931
				3932	#if 0
				3933	static char zfill__doc__[] =
				3934	"S.zfill(width) -> unicode\n\
				3935	\n\
				3936	Pad a numeric string x with zeros on the left, to fill a field\n\
				3937	of the specified width. The string x is never truncated.";
				3938
				3939	static PyObject *
				3940	unicode_zfill(PyUnicodeObject self, PyObject args)
				3941	{
				3942	int fill;
				3943	PyUnicodeObject *u;
				3944
				3945	int width;
				3946	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				3947	return NULL;
				3948
				3949	if (self->length >= width) {
				3950	Py_INCREF(self);
				3951	return (PyObject*) self;
				3952	}
				3953
				3954	fill = width - self->length;
				3955
				3956	u = pad(self, fill, 0, '0');
				3957
				3958	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				3959	/* move sign to beginning of string */
				3960	u->str[0] = u->str[fill];
				3961	u->str[fill] = '0';
				3962	}
				3963
				3964	return (PyObject*) u;
				3965	}
				3966	#endif
				3967
				3968	#if 0
				3969	static PyObject*
				3970	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				3971	{
				3972	if (!PyArg_NoArgs(args))
				3973	return NULL;
				3974	return PyInt_FromLong(unicode_freelist_size);
				3975	}
				3976	#endif
				3977
				3978	static char startswith__doc__[] =
				3979	"S.startswith(prefix[, start[, end]]) -> int\n\
				3980	\n\
				3981	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3982	optional start, test S beginning at that position. With optional end, stop\n\
				3983	comparing S at that position.";
				3984
				3985	static PyObject *
				3986	unicode_startswith(PyUnicodeObject *self,
				3987	PyObject *args)
				3988	{
				3989	PyUnicodeObject *substring;
				3990	int start = 0;
				3991	int end = INT_MAX;
				3992	PyObject *result;
				3993
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3994	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				3995	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3996	return NULL;
				3997	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3998	(PyObject *)substring);
				3999	if (substring == NULL)
				4000	return NULL;
				4001
				4002	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4003
				4004	Py_DECREF(substring);
				4005	return result;
				4006	}
				4007
				4008
				4009	static char endswith__doc__[] =
				4010	"S.endswith(suffix[, start[, end]]) -> int\n\
				4011	\n\
				4012	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4013	optional start, test S beginning at that position. With optional end, stop\n\
				4014	comparing S at that position.";
				4015
				4016	static PyObject *
				4017	unicode_endswith(PyUnicodeObject *self,
				4018	PyObject *args)
				4019	{
				4020	PyUnicodeObject *substring;
				4021	int start = 0;
				4022	int end = INT_MAX;
				4023	PyObject *result;
				4024
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4025	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4026	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4027	return NULL;
				4028	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4029	(PyObject *)substring);
				4030	if (substring == NULL)
				4031	return NULL;
				4032
				4033	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4034
				4035	Py_DECREF(substring);
				4036	return result;
				4037	}
				4038
				4039
				4040	static PyMethodDef unicode_methods[] = {
				4041
				4042	/* Order is according to common usage: often used methods should
				4043	appear first, since lookup is done sequentially. */
				4044
				4045	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4046	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4047	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4048	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4049	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4050	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4051	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4052	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4053	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4054	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4055	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4056	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4057	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4058	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4059	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4060	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4061	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4062	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4063	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4064	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4065	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4066	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4067	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4068	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4069	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4070	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4071	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4072	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4073	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4074	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4075	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4076	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4077	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				4078	#if 0
				4079	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4080	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4081	#endif
				4082
				4083	#if 0
				4084	/* This one is just used for debugging the implementation. */
				4085	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4086	#endif
				4087
				4088	{NULL, NULL}
				4089	};
				4090
				4091	static PyObject *
				4092	unicode_getattr(PyUnicodeObject self, char name)
				4093	{
				4094	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4095	}
				4096
				4097	static PySequenceMethods unicode_as_sequence = {
				4098	(inquiry) unicode_length, /* sq_length */
				4099	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4100	(intargfunc) unicode_repeat, /* sq_repeat */
				4101	(intargfunc) unicode_getitem, /* sq_item */
				4102	(intintargfunc) unicode_slice, /* sq_slice */
				4103	0, /* sq_ass_item */
				4104	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4105	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4106	};
				4107
				4108	static int
				4109	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4110	int index,
				4111	const void **ptr)
				4112	{
				4113	if (index != 0) {
				4114	PyErr_SetString(PyExc_SystemError,
				4115	"accessing non-existent unicode segment");
				4116	return -1;
				4117	}
				4118	ptr = (void ) self->str;
				4119	return PyUnicode_GET_DATA_SIZE(self);
				4120	}
				4121
				4122	static int
				4123	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4124	const void **ptr)
				4125	{
				4126	PyErr_SetString(PyExc_TypeError,
				4127	"cannot use unicode as modifyable buffer");
				4128	return -1;
				4129	}
				4130
				4131	static int
				4132	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4133	int *lenp)
				4134	{
				4135	if (lenp)
				4136	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4137	return 1;
				4138	}
				4139
				4140	static int
				4141	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4142	int index,
				4143	const void **ptr)
				4144	{
				4145	PyObject *str;
				4146
				4147	if (index != 0) {
				4148	PyErr_SetString(PyExc_SystemError,
				4149	"accessing non-existent unicode segment");
				4150	return -1;
				4151	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4152	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4153	if (str == NULL)
				4154	return -1;
				4155	ptr = (void ) PyString_AS_STRING(str);
				4156	return PyString_GET_SIZE(str);
				4157	}
				4158
				4159	/* Helpers for PyUnicode_Format() */
				4160
				4161	static PyObject *
				4162	getnextarg(args, arglen, p_argidx)
				4163	PyObject *args;
				4164	int arglen;
				4165	int *p_argidx;
				4166	{
				4167	int argidx = *p_argidx;
				4168	if (argidx < arglen) {
				4169	(*p_argidx)++;
				4170	if (arglen < 0)
				4171	return args;
				4172	else
				4173	return PyTuple_GetItem(args, argidx);
				4174	}
				4175	PyErr_SetString(PyExc_TypeError,
				4176	"not enough arguments for format string");
				4177	return NULL;
				4178	}
				4179
				/* Bit flags collected while parsing a format specifier; each one is
				   set by the flag character noted beside it. */
				#define F_LJUST 0x01    /* '-' */
				#define F_SIGN  0x02    /* '+' */
				#define F_BLANK 0x04    /* ' ' */
				#define F_ALT   0x08    /* '#' */
				#define F_ZERO  0x10    /* '0' */
				4185
				4186	static
				4187	#ifdef HAVE_STDARG_PROTOTYPES
				4188	int usprintf(register Py_UNICODE buffer, char format, ...)
				4189	#else
				4190	int usprintf(va_alist) va_dcl
				4191	#endif
				4192	{
				4193	register int i;
				4194	int len;
				4195	va_list va;
				4196	char *charbuffer;
				4197	#ifdef HAVE_STDARG_PROTOTYPES
				4198	va_start(va, format);
				4199	#else
				4200	Py_UNICODE *args;
				4201	char *format;
				4202
				4203	va_start(va);
				4204	buffer = va_arg(va, Py_UNICODE *);
				4205	format = va_arg(va, char *);
				4206	#endif
				4207
				4208	/* First, format the string as char array, then expand to Py_UNICODE
				4209	array. */
				4210	charbuffer = (char *)buffer;
				4211	len = vsprintf(charbuffer, format, va);
				4212	for (i = len - 1; i >= 0; i--)
				4213	buffer[i] = (Py_UNICODE) charbuffer[i];
				4214
				4215	va_end(va);
				4216	return len;
				4217	}
				4218
				4219	static int
				4220	formatfloat(Py_UNICODE *buf,
				4221	int flags,
				4222	int prec,
				4223	int type,
				4224	PyObject *v)
				4225	{
				4226	char fmt[20];
				4227	double x;
				4228
				4229	x = PyFloat_AsDouble(v);
				4230	if (x == -1.0 && PyErr_Occurred())
				4231	return -1;
				4232	if (prec < 0)
				4233	prec = 6;
				4234	if (prec > 50)
				4235	prec = 50; /* Arbitrary limitation */
				4236	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4237	type = 'g';
				4238	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4239	return usprintf(buf, fmt, x);
				4240	}
				4241
				4242	static int
				4243	formatint(Py_UNICODE *buf,
				4244	int flags,
				4245	int prec,
				4246	int type,
				4247	PyObject *v)
				4248	{
				4249	char fmt[20];
				4250	long x;
				4251
				4252	x = PyInt_AsLong(v);
				4253	if (x == -1 && PyErr_Occurred())
				4254	return -1;
				4255	if (prec < 0)
				4256	prec = 1;
				4257	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4258	return usprintf(buf, fmt, x);
				4259	}
				4260
				4261	static int
				4262	formatchar(Py_UNICODE *buf,
				4263	PyObject *v)
				4264	{
				4265	if (PyUnicode_Check(v))
				4266	buf[0] = PyUnicode_AS_UNICODE(v)[0];
				4267
				4268	else if (PyString_Check(v))
				4269	buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
				4270
				4271	else {
				4272	/* Integer input truncated to a character */
				4273	long x;
				4274	x = PyInt_AsLong(v);
				4275	if (x == -1 && PyErr_Occurred())
				4276	return -1;
				4277	buf[0] = (char) x;
				4278	}
				4279	buf[1] = '\0';
				4280	return 1;
				4281	}
				4282
				4283	PyObject PyUnicode_Format(PyObject format,
				4284	PyObject *args)
				4285	{
				4286	Py_UNICODE fmt, res;
				4287	int fmtcnt, rescnt, reslen, arglen, argidx;
				4288	int args_owned = 0;
				4289	PyUnicodeObject *result = NULL;
				4290	PyObject *dict = NULL;
				4291	PyObject *uformat;
				4292
				4293	if (format == NULL \|\| args == NULL) {
				4294	PyErr_BadInternalCall();
				4295	return NULL;
				4296	}
				4297	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4298	if (uformat == NULL)
				4299	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4300	fmt = PyUnicode_AS_UNICODE(uformat);
				4301	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4302
				4303	reslen = rescnt = fmtcnt + 100;
				4304	result = _PyUnicode_New(reslen);
				4305	if (result == NULL)
				4306	goto onError;
				4307	res = PyUnicode_AS_UNICODE(result);
				4308
				4309	if (PyTuple_Check(args)) {
				4310	arglen = PyTuple_Size(args);
				4311	argidx = 0;
				4312	}
				4313	else {
				4314	arglen = -1;
				4315	argidx = -2;
				4316	}
				4317	if (args->ob_type->tp_as_mapping)
				4318	dict = args;
				4319
				4320	while (--fmtcnt >= 0) {
				4321	if (*fmt != '%') {
				4322	if (--rescnt < 0) {
				4323	rescnt = fmtcnt + 100;
				4324	reslen += rescnt;
				4325	if (_PyUnicode_Resize(result, reslen) < 0)
				4326	return NULL;
				4327	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4328	--rescnt;
				4329	}
				4330	res++ = fmt++;
				4331	}
				4332	else {
				4333	/* Got a format specifier */
				4334	int flags = 0;
				4335	int width = -1;
				4336	int prec = -1;
				4337	int size = 0;
				4338	Py_UNICODE c = '\0';
				4339	Py_UNICODE fill;
				4340	PyObject *v = NULL;
				4341	PyObject *temp = NULL;
				4342	Py_UNICODE *buf;
				4343	Py_UNICODE sign;
				4344	int len;
				4345	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4346
				4347	fmt++;
				4348	if (*fmt == '(') {
				4349	Py_UNICODE *keystart;
				4350	int keylen;
				4351	PyObject *key;
				4352	int pcount = 1;
				4353
				4354	if (dict == NULL) {
				4355	PyErr_SetString(PyExc_TypeError,
				4356	"format requires a mapping");
				4357	goto onError;
				4358	}
				4359	++fmt;
				4360	--fmtcnt;
				4361	keystart = fmt;
				4362	/* Skip over balanced parentheses */
				4363	while (pcount > 0 && --fmtcnt >= 0) {
				4364	if (*fmt == ')')
				4365	--pcount;
				4366	else if (*fmt == '(')
				4367	++pcount;
				4368	fmt++;
				4369	}
				4370	keylen = fmt - keystart - 1;
				4371	if (fmtcnt < 0 \|\| pcount > 0) {
				4372	PyErr_SetString(PyExc_ValueError,
				4373	"incomplete format key");
				4374	goto onError;
				4375	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4376	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4377	then looked up since Python uses strings to hold
				4378	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4379	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4380	key = PyUnicode_EncodeUTF8(keystart,
				4381	keylen,
				4382	NULL);
				4383	if (key == NULL)
				4384	goto onError;
				4385	if (args_owned) {
				4386	Py_DECREF(args);
				4387	args_owned = 0;
				4388	}
				4389	args = PyObject_GetItem(dict, key);
				4390	Py_DECREF(key);
				4391	if (args == NULL) {
				4392	goto onError;
				4393	}
				4394	args_owned = 1;
				4395	arglen = -1;
				4396	argidx = -2;
				4397	}
				4398	while (--fmtcnt >= 0) {
				4399	switch (c = *fmt++) {
				4400	case '-': flags \|= F_LJUST; continue;
				4401	case '+': flags \|= F_SIGN; continue;
				4402	case ' ': flags \|= F_BLANK; continue;
				4403	case '#': flags \|= F_ALT; continue;
				4404	case '0': flags \|= F_ZERO; continue;
				4405	}
				4406	break;
				4407	}
				4408	if (c == '*') {
				4409	v = getnextarg(args, arglen, &argidx);
				4410	if (v == NULL)
				4411	goto onError;
				4412	if (!PyInt_Check(v)) {
				4413	PyErr_SetString(PyExc_TypeError,
				4414	"* wants int");
				4415	goto onError;
				4416	}
				4417	width = PyInt_AsLong(v);
				4418	if (width < 0) {
				4419	flags \|= F_LJUST;
				4420	width = -width;
				4421	}
				4422	if (--fmtcnt >= 0)
				4423	c = *fmt++;
				4424	}
				4425	else if (c >= '0' && c <= '9') {
				4426	width = c - '0';
				4427	while (--fmtcnt >= 0) {
				4428	c = *fmt++;
				4429	if (c < '0' \|\| c > '9')
				4430	break;
				4431	if ((width*10) / 10 != width) {
				4432	PyErr_SetString(PyExc_ValueError,
				4433	"width too big");
				4434	goto onError;
				4435	}
				4436	width = width*10 + (c - '0');
				4437	}
				4438	}
				4439	if (c == '.') {
				4440	prec = 0;
				4441	if (--fmtcnt >= 0)
				4442	c = *fmt++;
				4443	if (c == '*') {
				4444	v = getnextarg(args, arglen, &argidx);
				4445	if (v == NULL)
				4446	goto onError;
				4447	if (!PyInt_Check(v)) {
				4448	PyErr_SetString(PyExc_TypeError,
				4449	"* wants int");
				4450	goto onError;
				4451	}
				4452	prec = PyInt_AsLong(v);
				4453	if (prec < 0)
				4454	prec = 0;
				4455	if (--fmtcnt >= 0)
				4456	c = *fmt++;
				4457	}
				4458	else if (c >= '0' && c <= '9') {
				4459	prec = c - '0';
				4460	while (--fmtcnt >= 0) {
				4461	c = Py_CHARMASK(*fmt++);
				4462	if (c < '0' \|\| c > '9')
				4463	break;
				4464	if ((prec*10) / 10 != prec) {
				4465	PyErr_SetString(PyExc_ValueError,
				4466	"prec too big");
				4467	goto onError;
				4468	}
				4469	prec = prec*10 + (c - '0');
				4470	}
				4471	}
				4472	} /* prec */
				4473	if (fmtcnt >= 0) {
				4474	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4475	size = c;
				4476	if (--fmtcnt >= 0)
				4477	c = *fmt++;
				4478	}
				4479	}
				4480	if (fmtcnt < 0) {
				4481	PyErr_SetString(PyExc_ValueError,
				4482	"incomplete format");
				4483	goto onError;
				4484	}
				4485	if (c != '%') {
				4486	v = getnextarg(args, arglen, &argidx);
				4487	if (v == NULL)
				4488	goto onError;
				4489	}
				4490	sign = 0;
				4491	fill = ' ';
				4492	switch (c) {
				4493
				4494	case '%':
				4495	buf = tmpbuf;
				4496	buf[0] = '%';
				4497	len = 1;
				4498	break;
				4499
				4500	case 's':
				4501	case 'r':
				4502	if (PyUnicode_Check(v) && c == 's') {
				4503	temp = v;
				4504	Py_INCREF(temp);
				4505	}
				4506	else {
				4507	PyObject *unicode;
				4508	if (c == 's')
				4509	temp = PyObject_Str(v);
				4510	else
				4511	temp = PyObject_Repr(v);
				4512	if (temp == NULL)
				4513	goto onError;
				4514	if (!PyString_Check(temp)) {
				4515	/* XXX Note: this should never happen, since
				4516	PyObject_Repr() and PyObject_Str() assure
				4517	this */
				4518	Py_DECREF(temp);
				4519	PyErr_SetString(PyExc_TypeError,
				4520	"%s argument has non-string str()");
				4521	goto onError;
				4522	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4523	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4524	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4525	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4526	"strict");
				4527	Py_DECREF(temp);
				4528	temp = unicode;
				4529	if (temp == NULL)
				4530	goto onError;
				4531	}
				4532	buf = PyUnicode_AS_UNICODE(temp);
				4533	len = PyUnicode_GET_SIZE(temp);
				4534	if (prec >= 0 && len > prec)
				4535	len = prec;
				4536	break;
				4537
				4538	case 'i':
				4539	case 'd':
				4540	case 'u':
				4541	case 'o':
				4542	case 'x':
				4543	case 'X':
				4544	if (c == 'i')
				4545	c = 'd';
				4546	buf = tmpbuf;
				4547	len = formatint(buf, flags, prec, c, v);
				4548	if (len < 0)
				4549	goto onError;
				4550	sign = (c == 'd');
				4551	if (flags & F_ZERO) {
				4552	fill = '0';
				4553	if ((flags&F_ALT) &&
				4554	(c == 'x' \|\| c == 'X') &&
				4555	buf[0] == '0' && buf[1] == c) {
				4556	res++ = buf++;
				4557	res++ = buf++;
				4558	rescnt -= 2;
				4559	len -= 2;
				4560	width -= 2;
				4561	if (width < 0)
				4562	width = 0;
				4563	}
				4564	}
				4565	break;
				4566
				4567	case 'e':
				4568	case 'E':
				4569	case 'f':
				4570	case 'g':
				4571	case 'G':
				4572	buf = tmpbuf;
				4573	len = formatfloat(buf, flags, prec, c, v);
				4574	if (len < 0)
				4575	goto onError;
				4576	sign = 1;
				4577	if (flags&F_ZERO)
				4578	fill = '0';
				4579	break;
				4580
				4581	case 'c':
				4582	buf = tmpbuf;
				4583	len = formatchar(buf, v);
				4584	if (len < 0)
				4585	goto onError;
				4586	break;
				4587
				4588	default:
				4589	PyErr_Format(PyExc_ValueError,
				4590	"unsupported format character '%c' (0x%x)",
				4591	c, c);
				4592	goto onError;
				4593	}
				4594	if (sign) {
				4595	if (buf == '-' \|\| buf == '+') {
				4596	sign = *buf++;
				4597	len--;
				4598	}
				4599	else if (flags & F_SIGN)
				4600	sign = '+';
				4601	else if (flags & F_BLANK)
				4602	sign = ' ';
				4603	else
				4604	sign = 0;
				4605	}
				4606	if (width < len)
				4607	width = len;
				4608	if (rescnt < width + (sign != 0)) {
				4609	reslen -= rescnt;
				4610	rescnt = width + fmtcnt + 100;
				4611	reslen += rescnt;
				4612	if (_PyUnicode_Resize(result, reslen) < 0)
				4613	return NULL;
				4614	res = PyUnicode_AS_UNICODE(result)
				4615	+ reslen - rescnt;
				4616	}
				4617	if (sign) {
				4618	if (fill != ' ')
				4619	*res++ = sign;
				4620	rescnt--;
				4621	if (width > len)
				4622	width--;
				4623	}
				4624	if (width > len && !(flags & F_LJUST)) {
				4625	do {
				4626	--rescnt;
				4627	*res++ = fill;
				4628	} while (--width > len);
				4629	}
				4630	if (sign && fill == ' ')
				4631	*res++ = sign;
				4632	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4633	res += len;
				4634	rescnt -= len;
				4635	while (--width >= len) {
				4636	--rescnt;
				4637	*res++ = ' ';
				4638	}
				4639	if (dict && (argidx < arglen) && c != '%') {
				4640	PyErr_SetString(PyExc_TypeError,
				4641	"not all arguments converted");
				4642	goto onError;
				4643	}
				4644	Py_XDECREF(temp);
				4645	} /* '%' */
				4646	} /* until end */
				4647	if (argidx < arglen && !dict) {
				4648	PyErr_SetString(PyExc_TypeError,
				4649	"not all arguments converted");
				4650	goto onError;
				4651	}
				4652
				4653	if (args_owned) {
				4654	Py_DECREF(args);
				4655	}
				4656	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4657	if (_PyUnicode_Resize(result, reslen - rescnt))
				4658	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4659	return (PyObject *)result;
				4660
				4661	onError:
				4662	Py_XDECREF(result);
				4663	Py_DECREF(uformat);
				4664	if (args_owned) {
				4665	Py_DECREF(args);
				4666	}
				4667	return NULL;
				4668	}
				4669
				4670	static PyBufferProcs unicode_as_buffer = {
				4671	(getreadbufferproc) unicode_buffer_getreadbuf,
				4672	(getwritebufferproc) unicode_buffer_getwritebuf,
				4673	(getsegcountproc) unicode_buffer_getsegcount,
				4674	(getcharbufferproc) unicode_buffer_getcharbuf,
				4675	};
				4676
				4677	PyTypeObject PyUnicode_Type = {
				4678	PyObject_HEAD_INIT(&PyType_Type)
				4679	0, /* ob_size */
				4680	"unicode", /* tp_name */
				4681	sizeof(PyUnicodeObject), /* tp_size */
				4682	0, /* tp_itemsize */
				4683	/* Slots */
				4684	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4685	0, /* tp_print */
				4686	(getattrfunc)unicode_getattr, /* tp_getattr */
				4687	0, /* tp_setattr */
				4688	(cmpfunc) unicode_compare, /* tp_compare */
				4689	(reprfunc) unicode_repr, /* tp_repr */
				4690	0, /* tp_as_number */
				4691	&unicode_as_sequence, /* tp_as_sequence */
				4692	0, /* tp_as_mapping */
				4693	(hashfunc) unicode_hash, /* tp_hash*/
				4694	0, /* tp_call*/
				4695	(reprfunc) unicode_str, /* tp_str */
				4696	(getattrofunc) NULL, /* tp_getattro */
				4697	(setattrofunc) NULL, /* tp_setattro */
				4698	&unicode_as_buffer, /* tp_as_buffer */
				4699	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4700	};
				4701
				4702	/* Initialize the Unicode implementation */
				4703
				4704	void _PyUnicode_Init()
				4705	{
				4706	/* Doublecheck the configuration... */
				4707	if (sizeof(Py_UNICODE) != 2)
				4708	Py_FatalError("Unicode configuration error: "
				4709	"sizeof(Py_UNICODE) != 2 bytes");
				4710
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4711	/* Init the implementation */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4712	unicode_empty = _PyUnicode_New(0);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4713	strcpy(unicode_default_encoding, "utf-8");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4714	}
				4715
				4716	/* Finalize the Unicode implementation */
				4717
				4718	void
				4719	_PyUnicode_Fini()
				4720	{
				4721	PyUnicodeObject *u = unicode_freelist;
				4722
				4723	while (u != NULL) {
				4724	PyUnicodeObject *v = u;
				4725	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4726	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4727	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4728	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4729	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4730	}
				4731	Py_XDECREF(unicode_empty);
				4732	}