Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 1ea83f6ea56de9e444b972d4e33eb0383baacca9 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
				168	if (unicode->utf8str) {
				169	Py_DECREF(unicode->utf8str);
				170	unicode->utf8str = NULL;
				171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
				216	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				217	unicode_freelist_size--;
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	218	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	220	/* Keep-Alive optimization: we only upsize the buffer,
				221	never downsize it. */
				222	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	223	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	224	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	225	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	}
				227	}
				228	else
				229	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				230	}
				231	else {
				232	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				233	if (unicode == NULL)
				234	return NULL;
				235	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				236	}
				237
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	238	if (!unicode->str) {
				239	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	240	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	241	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	242	unicode->str[length] = 0;
				243	unicode->length = length;
				244	unicode->hash = -1;
				245	unicode->utf8str = NULL;
				246	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	247
				248	onError:
				249	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	250	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	251	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	}
				253
				254	static
				255	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				256	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	257	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	258	/* Keep-Alive optimization */
				259	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	260	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	261	unicode->str = NULL;
				262	unicode->length = 0;
				263	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	264	if (unicode->utf8str) {
				265	Py_DECREF(unicode->utf8str);
				266	unicode->utf8str = NULL;
				267	}
				268	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	269	(PyUnicodeObject *)unicode = unicode_freelist;
				270	unicode_freelist = unicode;
				271	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	274	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	275	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	276	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	277	}
				278	}
				279
				280	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				281	int size)
				282	{
				283	PyUnicodeObject *unicode;
				284
				285	unicode = _PyUnicode_New(size);
				286	if (!unicode)
				287	return NULL;
				288
				289	/* Copy the Unicode data into the new object */
				290	if (u != NULL)
				291	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				292
				293	return (PyObject *)unicode;
				294	}
				295
				296	#ifdef HAVE_WCHAR_H
				297
				298	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				299	int size)
				300	{
				301	PyUnicodeObject *unicode;
				302
				303	if (w == NULL) {
				304	PyErr_BadInternalCall();
				305	return NULL;
				306	}
				307
				308	unicode = _PyUnicode_New(size);
				309	if (!unicode)
				310	return NULL;
				311
				312	/* Copy the wchar_t data into the new object */
				313	#ifdef HAVE_USABLE_WCHAR_T
				314	memcpy(unicode->str, w, size * sizeof(wchar_t));
				315	#else
				316	{
				317	register Py_UNICODE *u;
				318	register int i;
				319	u = PyUnicode_AS_UNICODE(unicode);
				320	for (i = size; i >= 0; i--)
				321	u++ = w++;
				322	}
				323	#endif
				324
				325	return (PyObject *)unicode;
				326	}
				327
				328	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				329	register wchar_t *w,
				330	int size)
				331	{
				332	if (unicode == NULL) {
				333	PyErr_BadInternalCall();
				334	return -1;
				335	}
				336	if (size > PyUnicode_GET_SIZE(unicode))
				337	size = PyUnicode_GET_SIZE(unicode);
				338	#ifdef HAVE_USABLE_WCHAR_T
				339	memcpy(w, unicode->str, size * sizeof(wchar_t));
				340	#else
				341	{
				342	register Py_UNICODE *u;
				343	register int i;
				344	u = PyUnicode_AS_UNICODE(unicode);
				345	for (i = size; i >= 0; i--)
				346	w++ = u++;
				347	}
				348	#endif
				349
				350	return size;
				351	}
				352
				353	#endif
				354
				355	PyObject PyUnicode_FromObject(register PyObject obj)
				356	{
				357	const char *s;
				358	int len;
				359
				360	if (obj == NULL) {
				361	PyErr_BadInternalCall();
				362	return NULL;
				363	}
				364	else if (PyUnicode_Check(obj)) {
				365	Py_INCREF(obj);
				366	return obj;
				367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
				376	PyErr_SetString(PyExc_TypeError,
				377	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	378	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	379	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	380	if (len == 0) {
				381	Py_INCREF(unicode_empty);
				382	return (PyObject *)unicode_empty;
				383	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	384	return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	385	}
				386
				387	PyObject PyUnicode_Decode(const char s,
				388	int size,
				389	const char *encoding,
				390	const char *errors)
				391	{
				392	PyObject buffer = NULL, unicode;
				393
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	394	if (encoding == NULL)
				395	encoding = PyUnicode_GetDefaultEncoding();
				396
				397	/* Shortcuts for common default encodings */
				398	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	400	else if (strcmp(encoding, "latin-1") == 0)
				401	return PyUnicode_DecodeLatin1(s, size, errors);
				402	else if (strcmp(encoding, "ascii") == 0)
				403	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	404
				405	/* Decode via the codec registry */
				406	buffer = PyBuffer_FromMemory((void *)s, size);
				407	if (buffer == NULL)
				408	goto onError;
				409	unicode = PyCodec_Decode(buffer, encoding, errors);
				410	if (unicode == NULL)
				411	goto onError;
				412	if (!PyUnicode_Check(unicode)) {
				413	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	414	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	415	unicode->ob_type->tp_name);
				416	Py_DECREF(unicode);
				417	goto onError;
				418	}
				419	Py_DECREF(buffer);
				420	return unicode;
				421
				422	onError:
				423	Py_XDECREF(buffer);
				424	return NULL;
				425	}
				426
				427	PyObject PyUnicode_Encode(const Py_UNICODE s,
				428	int size,
				429	const char *encoding,
				430	const char *errors)
				431	{
				432	PyObject v, unicode;
				433
				434	unicode = PyUnicode_FromUnicode(s, size);
				435	if (unicode == NULL)
				436	return NULL;
				437	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				438	Py_DECREF(unicode);
				439	return v;
				440	}
				441
				442	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				443	const char *encoding,
				444	const char *errors)
				445	{
				446	PyObject *v;
				447
				448	if (!PyUnicode_Check(unicode)) {
				449	PyErr_BadArgument();
				450	goto onError;
				451	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	452
				453	if (encoding == NULL)
				454	encoding = PyUnicode_GetDefaultEncoding();
				455
				456	/* Shortcuts for common default encodings */
				457	if (errors == NULL) {
				458	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	459	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	460	else if (strcmp(encoding, "latin-1") == 0)
				461	return PyUnicode_AsLatin1String(unicode);
				462	else if (strcmp(encoding, "ascii") == 0)
				463	return PyUnicode_AsASCIIString(unicode);
				464	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	465
				466	/* Encode via the codec registry */
				467	v = PyCodec_Encode(unicode, encoding, errors);
				468	if (v == NULL)
				469	goto onError;
				470	/* XXX Should we really enforce this ? */
				471	if (!PyString_Check(v)) {
				472	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	473	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	474	v->ob_type->tp_name);
				475	Py_DECREF(v);
				476	goto onError;
				477	}
				478	return v;
				479
				480	onError:
				481	return NULL;
				482	}
				483
				484	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				485	{
				486	if (!PyUnicode_Check(unicode)) {
				487	PyErr_BadArgument();
				488	goto onError;
				489	}
				490	return PyUnicode_AS_UNICODE(unicode);
				491
				492	onError:
				493	return NULL;
				494	}
				495
				496	int PyUnicode_GetSize(PyObject *unicode)
				497	{
				498	if (!PyUnicode_Check(unicode)) {
				499	PyErr_BadArgument();
				500	goto onError;
				501	}
				502	return PyUnicode_GET_SIZE(unicode);
				503
				504	onError:
				505	return -1;
				506	}
				507
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	508	const char *PyUnicode_GetDefaultEncoding()
				509	{
				510	return unicode_default_encoding;
				511	}
				512
				513	int PyUnicode_SetDefaultEncoding(const char *encoding)
				514	{
				515	PyObject *v;
				516
				517	/* Make sure the encoding is valid. As side effect, this also
				518	loads the encoding into the codec registry cache. */
				519	v = _PyCodec_Lookup(encoding);
				520	if (v == NULL)
				521	goto onError;
				522	Py_DECREF(v);
				523	strncpy(unicode_default_encoding,
				524	encoding,
				525	sizeof(unicode_default_encoding));
				526	return 0;
				527
				528	onError:
				529	return -1;
				530	}
				531
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	532	/* --- UTF-8 Codec -------------------------------------------------------- */
				533
				534	static
				535	char utf8_code_length[256] = {
				536	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				537	illegal prefix. see RFC 2279 for details */
				538	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				539	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				540	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				541	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				542	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				543	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				544	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				545	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				546	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				547	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				548	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				549	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				550	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				551	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				552	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				553	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				554	};
				555
				556	static
				557	int utf8_decoding_error(const char **source,
				558	Py_UNICODE **dest,
				559	const char *errors,
				560	const char *details)
				561	{
				562	if ((errors == NULL) \|\|
				563	(strcmp(errors,"strict") == 0)) {
				564	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	565	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	566	details);
				567	return -1;
				568	}
				569	else if (strcmp(errors,"ignore") == 0) {
				570	(*source)++;
				571	return 0;
				572	}
				573	else if (strcmp(errors,"replace") == 0) {
				574	(*source)++;
				575	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				576	(*dest)++;
				577	return 0;
				578	}
				579	else {
				580	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	581	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	582	errors);
				583	return -1;
				584	}
				585	}
				586
				587	#define UTF8_ERROR(details) do { \
				588	if (utf8_decoding_error(&s, &p, errors, details)) \
				589	goto onError; \
				590	continue; \
				591	} while (0)
				592
				593	PyObject PyUnicode_DecodeUTF8(const char s,
				594	int size,
				595	const char *errors)
				596	{
				597	int n;
				598	const char *e;
				599	PyUnicodeObject *unicode;
				600	Py_UNICODE *p;
				601
				602	/* Note: size will always be longer than the resulting Unicode
				603	character count */
				604	unicode = _PyUnicode_New(size);
				605	if (!unicode)
				606	return NULL;
				607	if (size == 0)
				608	return (PyObject *)unicode;
				609
				610	/* Unpack UTF-8 encoded data */
				611	p = unicode->str;
				612	e = s + size;
				613
				614	while (s < e) {
				615	register Py_UNICODE ch = (unsigned char)*s;
				616
				617	if (ch < 0x80) {
				618	*p++ = ch;
				619	s++;
				620	continue;
				621	}
				622
				623	n = utf8_code_length[ch];
				624
				625	if (s + n > e)
				626	UTF8_ERROR("unexpected end of data");
				627
				628	switch (n) {
				629
				630	case 0:
				631	UTF8_ERROR("unexpected code byte");
				632	break;
				633
				634	case 1:
				635	UTF8_ERROR("internal error");
				636	break;
				637
				638	case 2:
				639	if ((s[1] & 0xc0) != 0x80)
				640	UTF8_ERROR("invalid data");
				641	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				642	if (ch < 0x80)
				643	UTF8_ERROR("illegal encoding");
				644	else
				645	*p++ = ch;
				646	break;
				647
				648	case 3:
				649	if ((s[1] & 0xc0) != 0x80 \|\|
				650	(s[2] & 0xc0) != 0x80)
				651	UTF8_ERROR("invalid data");
				652	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				653	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				654	UTF8_ERROR("illegal encoding");
				655	else
				656	*p++ = ch;
				657	break;
				658
				659	default:
				660	/* Other sizes are only needed for UCS-4 */
				661	UTF8_ERROR("unsupported Unicode code range");
				662	}
				663	s += n;
				664	}
				665
				666	/* Adjust length */
				667	if (_PyUnicode_Resize(unicode, p - unicode->str))
				668	goto onError;
				669
				670	return (PyObject *)unicode;
				671
				672	onError:
				673	Py_DECREF(unicode);
				674	return NULL;
				675	}
				676
				677	#undef UTF8_ERROR
				678
				679	static
				680	int utf8_encoding_error(const Py_UNICODE **source,
				681	char **dest,
				682	const char *errors,
				683	const char *details)
				684	{
				685	if ((errors == NULL) \|\|
				686	(strcmp(errors,"strict") == 0)) {
				687	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	688	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	details);
				690	return -1;
				691	}
				692	else if (strcmp(errors,"ignore") == 0) {
				693	return 0;
				694	}
				695	else if (strcmp(errors,"replace") == 0) {
				696	**dest = '?';
				697	(*dest)++;
				698	return 0;
				699	}
				700	else {
				701	PyErr_Format(PyExc_ValueError,
				702	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	703	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	704	errors);
				705	return -1;
				706	}
				707	}
				708
				709	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				710	int size,
				711	const char *errors)
				712	{
				713	PyObject *v;
				714	char *p;
				715	char *q;
				716
				717	v = PyString_FromStringAndSize(NULL, 3 * size);
				718	if (v == NULL)
				719	return NULL;
				720	if (size == 0)
				721	goto done;
				722
				723	p = q = PyString_AS_STRING(v);
				724	while (size-- > 0) {
				725	Py_UNICODE ch = *s++;
				726	if (ch < 0x80)
				727	*p++ = (char) ch;
				728	else if (ch < 0x0800) {
				729	*p++ = 0xc0 \| (ch >> 6);
				730	*p++ = 0x80 \| (ch & 0x3f);
				731	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				732	/* These byte ranges are reserved for UTF-16 surrogate
				733	bytes which the Python implementation currently does
				734	not support. */
				735	printf("code range problem: U+%04x\n", ch);
				736	if (utf8_encoding_error(&s, &p, errors,
				737	"unsupported code range"))
				738	goto onError;
				739	} else {
				740	*p++ = 0xe0 \| (ch >> 12);
				741	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				742	*p++ = 0x80 \| (ch & 0x3f);
				743	}
				744	}
				745	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	746	if (_PyString_Resize(&v, p - q))
				747	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	748
				749	done:
				750	return v;
				751
				752	onError:
				753	Py_DECREF(v);
				754	return NULL;
				755	}
				756
				757	/* Return a Python string holding the UTF-8 encoded value of the
				758	Unicode object.
				759
				760	The resulting string is cached in the Unicode object for subsequent
				761	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	762	the character buffer interface and will live (at least) as long as
				763	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	764
				765	The refcount of the string is not incremented.
				766
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	767	* Exported for internal use by the interpreter only !!! *
				768
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	769	*/
				770
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	771	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	772	const char *errors)
				773	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	774	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775
				776	if (v)
				777	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	778	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				779	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	780	errors);
				781	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	782	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	783	return v;
				784	}
				785
				786	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				787	{
				788	PyObject *str;
				789
				790	if (!PyUnicode_Check(unicode)) {
				791	PyErr_BadArgument();
				792	return NULL;
				793	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	794	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	795	if (str == NULL)
				796	return NULL;
				797	Py_INCREF(str);
				798	return str;
				799	}
				800
				801	/* --- UTF-16 Codec ------------------------------------------------------- */
				802
				803	static
				804	int utf16_decoding_error(const Py_UNICODE **source,
				805	Py_UNICODE **dest,
				806	const char *errors,
				807	const char *details)
				808	{
				809	if ((errors == NULL) \|\|
				810	(strcmp(errors,"strict") == 0)) {
				811	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	812	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	813	details);
				814	return -1;
				815	}
				816	else if (strcmp(errors,"ignore") == 0) {
				817	return 0;
				818	}
				819	else if (strcmp(errors,"replace") == 0) {
				820	if (dest) {
				821	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				822	(*dest)++;
				823	}
				824	return 0;
				825	}
				826	else {
				827	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	828	"UTF-16 decoding error; "
				829	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	830	errors);
				831	return -1;
				832	}
				833	}
				834
				835	#define UTF16_ERROR(details) do { \
				836	if (utf16_decoding_error(&q, &p, errors, details)) \
				837	goto onError; \
				838	continue; \
				839	} while(0)
				840
				841	PyObject PyUnicode_DecodeUTF16(const char s,
				842	int size,
				843	const char *errors,
				844	int *byteorder)
				845	{
				846	PyUnicodeObject *unicode;
				847	Py_UNICODE *p;
				848	const Py_UNICODE q, e;
				849	int bo = 0;
				850
				851	/* size should be an even number */
				852	if (size % sizeof(Py_UNICODE) != 0) {
				853	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				854	return NULL;
				855	/* The remaining input chars are ignored if we fall through
				856	here... */
				857	}
				858
				859	/* Note: size will always be longer than the resulting Unicode
				860	character count */
				861	unicode = _PyUnicode_New(size);
				862	if (!unicode)
				863	return NULL;
				864	if (size == 0)
				865	return (PyObject *)unicode;
				866
				867	/* Unpack UTF-16 encoded data */
				868	p = unicode->str;
				869	q = (Py_UNICODE *)s;
				870	e = q + (size / sizeof(Py_UNICODE));
				871
				872	if (byteorder)
				873	bo = *byteorder;
				874
				875	while (q < e) {
				876	register Py_UNICODE ch = *q++;
				877
				878	/* Check for BOM marks (U+FEFF) in the input and adjust
				879	current byte order setting accordingly. Swap input
				880	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				881	!) */
				882	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				883	if (ch == 0xFEFF) {
				884	bo = -1;
				885	continue;
				886	} else if (ch == 0xFFFE) {
				887	bo = 1;
				888	continue;
				889	}
				890	if (bo == 1)
				891	ch = (ch >> 8) \| (ch << 8);
				892	#else
				893	if (ch == 0xFEFF) {
				894	bo = 1;
				895	continue;
				896	} else if (ch == 0xFFFE) {
				897	bo = -1;
				898	continue;
				899	}
				900	if (bo == -1)
				901	ch = (ch >> 8) \| (ch << 8);
				902	#endif
				903	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				904	*p++ = ch;
				905	continue;
				906	}
				907
				908	/* UTF-16 code pair: */
				909	if (q >= e)
				910	UTF16_ERROR("unexpected end of data");
				911	if (0xDC00 <= q && q <= 0xDFFF) {
				912	q++;
				913	if (0xD800 <= q && q <= 0xDBFF)
				914	/* This is valid data (a UTF-16 surrogate pair), but
				915	we are not able to store this information since our
				916	Py_UNICODE type only has 16 bits... this might
				917	change someday, even though it's unlikely. */
				918	UTF16_ERROR("code pairs are not supported");
				919	else
				920	continue;
				921	}
				922	UTF16_ERROR("illegal encoding");
				923	}
				924
				925	if (byteorder)
				926	*byteorder = bo;
				927
				928	/* Adjust length */
				929	if (_PyUnicode_Resize(unicode, p - unicode->str))
				930	goto onError;
				931
				932	return (PyObject *)unicode;
				933
				934	onError:
				935	Py_DECREF(unicode);
				936	return NULL;
				937	}
				938
				939	#undef UTF16_ERROR
				940
				941	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				942	int size,
				943	const char *errors,
				944	int byteorder)
				945	{
				946	PyObject *v;
				947	Py_UNICODE *p;
				948	char *q;
				949
				950	/* We don't create UTF-16 pairs... */
				951	v = PyString_FromStringAndSize(NULL,
				952	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				953	if (v == NULL)
				954	return NULL;
				955	if (size == 0)
				956	goto done;
				957
				958	q = PyString_AS_STRING(v);
				959	p = (Py_UNICODE *)q;
				960
				961	if (byteorder == 0)
				962	*p++ = 0xFEFF;
				963	if (byteorder == 0 \|\|
				964	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				965	byteorder == -1
				966	#else
				967	byteorder == 1
				968	#endif
				969	)
				970	memcpy(p, s, size * sizeof(Py_UNICODE));
				971	else
				972	while (size-- > 0) {
				973	Py_UNICODE ch = *s++;
				974	*p++ = (ch >> 8) \| (ch << 8);
				975	}
				976	done:
				977	return v;
				978	}
				979
				980	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				981	{
				982	if (!PyUnicode_Check(unicode)) {
				983	PyErr_BadArgument();
				984	return NULL;
				985	}
				986	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				987	PyUnicode_GET_SIZE(unicode),
				988	NULL,
				989	0);
				990	}
				991
				992	/* --- Unicode Escape Codec ----------------------------------------------- */
				993
				994	static
				995	int unicodeescape_decoding_error(const char **source,
				996	unsigned int *x,
				997	const char *errors,
				998	const char *details)
				999	{
				1000	if ((errors == NULL) \|\|
				1001	(strcmp(errors,"strict") == 0)) {
				1002	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1003	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1004	details);
				1005	return -1;
				1006	}
				1007	else if (strcmp(errors,"ignore") == 0) {
				1008	return 0;
				1009	}
				1010	else if (strcmp(errors,"replace") == 0) {
				1011	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				1012	return 0;
				1013	}
				1014	else {
				1015	PyErr_Format(PyExc_ValueError,
				1016	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1017	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1018	errors);
				1019	return -1;
				1020	}
				1021	}
				1022
				1023	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1024	int size,
				1025	const char *errors)
				1026	{
				1027	PyUnicodeObject *v;
				1028	Py_UNICODE p = NULL, buf = NULL;
				1029	const char *end;
				1030
				1031	/* Escaped strings will always be longer than the resulting
				1032	Unicode string, so we start with size here and then reduce the
				1033	length after conversion to the true value. */
				1034	v = _PyUnicode_New(size);
				1035	if (v == NULL)
				1036	goto onError;
				1037	if (size == 0)
				1038	return (PyObject *)v;
				1039	p = buf = PyUnicode_AS_UNICODE(v);
				1040	end = s + size;
				1041	while (s < end) {
				1042	unsigned char c;
				1043	unsigned int x;
				1044	int i;
				1045
				1046	/* Non-escape characters are interpreted as Unicode ordinals */
				1047	if (*s != '\\') {
				1048	p++ = (unsigned char)s++;
				1049	continue;
				1050	}
				1051
				1052	/* \ - Escapes */
				1053	s++;
				1054	switch (*s++) {
				1055
				1056	/* \x escapes */
				1057	case '\n': break;
				1058	case '\\': *p++ = '\\'; break;
				1059	case '\'': *p++ = '\''; break;
				1060	case '\"': *p++ = '\"'; break;
				1061	case 'b': *p++ = '\b'; break;
				1062	case 'f': p++ = '\014'; break; / FF */
				1063	case 't': *p++ = '\t'; break;
				1064	case 'n': *p++ = '\n'; break;
				1065	case 'r': *p++ = '\r'; break;
				1066	case 'v': p++ = '\013'; break; / VT */
				1067	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1068
				1069	/* \OOO (octal) escapes */
				1070	case '0': case '1': case '2': case '3':
				1071	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1072	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1073	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1074	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1075	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1076	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1077	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1078	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	break;
				1080
				1081	/* \xXXXX escape with 0-4 hex digits */
				1082	case 'x':
				1083	x = 0;
				1084	c = (unsigned char)*s;
				1085	if (isxdigit(c)) {
				1086	do {
				1087	x = (x<<4) & ~0xF;
				1088	if ('0' <= c && c <= '9')
				1089	x += c - '0';
				1090	else if ('a' <= c && c <= 'f')
				1091	x += 10 + c - 'a';
				1092	else
				1093	x += 10 + c - 'A';
				1094	c = (unsigned char)*++s;
				1095	} while (isxdigit(c));
				1096	*p++ = x;
				1097	} else {
				1098	*p++ = '\\';
				1099	*p++ = (unsigned char)s[-1];
				1100	}
				1101	break;
				1102
				1103	/* \uXXXX with 4 hex digits */
				1104	case 'u':
				1105	for (x = 0, i = 0; i < 4; i++) {
				1106	c = (unsigned char)s[i];
				1107	if (!isxdigit(c)) {
				1108	if (unicodeescape_decoding_error(&s, &x, errors,
				1109	"truncated \\uXXXX"))
				1110	goto onError;
				1111	i++;
				1112	break;
				1113	}
				1114	x = (x<<4) & ~0xF;
				1115	if (c >= '0' && c <= '9')
				1116	x += c - '0';
				1117	else if (c >= 'a' && c <= 'f')
				1118	x += 10 + c - 'a';
				1119	else
				1120	x += 10 + c - 'A';
				1121	}
				1122	s += i;
				1123	*p++ = x;
				1124	break;
				1125
				1126	default:
				1127	*p++ = '\\';
				1128	*p++ = (unsigned char)s[-1];
				1129	break;
				1130	}
				1131	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1132	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1133	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1134	return (PyObject *)v;
				1135
				1136	onError:
				1137	Py_XDECREF(v);
				1138	return NULL;
				1139	}
				1140
				1141	/* Return a Unicode-Escape string version of the Unicode object.
				1142
				1143	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1144	appropriate.
				1145
				1146	*/
				1147
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1148	static const Py_UNICODE findchar(const Py_UNICODE s,
				1149	int size,
				1150	Py_UNICODE ch);
				1151
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1152	static
				1153	PyObject unicodeescape_string(const Py_UNICODE s,
				1154	int size,
				1155	int quotes)
				1156	{
				1157	PyObject *repr;
				1158	char *p;
				1159	char *q;
				1160
				1161	static const char *hexdigit = "0123456789ABCDEF";
				1162
				1163	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1164	if (repr == NULL)
				1165	return NULL;
				1166
				1167	p = q = PyString_AS_STRING(repr);
				1168
				1169	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1170	*p++ = 'u';
				1171	*p++ = (findchar(s, size, '\'') &&
				1172	!findchar(s, size, '"')) ? '"' : '\'';
				1173	}
				1174	while (size-- > 0) {
				1175	Py_UNICODE ch = *s++;
				1176	/* Escape quotes */
				1177	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1178	*p++ = '\\';
				1179	*p++ = (char) ch;
				1180	}
				1181	/* Map 16-bit characters to '\uxxxx' */
				1182	else if (ch >= 256) {
				1183	*p++ = '\\';
				1184	*p++ = 'u';
				1185	*p++ = hexdigit[(ch >> 12) & 0xf];
				1186	*p++ = hexdigit[(ch >> 8) & 0xf];
				1187	*p++ = hexdigit[(ch >> 4) & 0xf];
				1188	*p++ = hexdigit[ch & 15];
				1189	}
				1190	/* Map non-printable US ASCII to '\ooo' */
				1191	else if (ch < ' ' \|\| ch >= 128) {
				1192	*p++ = '\\';
				1193	*p++ = hexdigit[(ch >> 6) & 7];
				1194	*p++ = hexdigit[(ch >> 3) & 7];
				1195	*p++ = hexdigit[ch & 7];
				1196	}
				1197	/* Copy everything else as-is */
				1198	else
				1199	*p++ = (char) ch;
				1200	}
				1201	if (quotes)
				1202	*p++ = q[1];
				1203
				1204	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1205	if (_PyString_Resize(&repr, p - q))
				1206	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1207
				1208	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1209
				1210	onError:
				1211	Py_DECREF(repr);
				1212	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	}
				1214
				1215	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1216	int size)
				1217	{
				1218	return unicodeescape_string(s, size, 0);
				1219	}
				1220
				1221	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1222	{
				1223	if (!PyUnicode_Check(unicode)) {
				1224	PyErr_BadArgument();
				1225	return NULL;
				1226	}
				1227	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1228	PyUnicode_GET_SIZE(unicode));
				1229	}
				1230
				1231	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1232
				1233	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1234	int size,
				1235	const char *errors)
				1236	{
				1237	PyUnicodeObject *v;
				1238	Py_UNICODE p, buf;
				1239	const char *end;
				1240	const char *bs;
				1241
				1242	/* Escaped strings will always be longer than the resulting
				1243	Unicode string, so we start with size here and then reduce the
				1244	length after conversion to the true value. */
				1245	v = _PyUnicode_New(size);
				1246	if (v == NULL)
				1247	goto onError;
				1248	if (size == 0)
				1249	return (PyObject *)v;
				1250	p = buf = PyUnicode_AS_UNICODE(v);
				1251	end = s + size;
				1252	while (s < end) {
				1253	unsigned char c;
				1254	unsigned int x;
				1255	int i;
				1256
				1257	/* Non-escape characters are interpreted as Unicode ordinals */
				1258	if (*s != '\\') {
				1259	p++ = (unsigned char)s++;
				1260	continue;
				1261	}
				1262
				1263	/* \u-escapes are only interpreted iff the number of leading
				1264	backslashes if odd */
				1265	bs = s;
				1266	for (;s < end;) {
				1267	if (*s != '\\')
				1268	break;
				1269	p++ = (unsigned char)s++;
				1270	}
				1271	if (((s - bs) & 1) == 0 \|\|
				1272	s >= end \|\|
				1273	*s != 'u') {
				1274	continue;
				1275	}
				1276	p--;
				1277	s++;
				1278
				1279	/* \uXXXX with 4 hex digits */
				1280	for (x = 0, i = 0; i < 4; i++) {
				1281	c = (unsigned char)s[i];
				1282	if (!isxdigit(c)) {
				1283	if (unicodeescape_decoding_error(&s, &x, errors,
				1284	"truncated \\uXXXX"))
				1285	goto onError;
				1286	i++;
				1287	break;
				1288	}
				1289	x = (x<<4) & ~0xF;
				1290	if (c >= '0' && c <= '9')
				1291	x += c - '0';
				1292	else if (c >= 'a' && c <= 'f')
				1293	x += 10 + c - 'a';
				1294	else
				1295	x += 10 + c - 'A';
				1296	}
				1297	s += i;
				1298	*p++ = x;
				1299	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1300	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1301	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1302	return (PyObject *)v;
				1303
				1304	onError:
				1305	Py_XDECREF(v);
				1306	return NULL;
				1307	}
				1308
				1309	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1310	int size)
				1311	{
				1312	PyObject *repr;
				1313	char *p;
				1314	char *q;
				1315
				1316	static const char *hexdigit = "0123456789ABCDEF";
				1317
				1318	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1319	if (repr == NULL)
				1320	return NULL;
				1321
				1322	p = q = PyString_AS_STRING(repr);
				1323	while (size-- > 0) {
				1324	Py_UNICODE ch = *s++;
				1325	/* Map 16-bit characters to '\uxxxx' */
				1326	if (ch >= 256) {
				1327	*p++ = '\\';
				1328	*p++ = 'u';
				1329	*p++ = hexdigit[(ch >> 12) & 0xf];
				1330	*p++ = hexdigit[(ch >> 8) & 0xf];
				1331	*p++ = hexdigit[(ch >> 4) & 0xf];
				1332	*p++ = hexdigit[ch & 15];
				1333	}
				1334	/* Copy everything else as-is */
				1335	else
				1336	*p++ = (char) ch;
				1337	}
				1338	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1339	if (_PyString_Resize(&repr, p - q))
				1340	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1341
				1342	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1343
				1344	onError:
				1345	Py_DECREF(repr);
				1346	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1347	}
				1348
				1349	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1350	{
				1351	if (!PyUnicode_Check(unicode)) {
				1352	PyErr_BadArgument();
				1353	return NULL;
				1354	}
				1355	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1356	PyUnicode_GET_SIZE(unicode));
				1357	}
				1358
				1359	/* --- Latin-1 Codec ------------------------------------------------------ */
				1360
				1361	PyObject PyUnicode_DecodeLatin1(const char s,
				1362	int size,
				1363	const char *errors)
				1364	{
				1365	PyUnicodeObject *v;
				1366	Py_UNICODE *p;
				1367
				1368	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1369	v = _PyUnicode_New(size);
				1370	if (v == NULL)
				1371	goto onError;
				1372	if (size == 0)
				1373	return (PyObject *)v;
				1374	p = PyUnicode_AS_UNICODE(v);
				1375	while (size-- > 0)
				1376	p++ = (unsigned char)s++;
				1377	return (PyObject *)v;
				1378
				1379	onError:
				1380	Py_XDECREF(v);
				1381	return NULL;
				1382	}
				1383
				1384	static
				1385	int latin1_encoding_error(const Py_UNICODE **source,
				1386	char **dest,
				1387	const char *errors,
				1388	const char *details)
				1389	{
				1390	if ((errors == NULL) \|\|
				1391	(strcmp(errors,"strict") == 0)) {
				1392	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1393	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1394	details);
				1395	return -1;
				1396	}
				1397	else if (strcmp(errors,"ignore") == 0) {
				1398	return 0;
				1399	}
				1400	else if (strcmp(errors,"replace") == 0) {
				1401	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1402	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1403	return 0;
				1404	}
				1405	else {
				1406	PyErr_Format(PyExc_ValueError,
				1407	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1408	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1409	errors);
				1410	return -1;
				1411	}
				1412	}
				1413
				1414	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1415	int size,
				1416	const char *errors)
				1417	{
				1418	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1419	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1420	repr = PyString_FromStringAndSize(NULL, size);
				1421	if (repr == NULL)
				1422	return NULL;
				1423
				1424	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1425	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1426	while (size-- > 0) {
				1427	Py_UNICODE ch = *p++;
				1428	if (ch >= 256) {
				1429	if (latin1_encoding_error(&p, &s, errors,
				1430	"ordinal not in range(256)"))
				1431	goto onError;
				1432	}
				1433	else
				1434	*s++ = (char)ch;
				1435	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1436	/* Resize if error handling skipped some characters */
				1437	if (s - start < PyString_GET_SIZE(repr))
				1438	if (_PyString_Resize(&repr, s - start))
				1439	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1440	return repr;
				1441
				1442	onError:
				1443	Py_DECREF(repr);
				1444	return NULL;
				1445	}
				1446
				1447	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1448	{
				1449	if (!PyUnicode_Check(unicode)) {
				1450	PyErr_BadArgument();
				1451	return NULL;
				1452	}
				1453	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1454	PyUnicode_GET_SIZE(unicode),
				1455	NULL);
				1456	}
				1457
				1458	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1459
				1460	static
				1461	int ascii_decoding_error(const char **source,
				1462	Py_UNICODE **dest,
				1463	const char *errors,
				1464	const char *details)
				1465	{
				1466	if ((errors == NULL) \|\|
				1467	(strcmp(errors,"strict") == 0)) {
				1468	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1469	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1470	details);
				1471	return -1;
				1472	}
				1473	else if (strcmp(errors,"ignore") == 0) {
				1474	return 0;
				1475	}
				1476	else if (strcmp(errors,"replace") == 0) {
				1477	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1478	(*dest)++;
				1479	return 0;
				1480	}
				1481	else {
				1482	PyErr_Format(PyExc_ValueError,
				1483	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1484	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	errors);
				1486	return -1;
				1487	}
				1488	}
				1489
				1490	PyObject PyUnicode_DecodeASCII(const char s,
				1491	int size,
				1492	const char *errors)
				1493	{
				1494	PyUnicodeObject *v;
				1495	Py_UNICODE *p;
				1496
				1497	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1498	v = _PyUnicode_New(size);
				1499	if (v == NULL)
				1500	goto onError;
				1501	if (size == 0)
				1502	return (PyObject *)v;
				1503	p = PyUnicode_AS_UNICODE(v);
				1504	while (size-- > 0) {
				1505	register unsigned char c;
				1506
				1507	c = (unsigned char)*s++;
				1508	if (c < 128)
				1509	*p++ = c;
				1510	else if (ascii_decoding_error(&s, &p, errors,
				1511	"ordinal not in range(128)"))
				1512	goto onError;
				1513	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1514	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1515	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1516	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1517	return (PyObject *)v;
				1518
				1519	onError:
				1520	Py_XDECREF(v);
				1521	return NULL;
				1522	}
				1523
				1524	static
				1525	int ascii_encoding_error(const Py_UNICODE **source,
				1526	char **dest,
				1527	const char *errors,
				1528	const char *details)
				1529	{
				1530	if ((errors == NULL) \|\|
				1531	(strcmp(errors,"strict") == 0)) {
				1532	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1533	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1534	details);
				1535	return -1;
				1536	}
				1537	else if (strcmp(errors,"ignore") == 0) {
				1538	return 0;
				1539	}
				1540	else if (strcmp(errors,"replace") == 0) {
				1541	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1542	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1543	return 0;
				1544	}
				1545	else {
				1546	PyErr_Format(PyExc_ValueError,
				1547	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1548	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1549	errors);
				1550	return -1;
				1551	}
				1552	}
				1553
				1554	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1555	int size,
				1556	const char *errors)
				1557	{
				1558	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1559	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1560	repr = PyString_FromStringAndSize(NULL, size);
				1561	if (repr == NULL)
				1562	return NULL;
				1563
				1564	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1565	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1566	while (size-- > 0) {
				1567	Py_UNICODE ch = *p++;
				1568	if (ch >= 128) {
				1569	if (ascii_encoding_error(&p, &s, errors,
				1570	"ordinal not in range(128)"))
				1571	goto onError;
				1572	}
				1573	else
				1574	*s++ = (char)ch;
				1575	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1576	/* Resize if error handling skipped some characters */
				1577	if (s - start < PyString_GET_SIZE(repr))
				1578	if (_PyString_Resize(&repr, s - start))
				1579	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1580	return repr;
				1581
				1582	onError:
				1583	Py_DECREF(repr);
				1584	return NULL;
				1585	}
				1586
				1587	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1588	{
				1589	if (!PyUnicode_Check(unicode)) {
				1590	PyErr_BadArgument();
				1591	return NULL;
				1592	}
				1593	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1594	PyUnicode_GET_SIZE(unicode),
				1595	NULL);
				1596	}
				1597
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1598	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1599
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1600	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1601
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1602	PyObject PyUnicode_DecodeMBCS(const char s,
				1603	int size,
				1604	const char *errors)
				1605	{
				1606	PyUnicodeObject *v;
				1607	Py_UNICODE *p;
				1608
				1609	/* First get the size of the result */
				1610	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1611	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1612	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1613
				1614	v = _PyUnicode_New(usize);
				1615	if (v == NULL)
				1616	return NULL;
				1617	if (usize == 0)
				1618	return (PyObject *)v;
				1619	p = PyUnicode_AS_UNICODE(v);
				1620	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1621	Py_DECREF(v);
				1622	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1623	}
				1624
				1625	return (PyObject *)v;
				1626	}
				1627
				1628	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1629	int size,
				1630	const char *errors)
				1631	{
				1632	PyObject *repr;
				1633	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1634	DWORD mbcssize;
				1635
				1636	/* If there are no characters, bail now! */
				1637	if (size==0)
				1638	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1639
				1640	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1641	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1642	if (mbcssize==0)
				1643	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1644
				1645	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1646	if (repr == NULL)
				1647	return NULL;
				1648	if (mbcssize==0)
				1649	return repr;
				1650
				1651	/* Do the conversion */
				1652	s = PyString_AS_STRING(repr);
				1653	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1654	Py_DECREF(repr);
				1655	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1656	}
				1657	return repr;
				1658	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1659
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1660	#endif /* MS_WIN32 */
				1661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1662	/* --- Character Mapping Codec -------------------------------------------- */
				1663
				1664	static
				1665	int charmap_decoding_error(const char **source,
				1666	Py_UNICODE **dest,
				1667	const char *errors,
				1668	const char *details)
				1669	{
				1670	if ((errors == NULL) \|\|
				1671	(strcmp(errors,"strict") == 0)) {
				1672	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1673	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1674	details);
				1675	return -1;
				1676	}
				1677	else if (strcmp(errors,"ignore") == 0) {
				1678	return 0;
				1679	}
				1680	else if (strcmp(errors,"replace") == 0) {
				1681	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1682	(*dest)++;
				1683	return 0;
				1684	}
				1685	else {
				1686	PyErr_Format(PyExc_ValueError,
				1687	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1688	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1689	errors);
				1690	return -1;
				1691	}
				1692	}
				1693
				1694	PyObject PyUnicode_DecodeCharmap(const char s,
				1695	int size,
				1696	PyObject *mapping,
				1697	const char *errors)
				1698	{
				1699	PyUnicodeObject *v;
				1700	Py_UNICODE *p;
				1701
				1702	/* Default to Latin-1 */
				1703	if (mapping == NULL)
				1704	return PyUnicode_DecodeLatin1(s, size, errors);
				1705
				1706	v = _PyUnicode_New(size);
				1707	if (v == NULL)
				1708	goto onError;
				1709	if (size == 0)
				1710	return (PyObject *)v;
				1711	p = PyUnicode_AS_UNICODE(v);
				1712	while (size-- > 0) {
				1713	unsigned char ch = *s++;
				1714	PyObject w, x;
				1715
				1716	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1717	w = PyInt_FromLong((long)ch);
				1718	if (w == NULL)
				1719	goto onError;
				1720	x = PyObject_GetItem(mapping, w);
				1721	Py_DECREF(w);
				1722	if (x == NULL) {
				1723	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1724	/* No mapping found: default to Latin-1 mapping */
				1725	PyErr_Clear();
				1726	*p++ = (Py_UNICODE)ch;
				1727	continue;
				1728	}
				1729	goto onError;
				1730	}
				1731
				1732	/* Apply mapping */
				1733	if (PyInt_Check(x)) {
				1734	int value = PyInt_AS_LONG(x);
				1735	if (value < 0 \|\| value > 65535) {
				1736	PyErr_SetString(PyExc_TypeError,
				1737	"character mapping must be in range(65336)");
				1738	Py_DECREF(x);
				1739	goto onError;
				1740	}
				1741	*p++ = (Py_UNICODE)value;
				1742	}
				1743	else if (x == Py_None) {
				1744	/* undefined mapping */
				1745	if (charmap_decoding_error(&s, &p, errors,
				1746	"character maps to <undefined>")) {
				1747	Py_DECREF(x);
				1748	goto onError;
				1749	}
				1750	}
				1751	else if (PyUnicode_Check(x)) {
				1752	if (PyUnicode_GET_SIZE(x) != 1) {
				1753	/* 1-n mapping */
				1754	PyErr_SetString(PyExc_NotImplementedError,
				1755	"1-n mappings are currently not implemented");
				1756	Py_DECREF(x);
				1757	goto onError;
				1758	}
				1759	p++ = PyUnicode_AS_UNICODE(x);
				1760	}
				1761	else {
				1762	/* wrong return value */
				1763	PyErr_SetString(PyExc_TypeError,
				1764	"character mapping must return integer, None or unicode");
				1765	Py_DECREF(x);
				1766	goto onError;
				1767	}
				1768	Py_DECREF(x);
				1769	}
				1770	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1771	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1772	goto onError;
				1773	return (PyObject *)v;
				1774
				1775	onError:
				1776	Py_XDECREF(v);
				1777	return NULL;
				1778	}
				1779
				1780	static
				1781	int charmap_encoding_error(const Py_UNICODE **source,
				1782	char **dest,
				1783	const char *errors,
				1784	const char *details)
				1785	{
				1786	if ((errors == NULL) \|\|
				1787	(strcmp(errors,"strict") == 0)) {
				1788	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1789	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1790	details);
				1791	return -1;
				1792	}
				1793	else if (strcmp(errors,"ignore") == 0) {
				1794	return 0;
				1795	}
				1796	else if (strcmp(errors,"replace") == 0) {
				1797	**dest = '?';
				1798	(*dest)++;
				1799	return 0;
				1800	}
				1801	else {
				1802	PyErr_Format(PyExc_ValueError,
				1803	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1804	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1805	errors);
				1806	return -1;
				1807	}
				1808	}
				1809
				1810	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1811	int size,
				1812	PyObject *mapping,
				1813	const char *errors)
				1814	{
				1815	PyObject *v;
				1816	char *s;
				1817
				1818	/* Default to Latin-1 */
				1819	if (mapping == NULL)
				1820	return PyUnicode_EncodeLatin1(p, size, errors);
				1821
				1822	v = PyString_FromStringAndSize(NULL, size);
				1823	if (v == NULL)
				1824	return NULL;
				1825	s = PyString_AS_STRING(v);
				1826	while (size-- > 0) {
				1827	Py_UNICODE ch = *p++;
				1828	PyObject w, x;
				1829
				1830	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1831	w = PyInt_FromLong((long)ch);
				1832	if (w == NULL)
				1833	goto onError;
				1834	x = PyObject_GetItem(mapping, w);
				1835	Py_DECREF(w);
				1836	if (x == NULL) {
				1837	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1838	/* No mapping found: default to Latin-1 mapping if possible */
				1839	PyErr_Clear();
				1840	if (ch < 256) {
				1841	*s++ = (char)ch;
				1842	continue;
				1843	}
				1844	else if (!charmap_encoding_error(&p, &s, errors,
				1845	"missing character mapping"))
				1846	continue;
				1847	}
				1848	goto onError;
				1849	}
				1850
				1851	/* Apply mapping */
				1852	if (PyInt_Check(x)) {
				1853	int value = PyInt_AS_LONG(x);
				1854	if (value < 0 \|\| value > 255) {
				1855	PyErr_SetString(PyExc_TypeError,
				1856	"character mapping must be in range(256)");
				1857	Py_DECREF(x);
				1858	goto onError;
				1859	}
				1860	*s++ = (char)value;
				1861	}
				1862	else if (x == Py_None) {
				1863	/* undefined mapping */
				1864	if (charmap_encoding_error(&p, &s, errors,
				1865	"character maps to <undefined>")) {
				1866	Py_DECREF(x);
				1867	goto onError;
				1868	}
				1869	}
				1870	else if (PyString_Check(x)) {
				1871	if (PyString_GET_SIZE(x) != 1) {
				1872	/* 1-n mapping */
				1873	PyErr_SetString(PyExc_NotImplementedError,
				1874	"1-n mappings are currently not implemented");
				1875	Py_DECREF(x);
				1876	goto onError;
				1877	}
				1878	s++ = PyString_AS_STRING(x);
				1879	}
				1880	else {
				1881	/* wrong return value */
				1882	PyErr_SetString(PyExc_TypeError,
				1883	"character mapping must return integer, None or unicode");
				1884	Py_DECREF(x);
				1885	goto onError;
				1886	}
				1887	Py_DECREF(x);
				1888	}
				1889	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1890	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1891	goto onError;
				1892	return v;
				1893
				1894	onError:
				1895	Py_DECREF(v);
				1896	return NULL;
				1897	}
				1898
				1899	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1900	PyObject *mapping)
				1901	{
				1902	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1903	PyErr_BadArgument();
				1904	return NULL;
				1905	}
				1906	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1907	PyUnicode_GET_SIZE(unicode),
				1908	mapping,
				1909	NULL);
				1910	}
				1911
				1912	static
				1913	int translate_error(const Py_UNICODE **source,
				1914	Py_UNICODE **dest,
				1915	const char *errors,
				1916	const char *details)
				1917	{
				1918	if ((errors == NULL) \|\|
				1919	(strcmp(errors,"strict") == 0)) {
				1920	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1921	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1922	details);
				1923	return -1;
				1924	}
				1925	else if (strcmp(errors,"ignore") == 0) {
				1926	return 0;
				1927	}
				1928	else if (strcmp(errors,"replace") == 0) {
				1929	**dest = '?';
				1930	(*dest)++;
				1931	return 0;
				1932	}
				1933	else {
				1934	PyErr_Format(PyExc_ValueError,
				1935	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1936	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1937	errors);
				1938	return -1;
				1939	}
				1940	}
				1941
				1942	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1943	int size,
				1944	PyObject *mapping,
				1945	const char *errors)
				1946	{
				1947	PyUnicodeObject *v;
				1948	Py_UNICODE *p;
				1949
				1950	if (mapping == NULL) {
				1951	PyErr_BadArgument();
				1952	return NULL;
				1953	}
				1954
				1955	/* Output will never be longer than input */
				1956	v = _PyUnicode_New(size);
				1957	if (v == NULL)
				1958	goto onError;
				1959	if (size == 0)
				1960	goto done;
				1961	p = PyUnicode_AS_UNICODE(v);
				1962	while (size-- > 0) {
				1963	Py_UNICODE ch = *s++;
				1964	PyObject w, x;
				1965
				1966	/* Get mapping */
				1967	w = PyInt_FromLong(ch);
				1968	if (w == NULL)
				1969	goto onError;
				1970	x = PyObject_GetItem(mapping, w);
				1971	Py_DECREF(w);
				1972	if (x == NULL) {
				1973	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1974	/* No mapping found: default to 1-1 mapping */
				1975	PyErr_Clear();
				1976	*p++ = ch;
				1977	continue;
				1978	}
				1979	goto onError;
				1980	}
				1981
				1982	/* Apply mapping */
				1983	if (PyInt_Check(x))
				1984	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1985	else if (x == Py_None) {
				1986	/* undefined mapping */
				1987	if (translate_error(&s, &p, errors,
				1988	"character maps to <undefined>")) {
				1989	Py_DECREF(x);
				1990	goto onError;
				1991	}
				1992	}
				1993	else if (PyUnicode_Check(x)) {
				1994	if (PyUnicode_GET_SIZE(x) != 1) {
				1995	/* 1-n mapping */
				1996	PyErr_SetString(PyExc_NotImplementedError,
				1997	"1-n mappings are currently not implemented");
				1998	Py_DECREF(x);
				1999	goto onError;
				2000	}
				2001	p++ = PyUnicode_AS_UNICODE(x);
				2002	}
				2003	else {
				2004	/* wrong return value */
				2005	PyErr_SetString(PyExc_TypeError,
				2006	"translate mapping must return integer, None or unicode");
				2007	Py_DECREF(x);
				2008	goto onError;
				2009	}
				2010	Py_DECREF(x);
				2011	}
				2012	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2013	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2014	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2015
				2016	done:
				2017	return (PyObject *)v;
				2018
				2019	onError:
				2020	Py_XDECREF(v);
				2021	return NULL;
				2022	}
				2023
				2024	PyObject PyUnicode_Translate(PyObject str,
				2025	PyObject *mapping,
				2026	const char *errors)
				2027	{
				2028	PyObject *result;
				2029
				2030	str = PyUnicode_FromObject(str);
				2031	if (str == NULL)
				2032	goto onError;
				2033	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2034	PyUnicode_GET_SIZE(str),
				2035	mapping,
				2036	errors);
				2037	Py_DECREF(str);
				2038	return result;
				2039
				2040	onError:
				2041	Py_XDECREF(str);
				2042	return NULL;
				2043	}
				2044
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2045	/* --- Decimal Encoder ---------------------------------------------------- */
				2046
				2047	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2048	int length,
				2049	char *output,
				2050	const char *errors)
				2051	{
				2052	Py_UNICODE p, end;
				2053
				2054	if (output == NULL) {
				2055	PyErr_BadArgument();
				2056	return -1;
				2057	}
				2058
				2059	p = s;
				2060	end = s + length;
				2061	while (p < end) {
				2062	register Py_UNICODE ch = *p++;
				2063	int decimal;
				2064
				2065	if (Py_UNICODE_ISSPACE(ch)) {
				2066	*output++ = ' ';
				2067	continue;
				2068	}
				2069	decimal = Py_UNICODE_TODECIMAL(ch);
				2070	if (decimal >= 0) {
				2071	*output++ = '0' + decimal;
				2072	continue;
				2073	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2074	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2075	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2076	continue;
				2077	}
				2078	/* All other characters are considered invalid */
				2079	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2080	PyErr_SetString(PyExc_ValueError,
				2081	"invalid decimal Unicode string");
				2082	goto onError;
				2083	}
				2084	else if (strcmp(errors, "ignore") == 0)
				2085	continue;
				2086	else if (strcmp(errors, "replace") == 0) {
				2087	*output++ = '?';
				2088	continue;
				2089	}
				2090	}
				2091	/* 0-terminate the output string */
				2092	*output++ = '\0';
				2093	return 0;
				2094
				2095	onError:
				2096	return -1;
				2097	}
				2098
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2099	/* --- Helpers ------------------------------------------------------------ */
				2100
				2101	static
				2102	int count(PyUnicodeObject *self,
				2103	int start,
				2104	int end,
				2105	PyUnicodeObject *substring)
				2106	{
				2107	int count = 0;
				2108
				2109	end -= substring->length;
				2110
				2111	while (start <= end)
				2112	if (Py_UNICODE_MATCH(self, start, substring)) {
				2113	count++;
				2114	start += substring->length;
				2115	} else
				2116	start++;
				2117
				2118	return count;
				2119	}
				2120
				2121	int PyUnicode_Count(PyObject *str,
				2122	PyObject *substr,
				2123	int start,
				2124	int end)
				2125	{
				2126	int result;
				2127
				2128	str = PyUnicode_FromObject(str);
				2129	if (str == NULL)
				2130	return -1;
				2131	substr = PyUnicode_FromObject(substr);
				2132	if (substr == NULL) {
				2133	Py_DECREF(substr);
				2134	return -1;
				2135	}
				2136
				2137	result = count((PyUnicodeObject *)str,
				2138	start, end,
				2139	(PyUnicodeObject *)substr);
				2140
				2141	Py_DECREF(str);
				2142	Py_DECREF(substr);
				2143	return result;
				2144	}
				2145
				2146	static
				2147	int findstring(PyUnicodeObject *self,
				2148	PyUnicodeObject *substring,
				2149	int start,
				2150	int end,
				2151	int direction)
				2152	{
				2153	if (start < 0)
				2154	start += self->length;
				2155	if (start < 0)
				2156	start = 0;
				2157
				2158	if (substring->length == 0)
				2159	return start;
				2160
				2161	if (end > self->length)
				2162	end = self->length;
				2163	if (end < 0)
				2164	end += self->length;
				2165	if (end < 0)
				2166	end = 0;
				2167
				2168	end -= substring->length;
				2169
				2170	if (direction < 0) {
				2171	for (; end >= start; end--)
				2172	if (Py_UNICODE_MATCH(self, end, substring))
				2173	return end;
				2174	} else {
				2175	for (; start <= end; start++)
				2176	if (Py_UNICODE_MATCH(self, start, substring))
				2177	return start;
				2178	}
				2179
				2180	return -1;
				2181	}
				2182
				2183	int PyUnicode_Find(PyObject *str,
				2184	PyObject *substr,
				2185	int start,
				2186	int end,
				2187	int direction)
				2188	{
				2189	int result;
				2190
				2191	str = PyUnicode_FromObject(str);
				2192	if (str == NULL)
				2193	return -1;
				2194	substr = PyUnicode_FromObject(substr);
				2195	if (substr == NULL) {
				2196	Py_DECREF(substr);
				2197	return -1;
				2198	}
				2199
				2200	result = findstring((PyUnicodeObject *)str,
				2201	(PyUnicodeObject *)substr,
				2202	start, end, direction);
				2203	Py_DECREF(str);
				2204	Py_DECREF(substr);
				2205	return result;
				2206	}
				2207
				2208	static
				2209	int tailmatch(PyUnicodeObject *self,
				2210	PyUnicodeObject *substring,
				2211	int start,
				2212	int end,
				2213	int direction)
				2214	{
				2215	if (start < 0)
				2216	start += self->length;
				2217	if (start < 0)
				2218	start = 0;
				2219
				2220	if (substring->length == 0)
				2221	return 1;
				2222
				2223	if (end > self->length)
				2224	end = self->length;
				2225	if (end < 0)
				2226	end += self->length;
				2227	if (end < 0)
				2228	end = 0;
				2229
				2230	end -= substring->length;
				2231	if (end < start)
				2232	return 0;
				2233
				2234	if (direction > 0) {
				2235	if (Py_UNICODE_MATCH(self, end, substring))
				2236	return 1;
				2237	} else {
				2238	if (Py_UNICODE_MATCH(self, start, substring))
				2239	return 1;
				2240	}
				2241
				2242	return 0;
				2243	}
				2244
				2245	int PyUnicode_Tailmatch(PyObject *str,
				2246	PyObject *substr,
				2247	int start,
				2248	int end,
				2249	int direction)
				2250	{
				2251	int result;
				2252
				2253	str = PyUnicode_FromObject(str);
				2254	if (str == NULL)
				2255	return -1;
				2256	substr = PyUnicode_FromObject(substr);
				2257	if (substr == NULL) {
				2258	Py_DECREF(substr);
				2259	return -1;
				2260	}
				2261
				2262	result = tailmatch((PyUnicodeObject *)str,
				2263	(PyUnicodeObject *)substr,
				2264	start, end, direction);
				2265	Py_DECREF(str);
				2266	Py_DECREF(substr);
				2267	return result;
				2268	}
				2269
				2270	static
				2271	const Py_UNICODE findchar(const Py_UNICODE s,
				2272	int size,
				2273	Py_UNICODE ch)
				2274	{
				2275	/* like wcschr, but doesn't stop at NULL characters */
				2276
				2277	while (size-- > 0) {
				2278	if (*s == ch)
				2279	return s;
				2280	s++;
				2281	}
				2282
				2283	return NULL;
				2284	}
				2285
				2286	/* Apply fixfct filter to the Unicode object self and return a
				2287	reference to the modified object */
				2288
				2289	static
				2290	PyObject fixup(PyUnicodeObject self,
				2291	int (fixfct)(PyUnicodeObject s))
				2292	{
				2293
				2294	PyUnicodeObject *u;
				2295
				2296	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2297	self->length);
				2298	if (u == NULL)
				2299	return NULL;
				2300	if (!fixfct(u)) {
				2301	/* fixfct should return TRUE if it modified the buffer. If
				2302	FALSE, return a reference to the original buffer instead
				2303	(to save space, not time) */
				2304	Py_INCREF(self);
				2305	Py_DECREF(u);
				2306	return (PyObject*) self;
				2307	}
				2308	return (PyObject*) u;
				2309	}
				2310
				2311	static
				2312	int fixupper(PyUnicodeObject *self)
				2313	{
				2314	int len = self->length;
				2315	Py_UNICODE *s = self->str;
				2316	int status = 0;
				2317
				2318	while (len-- > 0) {
				2319	register Py_UNICODE ch;
				2320
				2321	ch = Py_UNICODE_TOUPPER(*s);
				2322	if (ch != *s) {
				2323	status = 1;
				2324	*s = ch;
				2325	}
				2326	s++;
				2327	}
				2328
				2329	return status;
				2330	}
				2331
				2332	static
				2333	int fixlower(PyUnicodeObject *self)
				2334	{
				2335	int len = self->length;
				2336	Py_UNICODE *s = self->str;
				2337	int status = 0;
				2338
				2339	while (len-- > 0) {
				2340	register Py_UNICODE ch;
				2341
				2342	ch = Py_UNICODE_TOLOWER(*s);
				2343	if (ch != *s) {
				2344	status = 1;
				2345	*s = ch;
				2346	}
				2347	s++;
				2348	}
				2349
				2350	return status;
				2351	}
				2352
				2353	static
				2354	int fixswapcase(PyUnicodeObject *self)
				2355	{
				2356	int len = self->length;
				2357	Py_UNICODE *s = self->str;
				2358	int status = 0;
				2359
				2360	while (len-- > 0) {
				2361	if (Py_UNICODE_ISUPPER(*s)) {
				2362	s = Py_UNICODE_TOLOWER(s);
				2363	status = 1;
				2364	} else if (Py_UNICODE_ISLOWER(*s)) {
				2365	s = Py_UNICODE_TOUPPER(s);
				2366	status = 1;
				2367	}
				2368	s++;
				2369	}
				2370
				2371	return status;
				2372	}
				2373
				2374	static
				2375	int fixcapitalize(PyUnicodeObject *self)
				2376	{
				2377	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2378	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2379	return 1;
				2380	}
				2381	return 0;
				2382	}
				2383
				2384	static
				2385	int fixtitle(PyUnicodeObject *self)
				2386	{
				2387	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2388	register Py_UNICODE *e;
				2389	int previous_is_cased;
				2390
				2391	/* Shortcut for single character strings */
				2392	if (PyUnicode_GET_SIZE(self) == 1) {
				2393	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2394	if (*p != ch) {
				2395	*p = ch;
				2396	return 1;
				2397	}
				2398	else
				2399	return 0;
				2400	}
				2401
				2402	e = p + PyUnicode_GET_SIZE(self);
				2403	previous_is_cased = 0;
				2404	for (; p < e; p++) {
				2405	register const Py_UNICODE ch = *p;
				2406
				2407	if (previous_is_cased)
				2408	*p = Py_UNICODE_TOLOWER(ch);
				2409	else
				2410	*p = Py_UNICODE_TOTITLE(ch);
				2411
				2412	if (Py_UNICODE_ISLOWER(ch) \|\|
				2413	Py_UNICODE_ISUPPER(ch) \|\|
				2414	Py_UNICODE_ISTITLE(ch))
				2415	previous_is_cased = 1;
				2416	else
				2417	previous_is_cased = 0;
				2418	}
				2419	return 1;
				2420	}
				2421
				2422	PyObject PyUnicode_Join(PyObject separator,
				2423	PyObject *seq)
				2424	{
				2425	Py_UNICODE *sep;
				2426	int seplen;
				2427	PyUnicodeObject *res = NULL;
				2428	int reslen = 0;
				2429	Py_UNICODE *p;
				2430	int seqlen = 0;
				2431	int sz = 100;
				2432	int i;
				2433
				2434	seqlen = PySequence_Length(seq);
				2435	if (seqlen < 0 && PyErr_Occurred())
				2436	return NULL;
				2437
				2438	if (separator == NULL) {
				2439	Py_UNICODE blank = ' ';
				2440	sep = &blank;
				2441	seplen = 1;
				2442	}
				2443	else {
				2444	separator = PyUnicode_FromObject(separator);
				2445	if (separator == NULL)
				2446	return NULL;
				2447	sep = PyUnicode_AS_UNICODE(separator);
				2448	seplen = PyUnicode_GET_SIZE(separator);
				2449	}
				2450
				2451	res = _PyUnicode_New(sz);
				2452	if (res == NULL)
				2453	goto onError;
				2454	p = PyUnicode_AS_UNICODE(res);
				2455	reslen = 0;
				2456
				2457	for (i = 0; i < seqlen; i++) {
				2458	int itemlen;
				2459	PyObject *item;
				2460
				2461	item = PySequence_GetItem(seq, i);
				2462	if (item == NULL)
				2463	goto onError;
				2464	if (!PyUnicode_Check(item)) {
				2465	PyObject *v;
				2466	v = PyUnicode_FromObject(item);
				2467	Py_DECREF(item);
				2468	item = v;
				2469	if (item == NULL)
				2470	goto onError;
				2471	}
				2472	itemlen = PyUnicode_GET_SIZE(item);
				2473	while (reslen + itemlen + seplen >= sz) {
				2474	if (_PyUnicode_Resize(res, sz*2))
				2475	goto onError;
				2476	sz *= 2;
				2477	p = PyUnicode_AS_UNICODE(res) + reslen;
				2478	}
				2479	if (i > 0) {
				2480	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2481	p += seplen;
				2482	reslen += seplen;
				2483	}
				2484	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2485	p += itemlen;
				2486	reslen += itemlen;
				2487	Py_DECREF(item);
				2488	}
				2489	if (_PyUnicode_Resize(res, reslen))
				2490	goto onError;
				2491
				2492	Py_XDECREF(separator);
				2493	return (PyObject *)res;
				2494
				2495	onError:
				2496	Py_XDECREF(separator);
				2497	Py_DECREF(res);
				2498	return NULL;
				2499	}
				2500
				2501	static
				2502	PyUnicodeObject pad(PyUnicodeObject self,
				2503	int left,
				2504	int right,
				2505	Py_UNICODE fill)
				2506	{
				2507	PyUnicodeObject *u;
				2508
				2509	if (left < 0)
				2510	left = 0;
				2511	if (right < 0)
				2512	right = 0;
				2513
				2514	if (left == 0 && right == 0) {
				2515	Py_INCREF(self);
				2516	return self;
				2517	}
				2518
				2519	u = _PyUnicode_New(left + self->length + right);
				2520	if (u) {
				2521	if (left)
				2522	Py_UNICODE_FILL(u->str, fill, left);
				2523	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2524	if (right)
				2525	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2526	}
				2527
				2528	return u;
				2529	}
				2530
				2531	#define SPLIT_APPEND(data, left, right) \
				2532	str = PyUnicode_FromUnicode(data + left, right - left); \
				2533	if (!str) \
				2534	goto onError; \
				2535	if (PyList_Append(list, str)) { \
				2536	Py_DECREF(str); \
				2537	goto onError; \
				2538	} \
				2539	else \
				2540	Py_DECREF(str);
				2541
				2542	static
				2543	PyObject split_whitespace(PyUnicodeObject self,
				2544	PyObject *list,
				2545	int maxcount)
				2546	{
				2547	register int i;
				2548	register int j;
				2549	int len = self->length;
				2550	PyObject *str;
				2551
				2552	for (i = j = 0; i < len; ) {
				2553	/* find a token */
				2554	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2555	i++;
				2556	j = i;
				2557	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2558	i++;
				2559	if (j < i) {
				2560	if (maxcount-- <= 0)
				2561	break;
				2562	SPLIT_APPEND(self->str, j, i);
				2563	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2564	i++;
				2565	j = i;
				2566	}
				2567	}
				2568	if (j < len) {
				2569	SPLIT_APPEND(self->str, j, len);
				2570	}
				2571	return list;
				2572
				2573	onError:
				2574	Py_DECREF(list);
				2575	return NULL;
				2576	}
				2577
				2578	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2579	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2580	{
				2581	register int i;
				2582	register int j;
				2583	int len;
				2584	PyObject *list;
				2585	PyObject *str;
				2586	Py_UNICODE *data;
				2587
				2588	string = PyUnicode_FromObject(string);
				2589	if (string == NULL)
				2590	return NULL;
				2591	data = PyUnicode_AS_UNICODE(string);
				2592	len = PyUnicode_GET_SIZE(string);
				2593
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2594	list = PyList_New(0);
				2595	if (!list)
				2596	goto onError;
				2597
				2598	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2599	int eol;
				2600
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2601	/* Find a line and append it */
				2602	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2603	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2604
				2605	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2606	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2607	if (i < len) {
				2608	if (data[i] == '\r' && i + 1 < len &&
				2609	data[i+1] == '\n')
				2610	i += 2;
				2611	else
				2612	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2613	if (keepends)
				2614	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2615	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2616	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2617	j = i;
				2618	}
				2619	if (j < len) {
				2620	SPLIT_APPEND(data, j, len);
				2621	}
				2622
				2623	Py_DECREF(string);
				2624	return list;
				2625
				2626	onError:
				2627	Py_DECREF(list);
				2628	Py_DECREF(string);
				2629	return NULL;
				2630	}
				2631
				2632	static
				2633	PyObject split_char(PyUnicodeObject self,
				2634	PyObject *list,
				2635	Py_UNICODE ch,
				2636	int maxcount)
				2637	{
				2638	register int i;
				2639	register int j;
				2640	int len = self->length;
				2641	PyObject *str;
				2642
				2643	for (i = j = 0; i < len; ) {
				2644	if (self->str[i] == ch) {
				2645	if (maxcount-- <= 0)
				2646	break;
				2647	SPLIT_APPEND(self->str, j, i);
				2648	i = j = i + 1;
				2649	} else
				2650	i++;
				2651	}
				2652	if (j <= len) {
				2653	SPLIT_APPEND(self->str, j, len);
				2654	}
				2655	return list;
				2656
				2657	onError:
				2658	Py_DECREF(list);
				2659	return NULL;
				2660	}
				2661
				2662	static
				2663	PyObject split_substring(PyUnicodeObject self,
				2664	PyObject *list,
				2665	PyUnicodeObject *substring,
				2666	int maxcount)
				2667	{
				2668	register int i;
				2669	register int j;
				2670	int len = self->length;
				2671	int sublen = substring->length;
				2672	PyObject *str;
				2673
				2674	for (i = j = 0; i < len - sublen; ) {
				2675	if (Py_UNICODE_MATCH(self, i, substring)) {
				2676	if (maxcount-- <= 0)
				2677	break;
				2678	SPLIT_APPEND(self->str, j, i);
				2679	i = j = i + sublen;
				2680	} else
				2681	i++;
				2682	}
				2683	if (j <= len) {
				2684	SPLIT_APPEND(self->str, j, len);
				2685	}
				2686	return list;
				2687
				2688	onError:
				2689	Py_DECREF(list);
				2690	return NULL;
				2691	}
				2692
				2693	#undef SPLIT_APPEND
				2694
				2695	static
				2696	PyObject split(PyUnicodeObject self,
				2697	PyUnicodeObject *substring,
				2698	int maxcount)
				2699	{
				2700	PyObject *list;
				2701
				2702	if (maxcount < 0)
				2703	maxcount = INT_MAX;
				2704
				2705	list = PyList_New(0);
				2706	if (!list)
				2707	return NULL;
				2708
				2709	if (substring == NULL)
				2710	return split_whitespace(self,list,maxcount);
				2711
				2712	else if (substring->length == 1)
				2713	return split_char(self,list,substring->str[0],maxcount);
				2714
				2715	else if (substring->length == 0) {
				2716	Py_DECREF(list);
				2717	PyErr_SetString(PyExc_ValueError, "empty separator");
				2718	return NULL;
				2719	}
				2720	else
				2721	return split_substring(self,list,substring,maxcount);
				2722	}
				2723
				2724	static
				2725	PyObject strip(PyUnicodeObject self,
				2726	int left,
				2727	int right)
				2728	{
				2729	Py_UNICODE *p = self->str;
				2730	int start = 0;
				2731	int end = self->length;
				2732
				2733	if (left)
				2734	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2735	start++;
				2736
				2737	if (right)
				2738	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2739	end--;
				2740
				2741	if (start == 0 && end == self->length) {
				2742	/* couldn't strip anything off, return original string */
				2743	Py_INCREF(self);
				2744	return (PyObject*) self;
				2745	}
				2746
				2747	return (PyObject*) PyUnicode_FromUnicode(
				2748	self->str + start,
				2749	end - start
				2750	);
				2751	}
				2752
				2753	static
				2754	PyObject replace(PyUnicodeObject self,
				2755	PyUnicodeObject *str1,
				2756	PyUnicodeObject *str2,
				2757	int maxcount)
				2758	{
				2759	PyUnicodeObject *u;
				2760
				2761	if (maxcount < 0)
				2762	maxcount = INT_MAX;
				2763
				2764	if (str1->length == 1 && str2->length == 1) {
				2765	int i;
				2766
				2767	/* replace characters */
				2768	if (!findchar(self->str, self->length, str1->str[0])) {
				2769	/* nothing to replace, return original string */
				2770	Py_INCREF(self);
				2771	u = self;
				2772	} else {
				2773	Py_UNICODE u1 = str1->str[0];
				2774	Py_UNICODE u2 = str2->str[0];
				2775
				2776	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2777	self->str,
				2778	self->length
				2779	);
				2780	if (u)
				2781	for (i = 0; i < u->length; i++)
				2782	if (u->str[i] == u1) {
				2783	if (--maxcount < 0)
				2784	break;
				2785	u->str[i] = u2;
				2786	}
				2787	}
				2788
				2789	} else {
				2790	int n, i;
				2791	Py_UNICODE *p;
				2792
				2793	/* replace strings */
				2794	n = count(self, 0, self->length, str1);
				2795	if (n > maxcount)
				2796	n = maxcount;
				2797	if (n == 0) {
				2798	/* nothing to replace, return original string */
				2799	Py_INCREF(self);
				2800	u = self;
				2801	} else {
				2802	u = _PyUnicode_New(
				2803	self->length + n * (str2->length - str1->length));
				2804	if (u) {
				2805	i = 0;
				2806	p = u->str;
				2807	while (i <= self->length - str1->length)
				2808	if (Py_UNICODE_MATCH(self, i, str1)) {
				2809	/* replace string segment */
				2810	Py_UNICODE_COPY(p, str2->str, str2->length);
				2811	p += str2->length;
				2812	i += str1->length;
				2813	if (--n <= 0) {
				2814	/* copy remaining part */
				2815	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2816	break;
				2817	}
				2818	} else
				2819	*p++ = self->str[i++];
				2820	}
				2821	}
				2822	}
				2823
				2824	return (PyObject *) u;
				2825	}
				2826
				2827	/* --- Unicode Object Methods --------------------------------------------- */
				2828
				2829	static char title__doc__[] =
				2830	"S.title() -> unicode\n\
				2831	\n\
				2832	Return a titlecased version of S, i.e. words start with title case\n\
				2833	characters, all remaining cased characters have lower case.";
				2834
				2835	static PyObject*
				2836	unicode_title(PyUnicodeObject self, PyObject args)
				2837	{
				2838	if (!PyArg_NoArgs(args))
				2839	return NULL;
				2840	return fixup(self, fixtitle);
				2841	}
				2842
				2843	static char capitalize__doc__[] =
				2844	"S.capitalize() -> unicode\n\
				2845	\n\
				2846	Return a capitalized version of S, i.e. make the first character\n\
				2847	have upper case.";
				2848
				2849	static PyObject*
				2850	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2851	{
				2852	if (!PyArg_NoArgs(args))
				2853	return NULL;
				2854	return fixup(self, fixcapitalize);
				2855	}
				2856
				2857	#if 0
				2858	static char capwords__doc__[] =
				2859	"S.capwords() -> unicode\n\
				2860	\n\
				2861	Apply .capitalize() to all words in S and return the result with\n\
				2862	normalized whitespace (all whitespace strings are replaced by ' ').";
				2863
				2864	static PyObject*
				2865	unicode_capwords(PyUnicodeObject self, PyObject args)
				2866	{
				2867	PyObject *list;
				2868	PyObject *item;
				2869	int i;
				2870
				2871	if (!PyArg_NoArgs(args))
				2872	return NULL;
				2873
				2874	/* Split into words */
				2875	list = split(self, NULL, -1);
				2876	if (!list)
				2877	return NULL;
				2878
				2879	/* Capitalize each word */
				2880	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2881	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2882	fixcapitalize);
				2883	if (item == NULL)
				2884	goto onError;
				2885	Py_DECREF(PyList_GET_ITEM(list, i));
				2886	PyList_SET_ITEM(list, i, item);
				2887	}
				2888
				2889	/* Join the words to form a new string */
				2890	item = PyUnicode_Join(NULL, list);
				2891
				2892	onError:
				2893	Py_DECREF(list);
				2894	return (PyObject *)item;
				2895	}
				2896	#endif
				2897
				2898	static char center__doc__[] =
				2899	"S.center(width) -> unicode\n\
				2900	\n\
				2901	Return S centered in a Unicode string of length width. Padding is done\n\
				2902	using spaces.";
				2903
				2904	static PyObject *
				2905	unicode_center(PyUnicodeObject self, PyObject args)
				2906	{
				2907	int marg, left;
				2908	int width;
				2909
				2910	if (!PyArg_ParseTuple(args, "i:center", &width))
				2911	return NULL;
				2912
				2913	if (self->length >= width) {
				2914	Py_INCREF(self);
				2915	return (PyObject*) self;
				2916	}
				2917
				2918	marg = width - self->length;
				2919	left = marg / 2 + (marg & width & 1);
				2920
				2921	return (PyObject*) pad(self, left, marg - left, ' ');
				2922	}
				2923
				2924	static int
				2925	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2926	{
				2927	int len1, len2;
				2928	Py_UNICODE *s1 = str1->str;
				2929	Py_UNICODE *s2 = str2->str;
				2930
				2931	len1 = str1->length;
				2932	len2 = str2->length;
				2933
				2934	while (len1 > 0 && len2 > 0) {
				2935	int cmp = (s1++) - (s2++);
				2936	if (cmp)
				2937	/* This should make Christian happy! */
				2938	return (cmp < 0) ? -1 : (cmp != 0);
				2939	len1--, len2--;
				2940	}
				2941
				2942	return (len1 < len2) ? -1 : (len1 != len2);
				2943	}
				2944
				2945	int PyUnicode_Compare(PyObject *left,
				2946	PyObject *right)
				2947	{
				2948	PyUnicodeObject u = NULL, v = NULL;
				2949	int result;
				2950
				2951	/* Coerce the two arguments */
				2952	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2953	if (u == NULL)
				2954	goto onError;
				2955	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2956	if (v == NULL)
				2957	goto onError;
				2958
				2959	/* Shortcut for emtpy or interned objects */
				2960	if (v == u) {
				2961	Py_DECREF(u);
				2962	Py_DECREF(v);
				2963	return 0;
				2964	}
				2965
				2966	result = unicode_compare(u, v);
				2967
				2968	Py_DECREF(u);
				2969	Py_DECREF(v);
				2970	return result;
				2971
				2972	onError:
				2973	Py_XDECREF(u);
				2974	Py_XDECREF(v);
				2975	return -1;
				2976	}
				2977
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2978	int PyUnicode_Contains(PyObject *container,
				2979	PyObject *element)
				2980	{
				2981	PyUnicodeObject u = NULL, v = NULL;
				2982	int result;
				2983	register const Py_UNICODE p, e;
				2984	register Py_UNICODE ch;
				2985
				2986	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2987	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2988	if (v == NULL)
				2989	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2990	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2991	if (u == NULL) {
				2992	Py_DECREF(v);
				2993	goto onError;
				2994	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2995
				2996	/* Check v in u */
				2997	if (PyUnicode_GET_SIZE(v) != 1) {
				2998	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame^]	2999	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3000	goto onError;
				3001	}
				3002	ch = *PyUnicode_AS_UNICODE(v);
				3003	p = PyUnicode_AS_UNICODE(u);
				3004	e = p + PyUnicode_GET_SIZE(u);
				3005	result = 0;
				3006	while (p < e) {
				3007	if (*p++ == ch) {
				3008	result = 1;
				3009	break;
				3010	}
				3011	}
				3012
				3013	Py_DECREF(u);
				3014	Py_DECREF(v);
				3015	return result;
				3016
				3017	onError:
				3018	Py_XDECREF(u);
				3019	Py_XDECREF(v);
				3020	return -1;
				3021	}
				3022
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3023	/* Concat to string or Unicode object giving a new Unicode object. */
				3024
				3025	PyObject PyUnicode_Concat(PyObject left,
				3026	PyObject *right)
				3027	{
				3028	PyUnicodeObject u = NULL, v = NULL, *w;
				3029
				3030	/* Coerce the two arguments */
				3031	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3032	if (u == NULL)
				3033	goto onError;
				3034	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3035	if (v == NULL)
				3036	goto onError;
				3037
				3038	/* Shortcuts */
				3039	if (v == unicode_empty) {
				3040	Py_DECREF(v);
				3041	return (PyObject *)u;
				3042	}
				3043	if (u == unicode_empty) {
				3044	Py_DECREF(u);
				3045	return (PyObject *)v;
				3046	}
				3047
				3048	/* Concat the two Unicode strings */
				3049	w = _PyUnicode_New(u->length + v->length);
				3050	if (w == NULL)
				3051	goto onError;
				3052	Py_UNICODE_COPY(w->str, u->str, u->length);
				3053	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3054
				3055	Py_DECREF(u);
				3056	Py_DECREF(v);
				3057	return (PyObject *)w;
				3058
				3059	onError:
				3060	Py_XDECREF(u);
				3061	Py_XDECREF(v);
				3062	return NULL;
				3063	}
				3064
				3065	static char count__doc__[] =
				3066	"S.count(sub[, start[, end]]) -> int\n\
				3067	\n\
				3068	Return the number of occurrences of substring sub in Unicode string\n\
				3069	S[start:end]. Optional arguments start and end are\n\
				3070	interpreted as in slice notation.";
				3071
				3072	static PyObject *
				3073	unicode_count(PyUnicodeObject self, PyObject args)
				3074	{
				3075	PyUnicodeObject *substring;
				3076	int start = 0;
				3077	int end = INT_MAX;
				3078	PyObject *result;
				3079
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3080	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3081	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3082	return NULL;
				3083
				3084	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3085	(PyObject *)substring);
				3086	if (substring == NULL)
				3087	return NULL;
				3088
				3089	if (substring->length == 0) {
				3090	Py_DECREF(substring);
				3091	return PyInt_FromLong((long) 0);
				3092	}
				3093
				3094	if (start < 0)
				3095	start += self->length;
				3096	if (start < 0)
				3097	start = 0;
				3098	if (end > self->length)
				3099	end = self->length;
				3100	if (end < 0)
				3101	end += self->length;
				3102	if (end < 0)
				3103	end = 0;
				3104
				3105	result = PyInt_FromLong((long) count(self, start, end, substring));
				3106
				3107	Py_DECREF(substring);
				3108	return result;
				3109	}
				3110
				3111	static char encode__doc__[] =
				3112	"S.encode([encoding[,errors]]) -> string\n\
				3113	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3114	Return an encoded string version of S. Default encoding is the current\n\
				3115	default string encoding. errors may be given to set a different error\n\
				3116	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3117	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3118
				3119	static PyObject *
				3120	unicode_encode(PyUnicodeObject self, PyObject args)
				3121	{
				3122	char *encoding = NULL;
				3123	char *errors = NULL;
				3124	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3125	return NULL;
				3126	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3127	}
				3128
				3129	static char expandtabs__doc__[] =
				3130	"S.expandtabs([tabsize]) -> unicode\n\
				3131	\n\
				3132	Return a copy of S where all tab characters are expanded using spaces.\n\
				3133	If tabsize is not given, a tab size of 8 characters is assumed.";
				3134
				3135	static PyObject*
				3136	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3137	{
				3138	Py_UNICODE *e;
				3139	Py_UNICODE *p;
				3140	Py_UNICODE *q;
				3141	int i, j;
				3142	PyUnicodeObject *u;
				3143	int tabsize = 8;
				3144
				3145	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3146	return NULL;
				3147
				3148	/* First pass: determine size of ouput string */
				3149	i = j = 0;
				3150	e = self->str + self->length;
				3151	for (p = self->str; p < e; p++)
				3152	if (*p == '\t') {
				3153	if (tabsize > 0)
				3154	j += tabsize - (j % tabsize);
				3155	}
				3156	else {
				3157	j++;
				3158	if (p == '\n' \|\| p == '\r') {
				3159	i += j;
				3160	j = 0;
				3161	}
				3162	}
				3163
				3164	/* Second pass: create output string and fill it */
				3165	u = _PyUnicode_New(i + j);
				3166	if (!u)
				3167	return NULL;
				3168
				3169	j = 0;
				3170	q = u->str;
				3171
				3172	for (p = self->str; p < e; p++)
				3173	if (*p == '\t') {
				3174	if (tabsize > 0) {
				3175	i = tabsize - (j % tabsize);
				3176	j += i;
				3177	while (i--)
				3178	*q++ = ' ';
				3179	}
				3180	}
				3181	else {
				3182	j++;
				3183	q++ = p;
				3184	if (p == '\n' \|\| p == '\r')
				3185	j = 0;
				3186	}
				3187
				3188	return (PyObject*) u;
				3189	}
				3190
				3191	static char find__doc__[] =
				3192	"S.find(sub [,start [,end]]) -> int\n\
				3193	\n\
				3194	Return the lowest index in S where substring sub is found,\n\
				3195	such that sub is contained within s[start,end]. Optional\n\
				3196	arguments start and end are interpreted as in slice notation.\n\
				3197	\n\
				3198	Return -1 on failure.";
				3199
				3200	static PyObject *
				3201	unicode_find(PyUnicodeObject self, PyObject args)
				3202	{
				3203	PyUnicodeObject *substring;
				3204	int start = 0;
				3205	int end = INT_MAX;
				3206	PyObject *result;
				3207
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3208	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3209	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3210	return NULL;
				3211	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3212	(PyObject *)substring);
				3213	if (substring == NULL)
				3214	return NULL;
				3215
				3216	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3217
				3218	Py_DECREF(substring);
				3219	return result;
				3220	}
				3221
				3222	static PyObject *
				3223	unicode_getitem(PyUnicodeObject *self, int index)
				3224	{
				3225	if (index < 0 \|\| index >= self->length) {
				3226	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3227	return NULL;
				3228	}
				3229
				3230	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3231	}
				3232
				3233	static long
				3234	unicode_hash(PyUnicodeObject *self)
				3235	{
				3236	long hash;
				3237	PyObject *utf8;
				3238
				3239	/* Since Unicode objects compare equal to their UTF-8 string
				3240	counterparts, they should also use the UTF-8 strings as basis
				3241	for their hash value. This is needed to assure that strings and
				3242	Unicode objects behave in the same way as dictionary
				3243	keys. Unfortunately, this costs some performance and also some
				3244	memory if the cached UTF-8 representation is not used later
				3245	on. */
				3246	if (self->hash != -1)
				3247	return self->hash;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	3248	utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3249	if (utf8 == NULL)
				3250	return -1;
				3251	hash = PyObject_Hash(utf8);
				3252	if (hash == -1)
				3253	return -1;
				3254	self->hash = hash;
				3255	return hash;
				3256	}
				3257
				3258	static char index__doc__[] =
				3259	"S.index(sub [,start [,end]]) -> int\n\
				3260	\n\
				3261	Like S.find() but raise ValueError when the substring is not found.";
				3262
				3263	static PyObject *
				3264	unicode_index(PyUnicodeObject self, PyObject args)
				3265	{
				3266	int result;
				3267	PyUnicodeObject *substring;
				3268	int start = 0;
				3269	int end = INT_MAX;
				3270
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3271	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3272	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3273	return NULL;
				3274
				3275	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3276	(PyObject *)substring);
				3277	if (substring == NULL)
				3278	return NULL;
				3279
				3280	result = findstring(self, substring, start, end, 1);
				3281
				3282	Py_DECREF(substring);
				3283	if (result < 0) {
				3284	PyErr_SetString(PyExc_ValueError, "substring not found");
				3285	return NULL;
				3286	}
				3287	return PyInt_FromLong(result);
				3288	}
				3289
				3290	static char islower__doc__[] =
				3291	"S.islower() -> int\n\
				3292	\n\
				3293	Return 1 if all cased characters in S are lowercase and there is\n\
				3294	at least one cased character in S, 0 otherwise.";
				3295
				3296	static PyObject*
				3297	unicode_islower(PyUnicodeObject self, PyObject args)
				3298	{
				3299	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3300	register const Py_UNICODE *e;
				3301	int cased;
				3302
				3303	if (!PyArg_NoArgs(args))
				3304	return NULL;
				3305
				3306	/* Shortcut for single character strings */
				3307	if (PyUnicode_GET_SIZE(self) == 1)
				3308	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3309
				3310	e = p + PyUnicode_GET_SIZE(self);
				3311	cased = 0;
				3312	for (; p < e; p++) {
				3313	register const Py_UNICODE ch = *p;
				3314
				3315	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3316	return PyInt_FromLong(0);
				3317	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3318	cased = 1;
				3319	}
				3320	return PyInt_FromLong(cased);
				3321	}
				3322
				3323	static char isupper__doc__[] =
				3324	"S.isupper() -> int\n\
				3325	\n\
				3326	Return 1 if all cased characters in S are uppercase and there is\n\
				3327	at least one cased character in S, 0 otherwise.";
				3328
				3329	static PyObject*
				3330	unicode_isupper(PyUnicodeObject self, PyObject args)
				3331	{
				3332	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3333	register const Py_UNICODE *e;
				3334	int cased;
				3335
				3336	if (!PyArg_NoArgs(args))
				3337	return NULL;
				3338
				3339	/* Shortcut for single character strings */
				3340	if (PyUnicode_GET_SIZE(self) == 1)
				3341	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3342
				3343	e = p + PyUnicode_GET_SIZE(self);
				3344	cased = 0;
				3345	for (; p < e; p++) {
				3346	register const Py_UNICODE ch = *p;
				3347
				3348	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3349	return PyInt_FromLong(0);
				3350	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3351	cased = 1;
				3352	}
				3353	return PyInt_FromLong(cased);
				3354	}
				3355
				3356	static char istitle__doc__[] =
				3357	"S.istitle() -> int\n\
				3358	\n\
				3359	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3360	may only follow uncased characters and lowercase characters only cased\n\
				3361	ones. Return 0 otherwise.";
				3362
				3363	static PyObject*
				3364	unicode_istitle(PyUnicodeObject self, PyObject args)
				3365	{
				3366	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3367	register const Py_UNICODE *e;
				3368	int cased, previous_is_cased;
				3369
				3370	if (!PyArg_NoArgs(args))
				3371	return NULL;
				3372
				3373	/* Shortcut for single character strings */
				3374	if (PyUnicode_GET_SIZE(self) == 1)
				3375	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3376	(Py_UNICODE_ISUPPER(*p) != 0));
				3377
				3378	e = p + PyUnicode_GET_SIZE(self);
				3379	cased = 0;
				3380	previous_is_cased = 0;
				3381	for (; p < e; p++) {
				3382	register const Py_UNICODE ch = *p;
				3383
				3384	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3385	if (previous_is_cased)
				3386	return PyInt_FromLong(0);
				3387	previous_is_cased = 1;
				3388	cased = 1;
				3389	}
				3390	else if (Py_UNICODE_ISLOWER(ch)) {
				3391	if (!previous_is_cased)
				3392	return PyInt_FromLong(0);
				3393	previous_is_cased = 1;
				3394	cased = 1;
				3395	}
				3396	else
				3397	previous_is_cased = 0;
				3398	}
				3399	return PyInt_FromLong(cased);
				3400	}
				3401
				3402	static char isspace__doc__[] =
				3403	"S.isspace() -> int\n\
				3404	\n\
				3405	Return 1 if there are only whitespace characters in S,\n\
				3406	0 otherwise.";
				3407
				3408	static PyObject*
				3409	unicode_isspace(PyUnicodeObject self, PyObject args)
				3410	{
				3411	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3412	register const Py_UNICODE *e;
				3413
				3414	if (!PyArg_NoArgs(args))
				3415	return NULL;
				3416
				3417	/* Shortcut for single character strings */
				3418	if (PyUnicode_GET_SIZE(self) == 1 &&
				3419	Py_UNICODE_ISSPACE(*p))
				3420	return PyInt_FromLong(1);
				3421
				3422	e = p + PyUnicode_GET_SIZE(self);
				3423	for (; p < e; p++) {
				3424	if (!Py_UNICODE_ISSPACE(*p))
				3425	return PyInt_FromLong(0);
				3426	}
				3427	return PyInt_FromLong(1);
				3428	}
				3429
				3430	static char isdecimal__doc__[] =
				3431	"S.isdecimal() -> int\n\
				3432	\n\
				3433	Return 1 if there are only decimal characters in S,\n\
				3434	0 otherwise.";
				3435
				3436	static PyObject*
				3437	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3438	{
				3439	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3440	register const Py_UNICODE *e;
				3441
				3442	if (!PyArg_NoArgs(args))
				3443	return NULL;
				3444
				3445	/* Shortcut for single character strings */
				3446	if (PyUnicode_GET_SIZE(self) == 1 &&
				3447	Py_UNICODE_ISDECIMAL(*p))
				3448	return PyInt_FromLong(1);
				3449
				3450	e = p + PyUnicode_GET_SIZE(self);
				3451	for (; p < e; p++) {
				3452	if (!Py_UNICODE_ISDECIMAL(*p))
				3453	return PyInt_FromLong(0);
				3454	}
				3455	return PyInt_FromLong(1);
				3456	}
				3457
				3458	static char isdigit__doc__[] =
				3459	"S.isdigit() -> int\n\
				3460	\n\
				3461	Return 1 if there are only digit characters in S,\n\
				3462	0 otherwise.";
				3463
				3464	static PyObject*
				3465	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3466	{
				3467	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3468	register const Py_UNICODE *e;
				3469
				3470	if (!PyArg_NoArgs(args))
				3471	return NULL;
				3472
				3473	/* Shortcut for single character strings */
				3474	if (PyUnicode_GET_SIZE(self) == 1 &&
				3475	Py_UNICODE_ISDIGIT(*p))
				3476	return PyInt_FromLong(1);
				3477
				3478	e = p + PyUnicode_GET_SIZE(self);
				3479	for (; p < e; p++) {
				3480	if (!Py_UNICODE_ISDIGIT(*p))
				3481	return PyInt_FromLong(0);
				3482	}
				3483	return PyInt_FromLong(1);
				3484	}
				3485
				3486	static char isnumeric__doc__[] =
				3487	"S.isnumeric() -> int\n\
				3488	\n\
				3489	Return 1 if there are only numeric characters in S,\n\
				3490	0 otherwise.";
				3491
				3492	static PyObject*
				3493	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3494	{
				3495	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3496	register const Py_UNICODE *e;
				3497
				3498	if (!PyArg_NoArgs(args))
				3499	return NULL;
				3500
				3501	/* Shortcut for single character strings */
				3502	if (PyUnicode_GET_SIZE(self) == 1 &&
				3503	Py_UNICODE_ISNUMERIC(*p))
				3504	return PyInt_FromLong(1);
				3505
				3506	e = p + PyUnicode_GET_SIZE(self);
				3507	for (; p < e; p++) {
				3508	if (!Py_UNICODE_ISNUMERIC(*p))
				3509	return PyInt_FromLong(0);
				3510	}
				3511	return PyInt_FromLong(1);
				3512	}
				3513
				3514	static char join__doc__[] =
				3515	"S.join(sequence) -> unicode\n\
				3516	\n\
				3517	Return a string which is the concatenation of the strings in the\n\
				3518	sequence. The separator between elements is S.";
				3519
				3520	static PyObject*
				3521	unicode_join(PyUnicodeObject self, PyObject args)
				3522	{
				3523	PyObject *data;
				3524	if (!PyArg_ParseTuple(args, "O:join", &data))
				3525	return NULL;
				3526
				3527	return PyUnicode_Join((PyObject *)self, data);
				3528	}
				3529
				3530	static int
				3531	unicode_length(PyUnicodeObject *self)
				3532	{
				3533	return self->length;
				3534	}
				3535
				3536	static char ljust__doc__[] =
				3537	"S.ljust(width) -> unicode\n\
				3538	\n\
				3539	Return S left justified in a Unicode string of length width. Padding is\n\
				3540	done using spaces.";
				3541
				3542	static PyObject *
				3543	unicode_ljust(PyUnicodeObject self, PyObject args)
				3544	{
				3545	int width;
				3546	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3547	return NULL;
				3548
				3549	if (self->length >= width) {
				3550	Py_INCREF(self);
				3551	return (PyObject*) self;
				3552	}
				3553
				3554	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3555	}
				3556
				3557	static char lower__doc__[] =
				3558	"S.lower() -> unicode\n\
				3559	\n\
				3560	Return a copy of the string S converted to lowercase.";
				3561
				3562	static PyObject*
				3563	unicode_lower(PyUnicodeObject self, PyObject args)
				3564	{
				3565	if (!PyArg_NoArgs(args))
				3566	return NULL;
				3567	return fixup(self, fixlower);
				3568	}
				3569
				3570	static char lstrip__doc__[] =
				3571	"S.lstrip() -> unicode\n\
				3572	\n\
				3573	Return a copy of the string S with leading whitespace removed.";
				3574
				3575	static PyObject *
				3576	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3577	{
				3578	if (!PyArg_NoArgs(args))
				3579	return NULL;
				3580	return strip(self, 1, 0);
				3581	}
				3582
				3583	static PyObject*
				3584	unicode_repeat(PyUnicodeObject *str, int len)
				3585	{
				3586	PyUnicodeObject *u;
				3587	Py_UNICODE *p;
				3588
				3589	if (len < 0)
				3590	len = 0;
				3591
				3592	if (len == 1) {
				3593	/* no repeat, return original string */
				3594	Py_INCREF(str);
				3595	return (PyObject*) str;
				3596	}
				3597
				3598	u = _PyUnicode_New(len * str->length);
				3599	if (!u)
				3600	return NULL;
				3601
				3602	p = u->str;
				3603
				3604	while (len-- > 0) {
				3605	Py_UNICODE_COPY(p, str->str, str->length);
				3606	p += str->length;
				3607	}
				3608
				3609	return (PyObject*) u;
				3610	}
				3611
				3612	PyObject PyUnicode_Replace(PyObject obj,
				3613	PyObject *subobj,
				3614	PyObject *replobj,
				3615	int maxcount)
				3616	{
				3617	PyObject *self;
				3618	PyObject *str1;
				3619	PyObject *str2;
				3620	PyObject *result;
				3621
				3622	self = PyUnicode_FromObject(obj);
				3623	if (self == NULL)
				3624	return NULL;
				3625	str1 = PyUnicode_FromObject(subobj);
				3626	if (str1 == NULL) {
				3627	Py_DECREF(self);
				3628	return NULL;
				3629	}
				3630	str2 = PyUnicode_FromObject(replobj);
				3631	if (str2 == NULL) {
				3632	Py_DECREF(self);
				3633	Py_DECREF(str1);
				3634	return NULL;
				3635	}
				3636	result = replace((PyUnicodeObject *)self,
				3637	(PyUnicodeObject *)str1,
				3638	(PyUnicodeObject *)str2,
				3639	maxcount);
				3640	Py_DECREF(self);
				3641	Py_DECREF(str1);
				3642	Py_DECREF(str2);
				3643	return result;
				3644	}
				3645
				3646	static char replace__doc__[] =
				3647	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3648	\n\
				3649	Return a copy of S with all occurrences of substring\n\
				3650	old replaced by new. If the optional argument maxsplit is\n\
				3651	given, only the first maxsplit occurrences are replaced.";
				3652
				3653	static PyObject*
				3654	unicode_replace(PyUnicodeObject self, PyObject args)
				3655	{
				3656	PyUnicodeObject *str1;
				3657	PyUnicodeObject *str2;
				3658	int maxcount = -1;
				3659	PyObject *result;
				3660
				3661	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3662	return NULL;
				3663	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3664	if (str1 == NULL)
				3665	return NULL;
				3666	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3667	if (str2 == NULL)
				3668	return NULL;
				3669
				3670	result = replace(self, str1, str2, maxcount);
				3671
				3672	Py_DECREF(str1);
				3673	Py_DECREF(str2);
				3674	return result;
				3675	}
				3676
				3677	static
				3678	PyObject unicode_repr(PyObject unicode)
				3679	{
				3680	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3681	PyUnicode_GET_SIZE(unicode),
				3682	1);
				3683	}
				3684
				3685	static char rfind__doc__[] =
				3686	"S.rfind(sub [,start [,end]]) -> int\n\
				3687	\n\
				3688	Return the highest index in S where substring sub is found,\n\
				3689	such that sub is contained within s[start,end]. Optional\n\
				3690	arguments start and end are interpreted as in slice notation.\n\
				3691	\n\
				3692	Return -1 on failure.";
				3693
				3694	static PyObject *
				3695	unicode_rfind(PyUnicodeObject self, PyObject args)
				3696	{
				3697	PyUnicodeObject *substring;
				3698	int start = 0;
				3699	int end = INT_MAX;
				3700	PyObject *result;
				3701
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3702	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				3703	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3704	return NULL;
				3705	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3706	(PyObject *)substring);
				3707	if (substring == NULL)
				3708	return NULL;
				3709
				3710	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3711
				3712	Py_DECREF(substring);
				3713	return result;
				3714	}
				3715
				3716	static char rindex__doc__[] =
				3717	"S.rindex(sub [,start [,end]]) -> int\n\
				3718	\n\
				3719	Like S.rfind() but raise ValueError when the substring is not found.";
				3720
				3721	static PyObject *
				3722	unicode_rindex(PyUnicodeObject self, PyObject args)
				3723	{
				3724	int result;
				3725	PyUnicodeObject *substring;
				3726	int start = 0;
				3727	int end = INT_MAX;
				3728
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3729	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				3730	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3731	return NULL;
				3732	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3733	(PyObject *)substring);
				3734	if (substring == NULL)
				3735	return NULL;
				3736
				3737	result = findstring(self, substring, start, end, -1);
				3738
				3739	Py_DECREF(substring);
				3740	if (result < 0) {
				3741	PyErr_SetString(PyExc_ValueError, "substring not found");
				3742	return NULL;
				3743	}
				3744	return PyInt_FromLong(result);
				3745	}
				3746
				3747	static char rjust__doc__[] =
				3748	"S.rjust(width) -> unicode\n\
				3749	\n\
				3750	Return S right justified in a Unicode string of length width. Padding is\n\
				3751	done using spaces.";
				3752
				3753	static PyObject *
				3754	unicode_rjust(PyUnicodeObject self, PyObject args)
				3755	{
				3756	int width;
				3757	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3758	return NULL;
				3759
				3760	if (self->length >= width) {
				3761	Py_INCREF(self);
				3762	return (PyObject*) self;
				3763	}
				3764
				3765	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3766	}
				3767
				3768	static char rstrip__doc__[] =
				3769	"S.rstrip() -> unicode\n\
				3770	\n\
				3771	Return a copy of the string S with trailing whitespace removed.";
				3772
				3773	static PyObject *
				3774	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3775	{
				3776	if (!PyArg_NoArgs(args))
				3777	return NULL;
				3778	return strip(self, 0, 1);
				3779	}
				3780
				3781	static PyObject*
				3782	unicode_slice(PyUnicodeObject *self, int start, int end)
				3783	{
				3784	/* standard clamping */
				3785	if (start < 0)
				3786	start = 0;
				3787	if (end < 0)
				3788	end = 0;
				3789	if (end > self->length)
				3790	end = self->length;
				3791	if (start == 0 && end == self->length) {
				3792	/* full slice, return original string */
				3793	Py_INCREF(self);
				3794	return (PyObject*) self;
				3795	}
				3796	if (start > end)
				3797	start = end;
				3798	/* copy slice */
				3799	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3800	end - start);
				3801	}
				3802
				3803	PyObject PyUnicode_Split(PyObject s,
				3804	PyObject *sep,
				3805	int maxsplit)
				3806	{
				3807	PyObject *result;
				3808
				3809	s = PyUnicode_FromObject(s);
				3810	if (s == NULL)
				3811	return NULL;
				3812	if (sep != NULL) {
				3813	sep = PyUnicode_FromObject(sep);
				3814	if (sep == NULL) {
				3815	Py_DECREF(s);
				3816	return NULL;
				3817	}
				3818	}
				3819
				3820	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3821
				3822	Py_DECREF(s);
				3823	Py_XDECREF(sep);
				3824	return result;
				3825	}
				3826
				3827	static char split__doc__[] =
				3828	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3829	\n\
				3830	Return a list of the words in S, using sep as the\n\
				3831	delimiter string. If maxsplit is given, at most maxsplit\n\
				3832	splits are done. If sep is not specified, any whitespace string\n\
				3833	is a separator.";
				3834
				3835	static PyObject*
				3836	unicode_split(PyUnicodeObject self, PyObject args)
				3837	{
				3838	PyObject *substring = Py_None;
				3839	int maxcount = -1;
				3840
				3841	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3842	return NULL;
				3843
				3844	if (substring == Py_None)
				3845	return split(self, NULL, maxcount);
				3846	else if (PyUnicode_Check(substring))
				3847	return split(self, (PyUnicodeObject *)substring, maxcount);
				3848	else
				3849	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3850	}
				3851
				3852	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3853	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3854	\n\
				3855	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3856	Line breaks are not included in the resulting list unless keepends\n\
				3857	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3858
				3859	static PyObject*
				3860	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3861	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3862	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3863
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3864	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3865	return NULL;
				3866
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3867	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3868	}
				3869
				3870	static
				3871	PyObject unicode_str(PyUnicodeObject self)
				3872	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3873	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3874	}
				3875
				3876	static char strip__doc__[] =
				3877	"S.strip() -> unicode\n\
				3878	\n\
				3879	Return a copy of S with leading and trailing whitespace removed.";
				3880
				3881	static PyObject *
				3882	unicode_strip(PyUnicodeObject self, PyObject args)
				3883	{
				3884	if (!PyArg_NoArgs(args))
				3885	return NULL;
				3886	return strip(self, 1, 1);
				3887	}
				3888
				3889	static char swapcase__doc__[] =
				3890	"S.swapcase() -> unicode\n\
				3891	\n\
				3892	Return a copy of S with uppercase characters converted to lowercase\n\
				3893	and vice versa.";
				3894
				3895	static PyObject*
				3896	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3897	{
				3898	if (!PyArg_NoArgs(args))
				3899	return NULL;
				3900	return fixup(self, fixswapcase);
				3901	}
				3902
				3903	static char translate__doc__[] =
				3904	"S.translate(table) -> unicode\n\
				3905	\n\
				3906	Return a copy of the string S, where all characters have been mapped\n\
				3907	through the given translation table, which must be a mapping of\n\
				3908	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3909	are left untouched. Characters mapped to None are deleted.";
				3910
				3911	static PyObject*
				3912	unicode_translate(PyUnicodeObject self, PyObject args)
				3913	{
				3914	PyObject *table;
				3915
				3916	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3917	return NULL;
				3918	return PyUnicode_TranslateCharmap(self->str,
				3919	self->length,
				3920	table,
				3921	"ignore");
				3922	}
				3923
				3924	static char upper__doc__[] =
				3925	"S.upper() -> unicode\n\
				3926	\n\
				3927	Return a copy of S converted to uppercase.";
				3928
				3929	static PyObject*
				3930	unicode_upper(PyUnicodeObject self, PyObject args)
				3931	{
				3932	if (!PyArg_NoArgs(args))
				3933	return NULL;
				3934	return fixup(self, fixupper);
				3935	}
				3936
				3937	#if 0
				3938	static char zfill__doc__[] =
				3939	"S.zfill(width) -> unicode\n\
				3940	\n\
				3941	Pad a numeric string x with zeros on the left, to fill a field\n\
				3942	of the specified width. The string x is never truncated.";
				3943
				3944	static PyObject *
				3945	unicode_zfill(PyUnicodeObject self, PyObject args)
				3946	{
				3947	int fill;
				3948	PyUnicodeObject *u;
				3949
				3950	int width;
				3951	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				3952	return NULL;
				3953
				3954	if (self->length >= width) {
				3955	Py_INCREF(self);
				3956	return (PyObject*) self;
				3957	}
				3958
				3959	fill = width - self->length;
				3960
				3961	u = pad(self, fill, 0, '0');
				3962
				3963	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				3964	/* move sign to beginning of string */
				3965	u->str[0] = u->str[fill];
				3966	u->str[fill] = '0';
				3967	}
				3968
				3969	return (PyObject*) u;
				3970	}
				3971	#endif
				3972
				3973	#if 0
				3974	static PyObject*
				3975	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				3976	{
				3977	if (!PyArg_NoArgs(args))
				3978	return NULL;
				3979	return PyInt_FromLong(unicode_freelist_size);
				3980	}
				3981	#endif
				3982
				3983	static char startswith__doc__[] =
				3984	"S.startswith(prefix[, start[, end]]) -> int\n\
				3985	\n\
				3986	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				3987	optional start, test S beginning at that position. With optional end, stop\n\
				3988	comparing S at that position.";
				3989
				3990	static PyObject *
				3991	unicode_startswith(PyUnicodeObject *self,
				3992	PyObject *args)
				3993	{
				3994	PyUnicodeObject *substring;
				3995	int start = 0;
				3996	int end = INT_MAX;
				3997	PyObject *result;
				3998
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3999	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4000	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4001	return NULL;
				4002	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4003	(PyObject *)substring);
				4004	if (substring == NULL)
				4005	return NULL;
				4006
				4007	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4008
				4009	Py_DECREF(substring);
				4010	return result;
				4011	}
				4012
				4013
				4014	static char endswith__doc__[] =
				4015	"S.endswith(suffix[, start[, end]]) -> int\n\
				4016	\n\
				4017	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4018	optional start, test S beginning at that position. With optional end, stop\n\
				4019	comparing S at that position.";
				4020
				4021	static PyObject *
				4022	unicode_endswith(PyUnicodeObject *self,
				4023	PyObject *args)
				4024	{
				4025	PyUnicodeObject *substring;
				4026	int start = 0;
				4027	int end = INT_MAX;
				4028	PyObject *result;
				4029
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4030	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4031	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4032	return NULL;
				4033	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4034	(PyObject *)substring);
				4035	if (substring == NULL)
				4036	return NULL;
				4037
				4038	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4039
				4040	Py_DECREF(substring);
				4041	return result;
				4042	}
				4043
				4044
				4045	static PyMethodDef unicode_methods[] = {
				4046
				4047	/* Order is according to common usage: often used methods should
				4048	appear first, since lookup is done sequentially. */
				4049
				4050	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4051	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4052	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4053	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4054	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4055	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4056	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4057	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4058	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4059	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4060	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4061	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4062	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4063	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4064	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4065	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4066	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4067	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4068	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4069	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4070	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4071	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4072	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4073	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4074	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4075	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4076	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4077	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4078	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4079	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4080	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4081	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4082	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				4083	#if 0
				4084	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4085	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4086	#endif
				4087
				4088	#if 0
				4089	/* This one is just used for debugging the implementation. */
				4090	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4091	#endif
				4092
				4093	{NULL, NULL}
				4094	};
				4095
				4096	static PyObject *
				4097	unicode_getattr(PyUnicodeObject self, char name)
				4098	{
				4099	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4100	}
				4101
				4102	static PySequenceMethods unicode_as_sequence = {
				4103	(inquiry) unicode_length, /* sq_length */
				4104	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4105	(intargfunc) unicode_repeat, /* sq_repeat */
				4106	(intargfunc) unicode_getitem, /* sq_item */
				4107	(intintargfunc) unicode_slice, /* sq_slice */
				4108	0, /* sq_ass_item */
				4109	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4110	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4111	};
				4112
				4113	static int
				4114	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4115	int index,
				4116	const void **ptr)
				4117	{
				4118	if (index != 0) {
				4119	PyErr_SetString(PyExc_SystemError,
				4120	"accessing non-existent unicode segment");
				4121	return -1;
				4122	}
				4123	ptr = (void ) self->str;
				4124	return PyUnicode_GET_DATA_SIZE(self);
				4125	}
				4126
				4127	static int
				4128	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4129	const void **ptr)
				4130	{
				4131	PyErr_SetString(PyExc_TypeError,
				4132	"cannot use unicode as modifyable buffer");
				4133	return -1;
				4134	}
				4135
				4136	static int
				4137	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4138	int *lenp)
				4139	{
				4140	if (lenp)
				4141	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4142	return 1;
				4143	}
				4144
				4145	static int
				4146	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4147	int index,
				4148	const void **ptr)
				4149	{
				4150	PyObject *str;
				4151
				4152	if (index != 0) {
				4153	PyErr_SetString(PyExc_SystemError,
				4154	"accessing non-existent unicode segment");
				4155	return -1;
				4156	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4157	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4158	if (str == NULL)
				4159	return -1;
				4160	ptr = (void ) PyString_AS_STRING(str);
				4161	return PyString_GET_SIZE(str);
				4162	}
				4163
				4164	/* Helpers for PyUnicode_Format() */
				4165
				4166	static PyObject *
				4167	getnextarg(args, arglen, p_argidx)
				4168	PyObject *args;
				4169	int arglen;
				4170	int *p_argidx;
				4171	{
				4172	int argidx = *p_argidx;
				4173	if (argidx < arglen) {
				4174	(*p_argidx)++;
				4175	if (arglen < 0)
				4176	return args;
				4177	else
				4178	return PyTuple_GetItem(args, argidx);
				4179	}
				4180	PyErr_SetString(PyExc_TypeError,
				4181	"not enough arguments for format string");
				4182	return NULL;
				4183	}
				4184
				4185	#define F_LJUST (1<<0)
				4186	#define F_SIGN (1<<1)
				4187	#define F_BLANK (1<<2)
				4188	#define F_ALT (1<<3)
				4189	#define F_ZERO (1<<4)
				4190
				4191	static
				4192	#ifdef HAVE_STDARG_PROTOTYPES
				4193	int usprintf(register Py_UNICODE buffer, char format, ...)
				4194	#else
				4195	int usprintf(va_alist) va_dcl
				4196	#endif
				4197	{
				4198	register int i;
				4199	int len;
				4200	va_list va;
				4201	char *charbuffer;
				4202	#ifdef HAVE_STDARG_PROTOTYPES
				4203	va_start(va, format);
				4204	#else
				4205	Py_UNICODE *args;
				4206	char *format;
				4207
				4208	va_start(va);
				4209	buffer = va_arg(va, Py_UNICODE *);
				4210	format = va_arg(va, char *);
				4211	#endif
				4212
				4213	/* First, format the string as char array, then expand to Py_UNICODE
				4214	array. */
				4215	charbuffer = (char *)buffer;
				4216	len = vsprintf(charbuffer, format, va);
				4217	for (i = len - 1; i >= 0; i--)
				4218	buffer[i] = (Py_UNICODE) charbuffer[i];
				4219
				4220	va_end(va);
				4221	return len;
				4222	}
				4223
				4224	static int
				4225	formatfloat(Py_UNICODE *buf,
				4226	int flags,
				4227	int prec,
				4228	int type,
				4229	PyObject *v)
				4230	{
				4231	char fmt[20];
				4232	double x;
				4233
				4234	x = PyFloat_AsDouble(v);
				4235	if (x == -1.0 && PyErr_Occurred())
				4236	return -1;
				4237	if (prec < 0)
				4238	prec = 6;
				4239	if (prec > 50)
				4240	prec = 50; /* Arbitrary limitation */
				4241	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4242	type = 'g';
				4243	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4244	return usprintf(buf, fmt, x);
				4245	}
				4246
				4247	static int
				4248	formatint(Py_UNICODE *buf,
				4249	int flags,
				4250	int prec,
				4251	int type,
				4252	PyObject *v)
				4253	{
				4254	char fmt[20];
				4255	long x;
				4256
				4257	x = PyInt_AsLong(v);
				4258	if (x == -1 && PyErr_Occurred())
				4259	return -1;
				4260	if (prec < 0)
				4261	prec = 1;
				4262	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4263	return usprintf(buf, fmt, x);
				4264	}
				4265
				4266	static int
				4267	formatchar(Py_UNICODE *buf,
				4268	PyObject *v)
				4269	{
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4270	if (PyUnicode_Check(v)) {
				4271	if (PyUnicode_GET_SIZE(v) != 1)
				4272	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4273	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4274	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4275
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4276	else if (PyString_Check(v)) {
				4277	if (PyString_GET_SIZE(v) != 1)
				4278	goto onError;
				4279	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4280	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4281
				4282	else {
				4283	/* Integer input truncated to a character */
				4284	long x;
				4285	x = PyInt_AsLong(v);
				4286	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4287	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4288	buf[0] = (char) x;
				4289	}
				4290	buf[1] = '\0';
				4291	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4292
				4293	onError:
				4294	PyErr_SetString(PyExc_TypeError,
				4295	"%c requires int or char");
				4296	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4297	}
				4298
				4299	PyObject PyUnicode_Format(PyObject format,
				4300	PyObject *args)
				4301	{
				4302	Py_UNICODE fmt, res;
				4303	int fmtcnt, rescnt, reslen, arglen, argidx;
				4304	int args_owned = 0;
				4305	PyUnicodeObject *result = NULL;
				4306	PyObject *dict = NULL;
				4307	PyObject *uformat;
				4308
				4309	if (format == NULL \|\| args == NULL) {
				4310	PyErr_BadInternalCall();
				4311	return NULL;
				4312	}
				4313	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4314	if (uformat == NULL)
				4315	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4316	fmt = PyUnicode_AS_UNICODE(uformat);
				4317	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4318
				4319	reslen = rescnt = fmtcnt + 100;
				4320	result = _PyUnicode_New(reslen);
				4321	if (result == NULL)
				4322	goto onError;
				4323	res = PyUnicode_AS_UNICODE(result);
				4324
				4325	if (PyTuple_Check(args)) {
				4326	arglen = PyTuple_Size(args);
				4327	argidx = 0;
				4328	}
				4329	else {
				4330	arglen = -1;
				4331	argidx = -2;
				4332	}
				4333	if (args->ob_type->tp_as_mapping)
				4334	dict = args;
				4335
				4336	while (--fmtcnt >= 0) {
				4337	if (*fmt != '%') {
				4338	if (--rescnt < 0) {
				4339	rescnt = fmtcnt + 100;
				4340	reslen += rescnt;
				4341	if (_PyUnicode_Resize(result, reslen) < 0)
				4342	return NULL;
				4343	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4344	--rescnt;
				4345	}
				4346	res++ = fmt++;
				4347	}
				4348	else {
				4349	/* Got a format specifier */
				4350	int flags = 0;
				4351	int width = -1;
				4352	int prec = -1;
				4353	int size = 0;
				4354	Py_UNICODE c = '\0';
				4355	Py_UNICODE fill;
				4356	PyObject *v = NULL;
				4357	PyObject *temp = NULL;
				4358	Py_UNICODE *buf;
				4359	Py_UNICODE sign;
				4360	int len;
				4361	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4362
				4363	fmt++;
				4364	if (*fmt == '(') {
				4365	Py_UNICODE *keystart;
				4366	int keylen;
				4367	PyObject *key;
				4368	int pcount = 1;
				4369
				4370	if (dict == NULL) {
				4371	PyErr_SetString(PyExc_TypeError,
				4372	"format requires a mapping");
				4373	goto onError;
				4374	}
				4375	++fmt;
				4376	--fmtcnt;
				4377	keystart = fmt;
				4378	/* Skip over balanced parentheses */
				4379	while (pcount > 0 && --fmtcnt >= 0) {
				4380	if (*fmt == ')')
				4381	--pcount;
				4382	else if (*fmt == '(')
				4383	++pcount;
				4384	fmt++;
				4385	}
				4386	keylen = fmt - keystart - 1;
				4387	if (fmtcnt < 0 \|\| pcount > 0) {
				4388	PyErr_SetString(PyExc_ValueError,
				4389	"incomplete format key");
				4390	goto onError;
				4391	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4392	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4393	then looked up since Python uses strings to hold
				4394	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4395	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4396	key = PyUnicode_EncodeUTF8(keystart,
				4397	keylen,
				4398	NULL);
				4399	if (key == NULL)
				4400	goto onError;
				4401	if (args_owned) {
				4402	Py_DECREF(args);
				4403	args_owned = 0;
				4404	}
				4405	args = PyObject_GetItem(dict, key);
				4406	Py_DECREF(key);
				4407	if (args == NULL) {
				4408	goto onError;
				4409	}
				4410	args_owned = 1;
				4411	arglen = -1;
				4412	argidx = -2;
				4413	}
				4414	while (--fmtcnt >= 0) {
				4415	switch (c = *fmt++) {
				4416	case '-': flags \|= F_LJUST; continue;
				4417	case '+': flags \|= F_SIGN; continue;
				4418	case ' ': flags \|= F_BLANK; continue;
				4419	case '#': flags \|= F_ALT; continue;
				4420	case '0': flags \|= F_ZERO; continue;
				4421	}
				4422	break;
				4423	}
				4424	if (c == '*') {
				4425	v = getnextarg(args, arglen, &argidx);
				4426	if (v == NULL)
				4427	goto onError;
				4428	if (!PyInt_Check(v)) {
				4429	PyErr_SetString(PyExc_TypeError,
				4430	"* wants int");
				4431	goto onError;
				4432	}
				4433	width = PyInt_AsLong(v);
				4434	if (width < 0) {
				4435	flags \|= F_LJUST;
				4436	width = -width;
				4437	}
				4438	if (--fmtcnt >= 0)
				4439	c = *fmt++;
				4440	}
				4441	else if (c >= '0' && c <= '9') {
				4442	width = c - '0';
				4443	while (--fmtcnt >= 0) {
				4444	c = *fmt++;
				4445	if (c < '0' \|\| c > '9')
				4446	break;
				4447	if ((width*10) / 10 != width) {
				4448	PyErr_SetString(PyExc_ValueError,
				4449	"width too big");
				4450	goto onError;
				4451	}
				4452	width = width*10 + (c - '0');
				4453	}
				4454	}
				4455	if (c == '.') {
				4456	prec = 0;
				4457	if (--fmtcnt >= 0)
				4458	c = *fmt++;
				4459	if (c == '*') {
				4460	v = getnextarg(args, arglen, &argidx);
				4461	if (v == NULL)
				4462	goto onError;
				4463	if (!PyInt_Check(v)) {
				4464	PyErr_SetString(PyExc_TypeError,
				4465	"* wants int");
				4466	goto onError;
				4467	}
				4468	prec = PyInt_AsLong(v);
				4469	if (prec < 0)
				4470	prec = 0;
				4471	if (--fmtcnt >= 0)
				4472	c = *fmt++;
				4473	}
				4474	else if (c >= '0' && c <= '9') {
				4475	prec = c - '0';
				4476	while (--fmtcnt >= 0) {
				4477	c = Py_CHARMASK(*fmt++);
				4478	if (c < '0' \|\| c > '9')
				4479	break;
				4480	if ((prec*10) / 10 != prec) {
				4481	PyErr_SetString(PyExc_ValueError,
				4482	"prec too big");
				4483	goto onError;
				4484	}
				4485	prec = prec*10 + (c - '0');
				4486	}
				4487	}
				4488	} /* prec */
				4489	if (fmtcnt >= 0) {
				4490	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4491	size = c;
				4492	if (--fmtcnt >= 0)
				4493	c = *fmt++;
				4494	}
				4495	}
				4496	if (fmtcnt < 0) {
				4497	PyErr_SetString(PyExc_ValueError,
				4498	"incomplete format");
				4499	goto onError;
				4500	}
				4501	if (c != '%') {
				4502	v = getnextarg(args, arglen, &argidx);
				4503	if (v == NULL)
				4504	goto onError;
				4505	}
				4506	sign = 0;
				4507	fill = ' ';
				4508	switch (c) {
				4509
				4510	case '%':
				4511	buf = tmpbuf;
				4512	buf[0] = '%';
				4513	len = 1;
				4514	break;
				4515
				4516	case 's':
				4517	case 'r':
				4518	if (PyUnicode_Check(v) && c == 's') {
				4519	temp = v;
				4520	Py_INCREF(temp);
				4521	}
				4522	else {
				4523	PyObject *unicode;
				4524	if (c == 's')
				4525	temp = PyObject_Str(v);
				4526	else
				4527	temp = PyObject_Repr(v);
				4528	if (temp == NULL)
				4529	goto onError;
				4530	if (!PyString_Check(temp)) {
				4531	/* XXX Note: this should never happen, since
				4532	PyObject_Repr() and PyObject_Str() assure
				4533	this */
				4534	Py_DECREF(temp);
				4535	PyErr_SetString(PyExc_TypeError,
				4536	"%s argument has non-string str()");
				4537	goto onError;
				4538	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4539	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4540	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4541	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4542	"strict");
				4543	Py_DECREF(temp);
				4544	temp = unicode;
				4545	if (temp == NULL)
				4546	goto onError;
				4547	}
				4548	buf = PyUnicode_AS_UNICODE(temp);
				4549	len = PyUnicode_GET_SIZE(temp);
				4550	if (prec >= 0 && len > prec)
				4551	len = prec;
				4552	break;
				4553
				4554	case 'i':
				4555	case 'd':
				4556	case 'u':
				4557	case 'o':
				4558	case 'x':
				4559	case 'X':
				4560	if (c == 'i')
				4561	c = 'd';
				4562	buf = tmpbuf;
				4563	len = formatint(buf, flags, prec, c, v);
				4564	if (len < 0)
				4565	goto onError;
				4566	sign = (c == 'd');
				4567	if (flags & F_ZERO) {
				4568	fill = '0';
				4569	if ((flags&F_ALT) &&
				4570	(c == 'x' \|\| c == 'X') &&
				4571	buf[0] == '0' && buf[1] == c) {
				4572	res++ = buf++;
				4573	res++ = buf++;
				4574	rescnt -= 2;
				4575	len -= 2;
				4576	width -= 2;
				4577	if (width < 0)
				4578	width = 0;
				4579	}
				4580	}
				4581	break;
				4582
				4583	case 'e':
				4584	case 'E':
				4585	case 'f':
				4586	case 'g':
				4587	case 'G':
				4588	buf = tmpbuf;
				4589	len = formatfloat(buf, flags, prec, c, v);
				4590	if (len < 0)
				4591	goto onError;
				4592	sign = 1;
				4593	if (flags&F_ZERO)
				4594	fill = '0';
				4595	break;
				4596
				4597	case 'c':
				4598	buf = tmpbuf;
				4599	len = formatchar(buf, v);
				4600	if (len < 0)
				4601	goto onError;
				4602	break;
				4603
				4604	default:
				4605	PyErr_Format(PyExc_ValueError,
				4606	"unsupported format character '%c' (0x%x)",
				4607	c, c);
				4608	goto onError;
				4609	}
				4610	if (sign) {
				4611	if (buf == '-' \|\| buf == '+') {
				4612	sign = *buf++;
				4613	len--;
				4614	}
				4615	else if (flags & F_SIGN)
				4616	sign = '+';
				4617	else if (flags & F_BLANK)
				4618	sign = ' ';
				4619	else
				4620	sign = 0;
				4621	}
				4622	if (width < len)
				4623	width = len;
				4624	if (rescnt < width + (sign != 0)) {
				4625	reslen -= rescnt;
				4626	rescnt = width + fmtcnt + 100;
				4627	reslen += rescnt;
				4628	if (_PyUnicode_Resize(result, reslen) < 0)
				4629	return NULL;
				4630	res = PyUnicode_AS_UNICODE(result)
				4631	+ reslen - rescnt;
				4632	}
				4633	if (sign) {
				4634	if (fill != ' ')
				4635	*res++ = sign;
				4636	rescnt--;
				4637	if (width > len)
				4638	width--;
				4639	}
				4640	if (width > len && !(flags & F_LJUST)) {
				4641	do {
				4642	--rescnt;
				4643	*res++ = fill;
				4644	} while (--width > len);
				4645	}
				4646	if (sign && fill == ' ')
				4647	*res++ = sign;
				4648	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4649	res += len;
				4650	rescnt -= len;
				4651	while (--width >= len) {
				4652	--rescnt;
				4653	*res++ = ' ';
				4654	}
				4655	if (dict && (argidx < arglen) && c != '%') {
				4656	PyErr_SetString(PyExc_TypeError,
				4657	"not all arguments converted");
				4658	goto onError;
				4659	}
				4660	Py_XDECREF(temp);
				4661	} /* '%' */
				4662	} /* until end */
				4663	if (argidx < arglen && !dict) {
				4664	PyErr_SetString(PyExc_TypeError,
				4665	"not all arguments converted");
				4666	goto onError;
				4667	}
				4668
				4669	if (args_owned) {
				4670	Py_DECREF(args);
				4671	}
				4672	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4673	if (_PyUnicode_Resize(result, reslen - rescnt))
				4674	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4675	return (PyObject *)result;
				4676
				4677	onError:
				4678	Py_XDECREF(result);
				4679	Py_DECREF(uformat);
				4680	if (args_owned) {
				4681	Py_DECREF(args);
				4682	}
				4683	return NULL;
				4684	}
				4685
				4686	static PyBufferProcs unicode_as_buffer = {
				4687	(getreadbufferproc) unicode_buffer_getreadbuf,
				4688	(getwritebufferproc) unicode_buffer_getwritebuf,
				4689	(getsegcountproc) unicode_buffer_getsegcount,
				4690	(getcharbufferproc) unicode_buffer_getcharbuf,
				4691	};
				4692
				4693	PyTypeObject PyUnicode_Type = {
				4694	PyObject_HEAD_INIT(&PyType_Type)
				4695	0, /* ob_size */
				4696	"unicode", /* tp_name */
				4697	sizeof(PyUnicodeObject), /* tp_size */
				4698	0, /* tp_itemsize */
				4699	/* Slots */
				4700	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4701	0, /* tp_print */
				4702	(getattrfunc)unicode_getattr, /* tp_getattr */
				4703	0, /* tp_setattr */
				4704	(cmpfunc) unicode_compare, /* tp_compare */
				4705	(reprfunc) unicode_repr, /* tp_repr */
				4706	0, /* tp_as_number */
				4707	&unicode_as_sequence, /* tp_as_sequence */
				4708	0, /* tp_as_mapping */
				4709	(hashfunc) unicode_hash, /* tp_hash*/
				4710	0, /* tp_call*/
				4711	(reprfunc) unicode_str, /* tp_str */
				4712	(getattrofunc) NULL, /* tp_getattro */
				4713	(setattrofunc) NULL, /* tp_setattro */
				4714	&unicode_as_buffer, /* tp_as_buffer */
				4715	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4716	};
				4717
				4718	/* Initialize the Unicode implementation */
				4719
				4720	void _PyUnicode_Init()
				4721	{
				4722	/* Doublecheck the configuration... */
				4723	if (sizeof(Py_UNICODE) != 2)
				4724	Py_FatalError("Unicode configuration error: "
				4725	"sizeof(Py_UNICODE) != 2 bytes");
				4726
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4727	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4728	unicode_freelist = NULL;
				4729	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4730	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	4731	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4732	}
				4733
				4734	/* Finalize the Unicode implementation */
				4735
				4736	void
				4737	_PyUnicode_Fini()
				4738	{
				4739	PyUnicodeObject *u = unicode_freelist;
				4740
				4741	while (u != NULL) {
				4742	PyUnicodeObject *v = u;
				4743	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4744	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4745	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4746	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4747	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4748	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4749	unicode_freelist = NULL;
				4750	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4751	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4752	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4753	}