Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: dcef11bc8c459c55565c3370aa378c051652610c [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	69	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	#if defined(HAVE_LIMITS_H)
				72	#include <limits.h>
				73	#else
				74	#define INT_MAX 2147483647
				75	#endif
				76
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	77	#ifdef MS_WIN32
				78	#include <windows.h>
				79	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	80
/* Limit for the Unicode object free list: the maximum number of dead
   Unicode objects that are kept around for reuse. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.
   The limit is compared against the object's length, i.e. it is a
   count of Py_UNICODE characters, not of bytes.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				111
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	112	/* --- Globals ------------------------------------------------------------
				113
				114	The globals are initialized by the _PyUnicode_Init() API and should
				115	not be used before calling that API.
				116
				117	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118
				119	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	120	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	121
				122	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	123	static PyUnicodeObject *unicode_freelist;
				124	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	125
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	126	/* Default encoding to use and assume when NULL is passed as encoding
				127	parameter; it is initialized by _PyUnicode_Init().
				128
				129	Always use the PyUnicode_SetDefaultEncoding() and
				130	PyUnicode_GetDefaultEncoding() APIs to access this global.
				131
				132	*/
				133
				134	static char unicode_default_encoding[100];
				135
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	136	/* --- Unicode Object ----------------------------------------------------- */
				137
				138	static
				139	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				140	int length)
				141	{
				142	void *oldstr;
				143
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	144	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	145	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	146	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	147
				148	/* Resizing unicode_empty is not allowed. */
				149	if (unicode == unicode_empty) {
				150	PyErr_SetString(PyExc_SystemError,
				151	"can't resize empty unicode object");
				152	return -1;
				153	}
				154
				155	/* We allocate one more byte to make sure the string is
				156	Ux0000 terminated -- XXX is this needed ? */
				157	oldstr = unicode->str;
				158	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				159	if (!unicode->str) {
				160	unicode->str = oldstr;
				161	PyErr_NoMemory();
				162	return -1;
				163	}
				164	unicode->str[length] = 0;
				165	unicode->length = length;
				166
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	167	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	168	/* Reset the object caches */
				169	if (unicode->utf8str) {
				170	Py_DECREF(unicode->utf8str);
				171	unicode->utf8str = NULL;
				172	}
				173	unicode->hash = -1;
				174
				175	return 0;
				176	}
				177
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	178	int PyUnicode_Resize(PyObject **unicode,
				179	int length)
				180	{
				181	PyUnicodeObject *v;
				182
				183	if (unicode == NULL) {
				184	PyErr_BadInternalCall();
				185	return -1;
				186	}
				187	v = (PyUnicodeObject )unicode;
				188	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				189	PyErr_BadInternalCall();
				190	return -1;
				191	}
				192	return _PyUnicode_Resize(v, length);
				193	}
				194
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	195	/* We allocate one more byte to make sure the string is
				196	Ux0000 terminated -- XXX is this needed ?
				197
				198	XXX This allocator could further be enhanced by assuring that the
				199	free list never reduces its size below 1.
				200
				201	*/
				202
				203	static
				204	PyUnicodeObject *_PyUnicode_New(int length)
				205	{
				206	register PyUnicodeObject *unicode;
				207
				208	/* Optimization for empty strings */
				209	if (length == 0 && unicode_empty != NULL) {
				210	Py_INCREF(unicode_empty);
				211	return unicode_empty;
				212	}
				213
				214	/* Unicode freelist & memory allocation */
				215	if (unicode_freelist) {
				216	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	217	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	220	/* Keep-Alive optimization: we only upsize the buffer,
				221	never downsize it. */
				222	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	223	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	224	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	225	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	}
				227	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	228	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	230	}
				231	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	232	}
				233	else {
				234	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				235	if (unicode == NULL)
				236	return NULL;
				237	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				238	}
				239
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	240	if (!unicode->str) {
				241	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	242	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	243	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	unicode->str[length] = 0;
				245	unicode->length = length;
				246	unicode->hash = -1;
				247	unicode->utf8str = NULL;
				248	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	249
				250	onError:
				251	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	252	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	253	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	254	}
				255
				256	static
				257	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				258	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	259	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	260	/* Keep-Alive optimization */
				261	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	262	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	263	unicode->str = NULL;
				264	unicode->length = 0;
				265	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	266	if (unicode->utf8str) {
				267	Py_DECREF(unicode->utf8str);
				268	unicode->utf8str = NULL;
				269	}
				270	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	271	(PyUnicodeObject *)unicode = unicode_freelist;
				272	unicode_freelist = unicode;
				273	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	274	}
				275	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	276	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	277	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	278	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	279	}
				280	}
				281
				282	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				283	int size)
				284	{
				285	PyUnicodeObject *unicode;
				286
				287	unicode = _PyUnicode_New(size);
				288	if (!unicode)
				289	return NULL;
				290
				291	/* Copy the Unicode data into the new object */
				292	if (u != NULL)
				293	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				294
				295	return (PyObject *)unicode;
				296	}
				297
				298	#ifdef HAVE_WCHAR_H
				299
				300	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				301	int size)
				302	{
				303	PyUnicodeObject *unicode;
				304
				305	if (w == NULL) {
				306	PyErr_BadInternalCall();
				307	return NULL;
				308	}
				309
				310	unicode = _PyUnicode_New(size);
				311	if (!unicode)
				312	return NULL;
				313
				314	/* Copy the wchar_t data into the new object */
				315	#ifdef HAVE_USABLE_WCHAR_T
				316	memcpy(unicode->str, w, size * sizeof(wchar_t));
				317	#else
				318	{
				319	register Py_UNICODE *u;
				320	register int i;
				321	u = PyUnicode_AS_UNICODE(unicode);
				322	for (i = size; i >= 0; i--)
				323	u++ = w++;
				324	}
				325	#endif
				326
				327	return (PyObject *)unicode;
				328	}
				329
				330	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				331	register wchar_t *w,
				332	int size)
				333	{
				334	if (unicode == NULL) {
				335	PyErr_BadInternalCall();
				336	return -1;
				337	}
				338	if (size > PyUnicode_GET_SIZE(unicode))
				339	size = PyUnicode_GET_SIZE(unicode);
				340	#ifdef HAVE_USABLE_WCHAR_T
				341	memcpy(w, unicode->str, size * sizeof(wchar_t));
				342	#else
				343	{
				344	register Py_UNICODE *u;
				345	register int i;
				346	u = PyUnicode_AS_UNICODE(unicode);
				347	for (i = size; i >= 0; i--)
				348	w++ = u++;
				349	}
				350	#endif
				351
				352	return size;
				353	}
				354
				355	#endif
				356
				357	PyObject PyUnicode_FromObject(register PyObject obj)
				358	{
				359	const char *s;
				360	int len;
				361
				362	if (obj == NULL) {
				363	PyErr_BadInternalCall();
				364	return NULL;
				365	}
				366	else if (PyUnicode_Check(obj)) {
				367	Py_INCREF(obj);
				368	return obj;
				369	}
				370	else if (PyString_Check(obj)) {
				371	s = PyString_AS_STRING(obj);
				372	len = PyString_GET_SIZE(obj);
				373	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	374	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				375	/* Overwrite the error message with something more useful in
				376	case of a TypeError. */
				377	if (PyErr_ExceptionMatches(PyExc_TypeError))
				378	PyErr_SetString(PyExc_TypeError,
				379	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	380	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	381	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	382	if (len == 0) {
				383	Py_INCREF(unicode_empty);
				384	return (PyObject *)unicode_empty;
				385	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	386	return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	387	}
				388
				389	PyObject PyUnicode_Decode(const char s,
				390	int size,
				391	const char *encoding,
				392	const char *errors)
				393	{
				394	PyObject buffer = NULL, unicode;
				395
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	396	if (encoding == NULL)
				397	encoding = PyUnicode_GetDefaultEncoding();
				398
				399	/* Shortcuts for common default encodings */
				400	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	401	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	402	else if (strcmp(encoding, "latin-1") == 0)
				403	return PyUnicode_DecodeLatin1(s, size, errors);
				404	else if (strcmp(encoding, "ascii") == 0)
				405	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	406
				407	/* Decode via the codec registry */
				408	buffer = PyBuffer_FromMemory((void *)s, size);
				409	if (buffer == NULL)
				410	goto onError;
				411	unicode = PyCodec_Decode(buffer, encoding, errors);
				412	if (unicode == NULL)
				413	goto onError;
				414	if (!PyUnicode_Check(unicode)) {
				415	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	416	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	417	unicode->ob_type->tp_name);
				418	Py_DECREF(unicode);
				419	goto onError;
				420	}
				421	Py_DECREF(buffer);
				422	return unicode;
				423
				424	onError:
				425	Py_XDECREF(buffer);
				426	return NULL;
				427	}
				428
				429	PyObject PyUnicode_Encode(const Py_UNICODE s,
				430	int size,
				431	const char *encoding,
				432	const char *errors)
				433	{
				434	PyObject v, unicode;
				435
				436	unicode = PyUnicode_FromUnicode(s, size);
				437	if (unicode == NULL)
				438	return NULL;
				439	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				440	Py_DECREF(unicode);
				441	return v;
				442	}
				443
				444	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				445	const char *encoding,
				446	const char *errors)
				447	{
				448	PyObject *v;
				449
				450	if (!PyUnicode_Check(unicode)) {
				451	PyErr_BadArgument();
				452	goto onError;
				453	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	454
				455	if (encoding == NULL)
				456	encoding = PyUnicode_GetDefaultEncoding();
				457
				458	/* Shortcuts for common default encodings */
				459	if (errors == NULL) {
				460	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	461	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	462	else if (strcmp(encoding, "latin-1") == 0)
				463	return PyUnicode_AsLatin1String(unicode);
				464	else if (strcmp(encoding, "ascii") == 0)
				465	return PyUnicode_AsASCIIString(unicode);
				466	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	467
				468	/* Encode via the codec registry */
				469	v = PyCodec_Encode(unicode, encoding, errors);
				470	if (v == NULL)
				471	goto onError;
				472	/* XXX Should we really enforce this ? */
				473	if (!PyString_Check(v)) {
				474	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	475	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	v->ob_type->tp_name);
				477	Py_DECREF(v);
				478	goto onError;
				479	}
				480	return v;
				481
				482	onError:
				483	return NULL;
				484	}
				485
				486	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				487	{
				488	if (!PyUnicode_Check(unicode)) {
				489	PyErr_BadArgument();
				490	goto onError;
				491	}
				492	return PyUnicode_AS_UNICODE(unicode);
				493
				494	onError:
				495	return NULL;
				496	}
				497
				498	int PyUnicode_GetSize(PyObject *unicode)
				499	{
				500	if (!PyUnicode_Check(unicode)) {
				501	PyErr_BadArgument();
				502	goto onError;
				503	}
				504	return PyUnicode_GET_SIZE(unicode);
				505
				506	onError:
				507	return -1;
				508	}
				509
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	510	const char *PyUnicode_GetDefaultEncoding()
				511	{
				512	return unicode_default_encoding;
				513	}
				514
				515	int PyUnicode_SetDefaultEncoding(const char *encoding)
				516	{
				517	PyObject *v;
				518
				519	/* Make sure the encoding is valid. As side effect, this also
				520	loads the encoding into the codec registry cache. */
				521	v = _PyCodec_Lookup(encoding);
				522	if (v == NULL)
				523	goto onError;
				524	Py_DECREF(v);
				525	strncpy(unicode_default_encoding,
				526	encoding,
				527	sizeof(unicode_default_encoding));
				528	return 0;
				529
				530	onError:
				531	return -1;
				532	}
				533
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	534	/* --- UTF-8 Codec -------------------------------------------------------- */
				535
				536	static
				537	char utf8_code_length[256] = {
				538	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				539	illegal prefix. see RFC 2279 for details */
				540	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				541	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				542	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				543	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				544	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				545	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				546	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				547	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				548	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				549	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				550	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				551	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				552	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				553	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				554	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				555	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				556	};
				557
				558	static
				559	int utf8_decoding_error(const char **source,
				560	Py_UNICODE **dest,
				561	const char *errors,
				562	const char *details)
				563	{
				564	if ((errors == NULL) \|\|
				565	(strcmp(errors,"strict") == 0)) {
				566	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	567	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	568	details);
				569	return -1;
				570	}
				571	else if (strcmp(errors,"ignore") == 0) {
				572	(*source)++;
				573	return 0;
				574	}
				575	else if (strcmp(errors,"replace") == 0) {
				576	(*source)++;
				577	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				578	(*dest)++;
				579	return 0;
				580	}
				581	else {
				582	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	583	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	584	errors);
				585	return -1;
				586	}
				587	}
				588
				589	#define UTF8_ERROR(details) do { \
				590	if (utf8_decoding_error(&s, &p, errors, details)) \
				591	goto onError; \
				592	continue; \
				593	} while (0)
				594
				595	PyObject PyUnicode_DecodeUTF8(const char s,
				596	int size,
				597	const char *errors)
				598	{
				599	int n;
				600	const char *e;
				601	PyUnicodeObject *unicode;
				602	Py_UNICODE *p;
				603
				604	/* Note: size will always be longer than the resulting Unicode
				605	character count */
				606	unicode = _PyUnicode_New(size);
				607	if (!unicode)
				608	return NULL;
				609	if (size == 0)
				610	return (PyObject *)unicode;
				611
				612	/* Unpack UTF-8 encoded data */
				613	p = unicode->str;
				614	e = s + size;
				615
				616	while (s < e) {
				617	register Py_UNICODE ch = (unsigned char)*s;
				618
				619	if (ch < 0x80) {
				620	*p++ = ch;
				621	s++;
				622	continue;
				623	}
				624
				625	n = utf8_code_length[ch];
				626
				627	if (s + n > e)
				628	UTF8_ERROR("unexpected end of data");
				629
				630	switch (n) {
				631
				632	case 0:
				633	UTF8_ERROR("unexpected code byte");
				634	break;
				635
				636	case 1:
				637	UTF8_ERROR("internal error");
				638	break;
				639
				640	case 2:
				641	if ((s[1] & 0xc0) != 0x80)
				642	UTF8_ERROR("invalid data");
				643	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				644	if (ch < 0x80)
				645	UTF8_ERROR("illegal encoding");
				646	else
				647	*p++ = ch;
				648	break;
				649
				650	case 3:
				651	if ((s[1] & 0xc0) != 0x80 \|\|
				652	(s[2] & 0xc0) != 0x80)
				653	UTF8_ERROR("invalid data");
				654	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				655	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				656	UTF8_ERROR("illegal encoding");
				657	else
				658	*p++ = ch;
				659	break;
				660
				661	default:
				662	/* Other sizes are only needed for UCS-4 */
				663	UTF8_ERROR("unsupported Unicode code range");
				664	}
				665	s += n;
				666	}
				667
				668	/* Adjust length */
				669	if (_PyUnicode_Resize(unicode, p - unicode->str))
				670	goto onError;
				671
				672	return (PyObject *)unicode;
				673
				674	onError:
				675	Py_DECREF(unicode);
				676	return NULL;
				677	}
				678
				679	#undef UTF8_ERROR
				680
				681	static
				682	int utf8_encoding_error(const Py_UNICODE **source,
				683	char **dest,
				684	const char *errors,
				685	const char *details)
				686	{
				687	if ((errors == NULL) \|\|
				688	(strcmp(errors,"strict") == 0)) {
				689	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	690	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	691	details);
				692	return -1;
				693	}
				694	else if (strcmp(errors,"ignore") == 0) {
				695	return 0;
				696	}
				697	else if (strcmp(errors,"replace") == 0) {
				698	**dest = '?';
				699	(*dest)++;
				700	return 0;
				701	}
				702	else {
				703	PyErr_Format(PyExc_ValueError,
				704	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	705	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	706	errors);
				707	return -1;
				708	}
				709	}
				710
				711	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				712	int size,
				713	const char *errors)
				714	{
				715	PyObject *v;
				716	char *p;
				717	char *q;
				718
				719	v = PyString_FromStringAndSize(NULL, 3 * size);
				720	if (v == NULL)
				721	return NULL;
				722	if (size == 0)
				723	goto done;
				724
				725	p = q = PyString_AS_STRING(v);
				726	while (size-- > 0) {
				727	Py_UNICODE ch = *s++;
				728	if (ch < 0x80)
				729	*p++ = (char) ch;
				730	else if (ch < 0x0800) {
				731	*p++ = 0xc0 \| (ch >> 6);
				732	*p++ = 0x80 \| (ch & 0x3f);
				733	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				734	/* These byte ranges are reserved for UTF-16 surrogate
				735	bytes which the Python implementation currently does
				736	not support. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	737	if (utf8_encoding_error(&s, &p, errors,
				738	"unsupported code range"))
				739	goto onError;
				740	} else {
				741	*p++ = 0xe0 \| (ch >> 12);
				742	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				743	*p++ = 0x80 \| (ch & 0x3f);
				744	}
				745	}
				746	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	747	if (_PyString_Resize(&v, p - q))
				748	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	749
				750	done:
				751	return v;
				752
				753	onError:
				754	Py_DECREF(v);
				755	return NULL;
				756	}
				757
				758	/* Return a Python string holding the UTF-8 encoded value of the
				759	Unicode object.
				760
				761	The resulting string is cached in the Unicode object for subsequent
				762	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	763	the character buffer interface and will live (at least) as long as
				764	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	765
				766	The refcount of the string is not incremented.
				767
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	768	* Exported for internal use by the interpreter only !!! *
				769
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	770	*/
				771
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	772	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	773	const char *errors)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	774	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	775	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	776
				777	if (v)
				778	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	779	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				780	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	781	errors);
				782	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	783	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	784	return v;
				785	}
				786
				787	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				788	{
				789	PyObject *str;
				790
				791	if (!PyUnicode_Check(unicode)) {
				792	PyErr_BadArgument();
				793	return NULL;
				794	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	795	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	796	if (str == NULL)
				797	return NULL;
				798	Py_INCREF(str);
				799	return str;
				800	}
				801
				802	/* --- UTF-16 Codec ------------------------------------------------------- */
				803
				804	static
				805	int utf16_decoding_error(const Py_UNICODE **source,
				806	Py_UNICODE **dest,
				807	const char *errors,
				808	const char *details)
				809	{
				810	if ((errors == NULL) \|\|
				811	(strcmp(errors,"strict") == 0)) {
				812	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	813	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	814	details);
				815	return -1;
				816	}
				817	else if (strcmp(errors,"ignore") == 0) {
				818	return 0;
				819	}
				820	else if (strcmp(errors,"replace") == 0) {
				821	if (dest) {
				822	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				823	(*dest)++;
				824	}
				825	return 0;
				826	}
				827	else {
				828	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	829	"UTF-16 decoding error; "
				830	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	831	errors);
				832	return -1;
				833	}
				834	}
				835
				836	#define UTF16_ERROR(details) do { \
				837	if (utf16_decoding_error(&q, &p, errors, details)) \
				838	goto onError; \
				839	continue; \
				840	} while(0)
				841
				842	PyObject PyUnicode_DecodeUTF16(const char s,
				843	int size,
				844	const char *errors,
				845	int *byteorder)
				846	{
				847	PyUnicodeObject *unicode;
				848	Py_UNICODE *p;
				849	const Py_UNICODE q, e;
				850	int bo = 0;
				851
				852	/* size should be an even number */
				853	if (size % sizeof(Py_UNICODE) != 0) {
				854	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				855	return NULL;
				856	/* The remaining input chars are ignored if we fall through
				857	here... */
				858	}
				859
				860	/* Note: size will always be longer than the resulting Unicode
				861	character count */
				862	unicode = _PyUnicode_New(size);
				863	if (!unicode)
				864	return NULL;
				865	if (size == 0)
				866	return (PyObject *)unicode;
				867
				868	/* Unpack UTF-16 encoded data */
				869	p = unicode->str;
				870	q = (Py_UNICODE *)s;
				871	e = q + (size / sizeof(Py_UNICODE));
				872
				873	if (byteorder)
				874	bo = *byteorder;
				875
				876	while (q < e) {
				877	register Py_UNICODE ch = *q++;
				878
				879	/* Check for BOM marks (U+FEFF) in the input and adjust
				880	current byte order setting accordingly. Swap input
				881	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				882	!) */
				883	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				884	if (ch == 0xFEFF) {
				885	bo = -1;
				886	continue;
				887	} else if (ch == 0xFFFE) {
				888	bo = 1;
				889	continue;
				890	}
				891	if (bo == 1)
				892	ch = (ch >> 8) \| (ch << 8);
				893	#else
				894	if (ch == 0xFEFF) {
				895	bo = 1;
				896	continue;
				897	} else if (ch == 0xFFFE) {
				898	bo = -1;
				899	continue;
				900	}
				901	if (bo == -1)
				902	ch = (ch >> 8) \| (ch << 8);
				903	#endif
				904	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				905	*p++ = ch;
				906	continue;
				907	}
				908
				909	/* UTF-16 code pair: */
				910	if (q >= e)
				911	UTF16_ERROR("unexpected end of data");
				912	if (0xDC00 <= q && q <= 0xDFFF) {
				913	q++;
				914	if (0xD800 <= q && q <= 0xDBFF)
				915	/* This is valid data (a UTF-16 surrogate pair), but
				916	we are not able to store this information since our
				917	Py_UNICODE type only has 16 bits... this might
				918	change someday, even though it's unlikely. */
				919	UTF16_ERROR("code pairs are not supported");
				920	else
				921	continue;
				922	}
				923	UTF16_ERROR("illegal encoding");
				924	}
				925
				926	if (byteorder)
				927	*byteorder = bo;
				928
				929	/* Adjust length */
				930	if (_PyUnicode_Resize(unicode, p - unicode->str))
				931	goto onError;
				932
				933	return (PyObject *)unicode;
				934
				935	onError:
				936	Py_DECREF(unicode);
				937	return NULL;
				938	}
				939
				940	#undef UTF16_ERROR
				941
				942	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				943	int size,
				944	const char *errors,
				945	int byteorder)
				946	{
				947	PyObject *v;
				948	Py_UNICODE *p;
				949	char *q;
				950
				951	/* We don't create UTF-16 pairs... */
				952	v = PyString_FromStringAndSize(NULL,
				953	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				954	if (v == NULL)
				955	return NULL;
				956	if (size == 0)
				957	goto done;
				958
				959	q = PyString_AS_STRING(v);
				960	p = (Py_UNICODE *)q;
				961
				962	if (byteorder == 0)
				963	*p++ = 0xFEFF;
				964	if (byteorder == 0 \|\|
				965	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				966	byteorder == -1
				967	#else
				968	byteorder == 1
				969	#endif
				970	)
				971	memcpy(p, s, size * sizeof(Py_UNICODE));
				972	else
				973	while (size-- > 0) {
				974	Py_UNICODE ch = *s++;
				975	*p++ = (ch >> 8) \| (ch << 8);
				976	}
				977	done:
				978	return v;
				979	}
				980
				981	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				982	{
				983	if (!PyUnicode_Check(unicode)) {
				984	PyErr_BadArgument();
				985	return NULL;
				986	}
				987	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				988	PyUnicode_GET_SIZE(unicode),
				989	NULL,
				990	0);
				991	}
				992
				993	/* --- Unicode Escape Codec ----------------------------------------------- */
				994
				995	static
				996	int unicodeescape_decoding_error(const char **source,
				997	unsigned int *x,
				998	const char *errors,
				999	const char *details)
				1000	{
				1001	if ((errors == NULL) \|\|
				1002	(strcmp(errors,"strict") == 0)) {
				1003	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1004	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1005	details);
				1006	return -1;
				1007	}
				1008	else if (strcmp(errors,"ignore") == 0) {
				1009	return 0;
				1010	}
				1011	else if (strcmp(errors,"replace") == 0) {
				1012	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				1013	return 0;
				1014	}
				1015	else {
				1016	PyErr_Format(PyExc_ValueError,
				1017	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1018	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1019	errors);
				1020	return -1;
				1021	}
				1022	}
				1023
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1024	static _Py_UCNHashAPI *pucnHash = NULL;
				1025
				1026	static
				1027	int mystrnicmp(const char s1, const char s2, size_t count)
				1028	{
				1029	char c1, c2;
				1030
				1031	if (count)
				1032	{
				1033	do
				1034	{
				1035	c1 = tolower(*(s1++));
				1036	c2 = tolower(*(s2++));
				1037	}
				1038	while(--count && c1 == c2);
				1039
				1040	return c1 - c2;
				1041	}
				1042
				1043	return 0;
				1044	}
				1045
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1046	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1047	int size,
				1048	const char *errors)
				1049	{
				1050	PyUnicodeObject *v;
				1051	Py_UNICODE p = NULL, buf = NULL;
				1052	const char *end;
				1053
				1054	/* Escaped strings will always be longer than the resulting
				1055	Unicode string, so we start with size here and then reduce the
				1056	length after conversion to the true value. */
				1057	v = _PyUnicode_New(size);
				1058	if (v == NULL)
				1059	goto onError;
				1060	if (size == 0)
				1061	return (PyObject *)v;
				1062	p = buf = PyUnicode_AS_UNICODE(v);
				1063	end = s + size;
				1064	while (s < end) {
				1065	unsigned char c;
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1066	unsigned long x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1067	int i;
				1068
				1069	/* Non-escape characters are interpreted as Unicode ordinals */
				1070	if (*s != '\\') {
				1071	p++ = (unsigned char)s++;
				1072	continue;
				1073	}
				1074
				1075	/* \ - Escapes */
				1076	s++;
				1077	switch (*s++) {
				1078
				1079	/* \x escapes */
				1080	case '\n': break;
				1081	case '\\': *p++ = '\\'; break;
				1082	case '\'': *p++ = '\''; break;
				1083	case '\"': *p++ = '\"'; break;
				1084	case 'b': *p++ = '\b'; break;
				1085	case 'f': p++ = '\014'; break; / FF */
				1086	case 't': *p++ = '\t'; break;
				1087	case 'n': *p++ = '\n'; break;
				1088	case 'r': *p++ = '\r'; break;
				1089	case 'v': p++ = '\013'; break; / VT */
				1090	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1091
				1092	/* \OOO (octal) escapes */
				1093	case '0': case '1': case '2': case '3':
				1094	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1095	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1096	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1097	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1098	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1099	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1100	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1101	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1102	break;
				1103
				1104	/* \xXXXX escape with 0-4 hex digits */
				1105	case 'x':
				1106	x = 0;
				1107	c = (unsigned char)*s;
				1108	if (isxdigit(c)) {
				1109	do {
				1110	x = (x<<4) & ~0xF;
				1111	if ('0' <= c && c <= '9')
				1112	x += c - '0';
				1113	else if ('a' <= c && c <= 'f')
				1114	x += 10 + c - 'a';
				1115	else
				1116	x += 10 + c - 'A';
				1117	c = (unsigned char)*++s;
				1118	} while (isxdigit(c));
				1119	*p++ = x;
				1120	} else {
				1121	*p++ = '\\';
				1122	*p++ = (unsigned char)s[-1];
				1123	}
				1124	break;
				1125
				1126	/* \uXXXX with 4 hex digits */
				1127	case 'u':
				1128	for (x = 0, i = 0; i < 4; i++) {
				1129	c = (unsigned char)s[i];
				1130	if (!isxdigit(c)) {
				1131	if (unicodeescape_decoding_error(&s, &x, errors,
				1132	"truncated \\uXXXX"))
				1133	goto onError;
				1134	i++;
				1135	break;
				1136	}
				1137	x = (x<<4) & ~0xF;
				1138	if (c >= '0' && c <= '9')
				1139	x += c - '0';
				1140	else if (c >= 'a' && c <= 'f')
				1141	x += 10 + c - 'a';
				1142	else
				1143	x += 10 + c - 'A';
				1144	}
				1145	s += i;
				1146	*p++ = x;
				1147	break;
				1148
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1149	case 'N':
				1150	/* Ok, we need to deal with Unicode Character Names now,
				1151	* make sure we've imported the hash table data...
				1152	*/
				1153	if (pucnHash == NULL)
				1154	{
				1155	PyObject mod = 0, v = 0;
				1156
				1157	mod = PyImport_ImportModule("ucnhash");
				1158	if (mod == NULL)
				1159	goto onError;
				1160	v = PyObject_GetAttrString(mod,"ucnhashAPI");
				1161	Py_DECREF(mod);
				1162	if (v == NULL)
				1163	{
				1164	goto onError;
				1165	}
				1166	pucnHash = PyCObject_AsVoidPtr(v);
				1167	Py_DECREF(v);
				1168	if (pucnHash == NULL)
				1169	{
				1170	goto onError;
				1171	}
				1172	}
				1173
				1174	if (*s == '{')
				1175	{
				1176	const char *start = s + 1;
				1177	const char *endBrace = start;
				1178	unsigned int uiValue;
				1179	unsigned long j;
				1180
				1181	/* look for either the closing brace, or we
				1182	* exceed the maximum length of the unicode character names
				1183	*/
				1184	while (*endBrace != '}' &&
				1185	(unsigned int)(endBrace - start) <=
				1186	pucnHash->cchMax &&
				1187	endBrace < end)
				1188	{
				1189	endBrace++;
				1190	}
				1191	if (endBrace != end && *endBrace == '}')
				1192	{
				1193	j = pucnHash->hash(start, endBrace - start);
				1194	if (j > pucnHash->cKeys \|\|
				1195	mystrnicmp(
				1196	start,
				1197	((_Py_UnicodeCharacterName *)
				1198	(pucnHash->getValue(j)))->pszUCN,
				1199	(int)(endBrace - start)) != 0)
				1200	{
				1201	if (unicodeescape_decoding_error(
				1202	&s, &x, errors,
				1203	"Invalid Unicode Character Name"))
				1204	{
				1205	goto onError;
				1206	}
				1207	goto ucnFallthrough;
				1208	}
				1209	uiValue = ((_Py_UnicodeCharacterName *)
				1210	(pucnHash->getValue(j)))->uiValue;
				1211	if (uiValue < 1<<16)
				1212	{
				1213	/* In UCS-2 range, easy solution.. */
				1214	*p++ = uiValue;
				1215	}
				1216	else
				1217	{
				1218	/* Oops, its in UCS-4 space, */
				1219	/* compute and append the two surrogates: */
				1220	/* translate from 10000..10FFFF to 0..FFFFF */
				1221	uiValue -= 0x10000;
				1222
				1223	/* high surrogate = top 10 bits added to D800 */
				1224	*p++ = 0xD800 + (uiValue >> 10);
				1225
				1226	/* low surrogate = bottom 10 bits added to DC00 */
				1227	*p++ = 0xDC00 + (uiValue & ~0xFC00);
				1228	}
				1229	s = endBrace + 1;
				1230	}
				1231	else
				1232	{
				1233	if (unicodeescape_decoding_error(
				1234	&s, &x, errors,
				1235	"Unicode name missing closing brace"))
				1236	goto onError;
				1237	goto ucnFallthrough;
				1238	}
				1239	break;
				1240	}
				1241	if (unicodeescape_decoding_error(
				1242	&s, &x, errors,
				1243	"Missing opening brace for Unicode Character Name escape"))
				1244	goto onError;
				1245	ucnFallthrough:
				1246	/* fall through on purpose */
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1247	default:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1248	*p++ = '\\';
				1249	*p++ = (unsigned char)s[-1];
				1250	break;
				1251	}
				1252	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1253	if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1254	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1255	return (PyObject *)v;
				1256
				1257	onError:
				1258	Py_XDECREF(v);
				1259	return NULL;
				1260	}
				1261
				1262	/* Return a Unicode-Escape string version of the Unicode object.
				1263
				1264	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1265	appropriate.
				1266
				1267	*/
				1268
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1269	static const Py_UNICODE findchar(const Py_UNICODE s,
				1270	int size,
				1271	Py_UNICODE ch);
				1272
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1273	static
				1274	PyObject unicodeescape_string(const Py_UNICODE s,
				1275	int size,
				1276	int quotes)
				1277	{
				1278	PyObject *repr;
				1279	char *p;
				1280	char *q;
				1281
				1282	static const char *hexdigit = "0123456789ABCDEF";
				1283
				1284	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1285	if (repr == NULL)
				1286	return NULL;
				1287
				1288	p = q = PyString_AS_STRING(repr);
				1289
				1290	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1291	*p++ = 'u';
				1292	*p++ = (findchar(s, size, '\'') &&
				1293	!findchar(s, size, '"')) ? '"' : '\'';
				1294	}
				1295	while (size-- > 0) {
				1296	Py_UNICODE ch = *s++;
				1297	/* Escape quotes */
				1298	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1299	*p++ = '\\';
				1300	*p++ = (char) ch;
				1301	}
				1302	/* Map 16-bit characters to '\uxxxx' */
				1303	else if (ch >= 256) {
				1304	*p++ = '\\';
				1305	*p++ = 'u';
				1306	*p++ = hexdigit[(ch >> 12) & 0xf];
				1307	*p++ = hexdigit[(ch >> 8) & 0xf];
				1308	*p++ = hexdigit[(ch >> 4) & 0xf];
				1309	*p++ = hexdigit[ch & 15];
				1310	}
				1311	/* Map non-printable US ASCII to '\ooo' */
				1312	else if (ch < ' ' \|\| ch >= 128) {
				1313	*p++ = '\\';
				1314	*p++ = hexdigit[(ch >> 6) & 7];
				1315	*p++ = hexdigit[(ch >> 3) & 7];
				1316	*p++ = hexdigit[ch & 7];
				1317	}
				1318	/* Copy everything else as-is */
				1319	else
				1320	*p++ = (char) ch;
				1321	}
				1322	if (quotes)
				1323	*p++ = q[1];
				1324
				1325	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1326	if (_PyString_Resize(&repr, p - q))
				1327	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1328
				1329	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1330
				1331	onError:
				1332	Py_DECREF(repr);
				1333	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1334	}
				1335
				1336	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1337	int size)
				1338	{
				1339	return unicodeescape_string(s, size, 0);
				1340	}
				1341
				1342	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1343	{
				1344	if (!PyUnicode_Check(unicode)) {
				1345	PyErr_BadArgument();
				1346	return NULL;
				1347	}
				1348	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1349	PyUnicode_GET_SIZE(unicode));
				1350	}
				1351
				1352	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1353
				1354	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1355	int size,
				1356	const char *errors)
				1357	{
				1358	PyUnicodeObject *v;
				1359	Py_UNICODE p, buf;
				1360	const char *end;
				1361	const char *bs;
				1362
				1363	/* Escaped strings will always be longer than the resulting
				1364	Unicode string, so we start with size here and then reduce the
				1365	length after conversion to the true value. */
				1366	v = _PyUnicode_New(size);
				1367	if (v == NULL)
				1368	goto onError;
				1369	if (size == 0)
				1370	return (PyObject *)v;
				1371	p = buf = PyUnicode_AS_UNICODE(v);
				1372	end = s + size;
				1373	while (s < end) {
				1374	unsigned char c;
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1375	unsigned long x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1376	int i;
				1377
				1378	/* Non-escape characters are interpreted as Unicode ordinals */
				1379	if (*s != '\\') {
				1380	p++ = (unsigned char)s++;
				1381	continue;
				1382	}
				1383
				1384	/* \u-escapes are only interpreted iff the number of leading
				1385	backslashes if odd */
				1386	bs = s;
				1387	for (;s < end;) {
				1388	if (*s != '\\')
				1389	break;
				1390	p++ = (unsigned char)s++;
				1391	}
				1392	if (((s - bs) & 1) == 0 \|\|
				1393	s >= end \|\|
				1394	*s != 'u') {
				1395	continue;
				1396	}
				1397	p--;
				1398	s++;
				1399
				1400	/* \uXXXX with 4 hex digits */
				1401	for (x = 0, i = 0; i < 4; i++) {
				1402	c = (unsigned char)s[i];
				1403	if (!isxdigit(c)) {
				1404	if (unicodeescape_decoding_error(&s, &x, errors,
				1405	"truncated \\uXXXX"))
				1406	goto onError;
				1407	i++;
				1408	break;
				1409	}
				1410	x = (x<<4) & ~0xF;
				1411	if (c >= '0' && c <= '9')
				1412	x += c - '0';
				1413	else if (c >= 'a' && c <= 'f')
				1414	x += 10 + c - 'a';
				1415	else
				1416	x += 10 + c - 'A';
				1417	}
				1418	s += i;
				1419	*p++ = x;
				1420	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1421	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1422	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1423	return (PyObject *)v;
				1424
				1425	onError:
				1426	Py_XDECREF(v);
				1427	return NULL;
				1428	}
				1429
				1430	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1431	int size)
				1432	{
				1433	PyObject *repr;
				1434	char *p;
				1435	char *q;
				1436
				1437	static const char *hexdigit = "0123456789ABCDEF";
				1438
				1439	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1440	if (repr == NULL)
				1441	return NULL;
				1442
				1443	p = q = PyString_AS_STRING(repr);
				1444	while (size-- > 0) {
				1445	Py_UNICODE ch = *s++;
				1446	/* Map 16-bit characters to '\uxxxx' */
				1447	if (ch >= 256) {
				1448	*p++ = '\\';
				1449	*p++ = 'u';
				1450	*p++ = hexdigit[(ch >> 12) & 0xf];
				1451	*p++ = hexdigit[(ch >> 8) & 0xf];
				1452	*p++ = hexdigit[(ch >> 4) & 0xf];
				1453	*p++ = hexdigit[ch & 15];
				1454	}
				1455	/* Copy everything else as-is */
				1456	else
				1457	*p++ = (char) ch;
				1458	}
				1459	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1460	if (_PyString_Resize(&repr, p - q))
				1461	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1462
				1463	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1464
				1465	onError:
				1466	Py_DECREF(repr);
				1467	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1468	}
				1469
				1470	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1471	{
				1472	if (!PyUnicode_Check(unicode)) {
				1473	PyErr_BadArgument();
				1474	return NULL;
				1475	}
				1476	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1477	PyUnicode_GET_SIZE(unicode));
				1478	}
				1479
				1480	/* --- Latin-1 Codec ------------------------------------------------------ */
				1481
				1482	PyObject PyUnicode_DecodeLatin1(const char s,
				1483	int size,
				1484	const char *errors)
				1485	{
				1486	PyUnicodeObject *v;
				1487	Py_UNICODE *p;
				1488
				1489	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1490	v = _PyUnicode_New(size);
				1491	if (v == NULL)
				1492	goto onError;
				1493	if (size == 0)
				1494	return (PyObject *)v;
				1495	p = PyUnicode_AS_UNICODE(v);
				1496	while (size-- > 0)
				1497	p++ = (unsigned char)s++;
				1498	return (PyObject *)v;
				1499
				1500	onError:
				1501	Py_XDECREF(v);
				1502	return NULL;
				1503	}
				1504
				1505	static
				1506	int latin1_encoding_error(const Py_UNICODE **source,
				1507	char **dest,
				1508	const char *errors,
				1509	const char *details)
				1510	{
				1511	if ((errors == NULL) \|\|
				1512	(strcmp(errors,"strict") == 0)) {
				1513	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1514	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1515	details);
				1516	return -1;
				1517	}
				1518	else if (strcmp(errors,"ignore") == 0) {
				1519	return 0;
				1520	}
				1521	else if (strcmp(errors,"replace") == 0) {
				1522	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1523	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1524	return 0;
				1525	}
				1526	else {
				1527	PyErr_Format(PyExc_ValueError,
				1528	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1529	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1530	errors);
				1531	return -1;
				1532	}
				1533	}
				1534
				1535	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1536	int size,
				1537	const char *errors)
				1538	{
				1539	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1540	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1541	repr = PyString_FromStringAndSize(NULL, size);
				1542	if (repr == NULL)
				1543	return NULL;
				1544
				1545	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1546	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1547	while (size-- > 0) {
				1548	Py_UNICODE ch = *p++;
				1549	if (ch >= 256) {
				1550	if (latin1_encoding_error(&p, &s, errors,
				1551	"ordinal not in range(256)"))
				1552	goto onError;
				1553	}
				1554	else
				1555	*s++ = (char)ch;
				1556	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1557	/* Resize if error handling skipped some characters */
				1558	if (s - start < PyString_GET_SIZE(repr))
				1559	if (_PyString_Resize(&repr, s - start))
				1560	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1561	return repr;
				1562
				1563	onError:
				1564	Py_DECREF(repr);
				1565	return NULL;
				1566	}
				1567
				1568	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1569	{
				1570	if (!PyUnicode_Check(unicode)) {
				1571	PyErr_BadArgument();
				1572	return NULL;
				1573	}
				1574	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1575	PyUnicode_GET_SIZE(unicode),
				1576	NULL);
				1577	}
				1578
				1579	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1580
				1581	static
				1582	int ascii_decoding_error(const char **source,
				1583	Py_UNICODE **dest,
				1584	const char *errors,
				1585	const char *details)
				1586	{
				1587	if ((errors == NULL) \|\|
				1588	(strcmp(errors,"strict") == 0)) {
				1589	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1590	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1591	details);
				1592	return -1;
				1593	}
				1594	else if (strcmp(errors,"ignore") == 0) {
				1595	return 0;
				1596	}
				1597	else if (strcmp(errors,"replace") == 0) {
				1598	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1599	(*dest)++;
				1600	return 0;
				1601	}
				1602	else {
				1603	PyErr_Format(PyExc_ValueError,
				1604	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1605	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1606	errors);
				1607	return -1;
				1608	}
				1609	}
				1610
				1611	PyObject PyUnicode_DecodeASCII(const char s,
				1612	int size,
				1613	const char *errors)
				1614	{
				1615	PyUnicodeObject *v;
				1616	Py_UNICODE *p;
				1617
				1618	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1619	v = _PyUnicode_New(size);
				1620	if (v == NULL)
				1621	goto onError;
				1622	if (size == 0)
				1623	return (PyObject *)v;
				1624	p = PyUnicode_AS_UNICODE(v);
				1625	while (size-- > 0) {
				1626	register unsigned char c;
				1627
				1628	c = (unsigned char)*s++;
				1629	if (c < 128)
				1630	*p++ = c;
				1631	else if (ascii_decoding_error(&s, &p, errors,
				1632	"ordinal not in range(128)"))
				1633	goto onError;
				1634	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1635	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1636	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1637	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1638	return (PyObject *)v;
				1639
				1640	onError:
				1641	Py_XDECREF(v);
				1642	return NULL;
				1643	}
				1644
				1645	static
				1646	int ascii_encoding_error(const Py_UNICODE **source,
				1647	char **dest,
				1648	const char *errors,
				1649	const char *details)
				1650	{
				1651	if ((errors == NULL) \|\|
				1652	(strcmp(errors,"strict") == 0)) {
				1653	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1654	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1655	details);
				1656	return -1;
				1657	}
				1658	else if (strcmp(errors,"ignore") == 0) {
				1659	return 0;
				1660	}
				1661	else if (strcmp(errors,"replace") == 0) {
				1662	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1663	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1664	return 0;
				1665	}
				1666	else {
				1667	PyErr_Format(PyExc_ValueError,
				1668	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1669	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1670	errors);
				1671	return -1;
				1672	}
				1673	}
				1674
				1675	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1676	int size,
				1677	const char *errors)
				1678	{
				1679	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1680	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1681	repr = PyString_FromStringAndSize(NULL, size);
				1682	if (repr == NULL)
				1683	return NULL;
				1684
				1685	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1686	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1687	while (size-- > 0) {
				1688	Py_UNICODE ch = *p++;
				1689	if (ch >= 128) {
				1690	if (ascii_encoding_error(&p, &s, errors,
				1691	"ordinal not in range(128)"))
				1692	goto onError;
				1693	}
				1694	else
				1695	*s++ = (char)ch;
				1696	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1697	/* Resize if error handling skipped some characters */
				1698	if (s - start < PyString_GET_SIZE(repr))
				1699	if (_PyString_Resize(&repr, s - start))
				1700	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1701	return repr;
				1702
				1703	onError:
				1704	Py_DECREF(repr);
				1705	return NULL;
				1706	}
				1707
				1708	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1709	{
				1710	if (!PyUnicode_Check(unicode)) {
				1711	PyErr_BadArgument();
				1712	return NULL;
				1713	}
				1714	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1715	PyUnicode_GET_SIZE(unicode),
				1716	NULL);
				1717	}
				1718
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1719	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1720
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1721	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1722
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1723	PyObject PyUnicode_DecodeMBCS(const char s,
				1724	int size,
				1725	const char *errors)
				1726	{
				1727	PyUnicodeObject *v;
				1728	Py_UNICODE *p;
				1729
				1730	/* First get the size of the result */
				1731	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1732	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1733	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1734
				1735	v = _PyUnicode_New(usize);
				1736	if (v == NULL)
				1737	return NULL;
				1738	if (usize == 0)
				1739	return (PyObject *)v;
				1740	p = PyUnicode_AS_UNICODE(v);
				1741	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1742	Py_DECREF(v);
				1743	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1744	}
				1745
				1746	return (PyObject *)v;
				1747	}
				1748
				1749	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1750	int size,
				1751	const char *errors)
				1752	{
				1753	PyObject *repr;
				1754	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1755	DWORD mbcssize;
				1756
				1757	/* If there are no characters, bail now! */
				1758	if (size==0)
				1759	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1760
				1761	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1762	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1763	if (mbcssize==0)
				1764	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1765
				1766	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1767	if (repr == NULL)
				1768	return NULL;
				1769	if (mbcssize==0)
				1770	return repr;
				1771
				1772	/* Do the conversion */
				1773	s = PyString_AS_STRING(repr);
				1774	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1775	Py_DECREF(repr);
				1776	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1777	}
				1778	return repr;
				1779	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1780
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1781	#endif /* MS_WIN32 */
				1782
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1783	/* --- Character Mapping Codec -------------------------------------------- */
				1784
				1785	static
				1786	int charmap_decoding_error(const char **source,
				1787	Py_UNICODE **dest,
				1788	const char *errors,
				1789	const char *details)
				1790	{
				1791	if ((errors == NULL) \|\|
				1792	(strcmp(errors,"strict") == 0)) {
				1793	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1794	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	details);
				1796	return -1;
				1797	}
				1798	else if (strcmp(errors,"ignore") == 0) {
				1799	return 0;
				1800	}
				1801	else if (strcmp(errors,"replace") == 0) {
				1802	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1803	(*dest)++;
				1804	return 0;
				1805	}
				1806	else {
				1807	PyErr_Format(PyExc_ValueError,
				1808	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1809	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1810	errors);
				1811	return -1;
				1812	}
				1813	}
				1814
				1815	PyObject PyUnicode_DecodeCharmap(const char s,
				1816	int size,
				1817	PyObject *mapping,
				1818	const char *errors)
				1819	{
				1820	PyUnicodeObject *v;
				1821	Py_UNICODE *p;
				1822
				1823	/* Default to Latin-1 */
				1824	if (mapping == NULL)
				1825	return PyUnicode_DecodeLatin1(s, size, errors);
				1826
				1827	v = _PyUnicode_New(size);
				1828	if (v == NULL)
				1829	goto onError;
				1830	if (size == 0)
				1831	return (PyObject *)v;
				1832	p = PyUnicode_AS_UNICODE(v);
				1833	while (size-- > 0) {
				1834	unsigned char ch = *s++;
				1835	PyObject w, x;
				1836
				1837	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1838	w = PyInt_FromLong((long)ch);
				1839	if (w == NULL)
				1840	goto onError;
				1841	x = PyObject_GetItem(mapping, w);
				1842	Py_DECREF(w);
				1843	if (x == NULL) {
				1844	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1845	/* No mapping found: default to Latin-1 mapping */
				1846	PyErr_Clear();
				1847	*p++ = (Py_UNICODE)ch;
				1848	continue;
				1849	}
				1850	goto onError;
				1851	}
				1852
				1853	/* Apply mapping */
				1854	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1855	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1856	if (value < 0 \|\| value > 65535) {
				1857	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1858	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1859	Py_DECREF(x);
				1860	goto onError;
				1861	}
				1862	*p++ = (Py_UNICODE)value;
				1863	}
				1864	else if (x == Py_None) {
				1865	/* undefined mapping */
				1866	if (charmap_decoding_error(&s, &p, errors,
				1867	"character maps to <undefined>")) {
				1868	Py_DECREF(x);
				1869	goto onError;
				1870	}
				1871	}
				1872	else if (PyUnicode_Check(x)) {
				1873	if (PyUnicode_GET_SIZE(x) != 1) {
				1874	/* 1-n mapping */
				1875	PyErr_SetString(PyExc_NotImplementedError,
				1876	"1-n mappings are currently not implemented");
				1877	Py_DECREF(x);
				1878	goto onError;
				1879	}
				1880	p++ = PyUnicode_AS_UNICODE(x);
				1881	}
				1882	else {
				1883	/* wrong return value */
				1884	PyErr_SetString(PyExc_TypeError,
				1885	"character mapping must return integer, None or unicode");
				1886	Py_DECREF(x);
				1887	goto onError;
				1888	}
				1889	Py_DECREF(x);
				1890	}
				1891	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1892	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1893	goto onError;
				1894	return (PyObject *)v;
				1895
				1896	onError:
				1897	Py_XDECREF(v);
				1898	return NULL;
				1899	}
				1900
				1901	static
				1902	int charmap_encoding_error(const Py_UNICODE **source,
				1903	char **dest,
				1904	const char *errors,
				1905	const char *details)
				1906	{
				1907	if ((errors == NULL) \|\|
				1908	(strcmp(errors,"strict") == 0)) {
				1909	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1910	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1911	details);
				1912	return -1;
				1913	}
				1914	else if (strcmp(errors,"ignore") == 0) {
				1915	return 0;
				1916	}
				1917	else if (strcmp(errors,"replace") == 0) {
				1918	**dest = '?';
				1919	(*dest)++;
				1920	return 0;
				1921	}
				1922	else {
				1923	PyErr_Format(PyExc_ValueError,
				1924	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1925	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1926	errors);
				1927	return -1;
				1928	}
				1929	}
				1930
				1931	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1932	int size,
				1933	PyObject *mapping,
				1934	const char *errors)
				1935	{
				1936	PyObject *v;
				1937	char *s;
				1938
				1939	/* Default to Latin-1 */
				1940	if (mapping == NULL)
				1941	return PyUnicode_EncodeLatin1(p, size, errors);
				1942
				1943	v = PyString_FromStringAndSize(NULL, size);
				1944	if (v == NULL)
				1945	return NULL;
				1946	s = PyString_AS_STRING(v);
				1947	while (size-- > 0) {
				1948	Py_UNICODE ch = *p++;
				1949	PyObject w, x;
				1950
				1951	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1952	w = PyInt_FromLong((long)ch);
				1953	if (w == NULL)
				1954	goto onError;
				1955	x = PyObject_GetItem(mapping, w);
				1956	Py_DECREF(w);
				1957	if (x == NULL) {
				1958	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1959	/* No mapping found: default to Latin-1 mapping if possible */
				1960	PyErr_Clear();
				1961	if (ch < 256) {
				1962	*s++ = (char)ch;
				1963	continue;
				1964	}
				1965	else if (!charmap_encoding_error(&p, &s, errors,
				1966	"missing character mapping"))
				1967	continue;
				1968	}
				1969	goto onError;
				1970	}
				1971
				1972	/* Apply mapping */
				1973	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	1974	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1975	if (value < 0 \|\| value > 255) {
				1976	PyErr_SetString(PyExc_TypeError,
				1977	"character mapping must be in range(256)");
				1978	Py_DECREF(x);
				1979	goto onError;
				1980	}
				1981	*s++ = (char)value;
				1982	}
				1983	else if (x == Py_None) {
				1984	/* undefined mapping */
				1985	if (charmap_encoding_error(&p, &s, errors,
				1986	"character maps to <undefined>")) {
				1987	Py_DECREF(x);
				1988	goto onError;
				1989	}
				1990	}
				1991	else if (PyString_Check(x)) {
				1992	if (PyString_GET_SIZE(x) != 1) {
				1993	/* 1-n mapping */
				1994	PyErr_SetString(PyExc_NotImplementedError,
				1995	"1-n mappings are currently not implemented");
				1996	Py_DECREF(x);
				1997	goto onError;
				1998	}
				1999	s++ = PyString_AS_STRING(x);
				2000	}
				2001	else {
				2002	/* wrong return value */
				2003	PyErr_SetString(PyExc_TypeError,
				2004	"character mapping must return integer, None or unicode");
				2005	Py_DECREF(x);
				2006	goto onError;
				2007	}
				2008	Py_DECREF(x);
				2009	}
				2010	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2011	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2012	goto onError;
				2013	return v;
				2014
				2015	onError:
				2016	Py_DECREF(v);
				2017	return NULL;
				2018	}
				2019
				2020	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2021	PyObject *mapping)
				2022	{
				2023	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2024	PyErr_BadArgument();
				2025	return NULL;
				2026	}
				2027	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2028	PyUnicode_GET_SIZE(unicode),
				2029	mapping,
				2030	NULL);
				2031	}
				2032
				2033	static
				2034	int translate_error(const Py_UNICODE **source,
				2035	Py_UNICODE **dest,
				2036	const char *errors,
				2037	const char *details)
				2038	{
				2039	if ((errors == NULL) \|\|
				2040	(strcmp(errors,"strict") == 0)) {
				2041	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2042	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2043	details);
				2044	return -1;
				2045	}
				2046	else if (strcmp(errors,"ignore") == 0) {
				2047	return 0;
				2048	}
				2049	else if (strcmp(errors,"replace") == 0) {
				2050	**dest = '?';
				2051	(*dest)++;
				2052	return 0;
				2053	}
				2054	else {
				2055	PyErr_Format(PyExc_ValueError,
				2056	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2057	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2058	errors);
				2059	return -1;
				2060	}
				2061	}
				2062
				2063	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2064	int size,
				2065	PyObject *mapping,
				2066	const char *errors)
				2067	{
				2068	PyUnicodeObject *v;
				2069	Py_UNICODE *p;
				2070
				2071	if (mapping == NULL) {
				2072	PyErr_BadArgument();
				2073	return NULL;
				2074	}
				2075
				2076	/* Output will never be longer than input */
				2077	v = _PyUnicode_New(size);
				2078	if (v == NULL)
				2079	goto onError;
				2080	if (size == 0)
				2081	goto done;
				2082	p = PyUnicode_AS_UNICODE(v);
				2083	while (size-- > 0) {
				2084	Py_UNICODE ch = *s++;
				2085	PyObject w, x;
				2086
				2087	/* Get mapping */
				2088	w = PyInt_FromLong(ch);
				2089	if (w == NULL)
				2090	goto onError;
				2091	x = PyObject_GetItem(mapping, w);
				2092	Py_DECREF(w);
				2093	if (x == NULL) {
				2094	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2095	/* No mapping found: default to 1-1 mapping */
				2096	PyErr_Clear();
				2097	*p++ = ch;
				2098	continue;
				2099	}
				2100	goto onError;
				2101	}
				2102
				2103	/* Apply mapping */
				2104	if (PyInt_Check(x))
				2105	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2106	else if (x == Py_None) {
				2107	/* undefined mapping */
				2108	if (translate_error(&s, &p, errors,
				2109	"character maps to <undefined>")) {
				2110	Py_DECREF(x);
				2111	goto onError;
				2112	}
				2113	}
				2114	else if (PyUnicode_Check(x)) {
				2115	if (PyUnicode_GET_SIZE(x) != 1) {
				2116	/* 1-n mapping */
				2117	PyErr_SetString(PyExc_NotImplementedError,
				2118	"1-n mappings are currently not implemented");
				2119	Py_DECREF(x);
				2120	goto onError;
				2121	}
				2122	p++ = PyUnicode_AS_UNICODE(x);
				2123	}
				2124	else {
				2125	/* wrong return value */
				2126	PyErr_SetString(PyExc_TypeError,
				2127	"translate mapping must return integer, None or unicode");
				2128	Py_DECREF(x);
				2129	goto onError;
				2130	}
				2131	Py_DECREF(x);
				2132	}
				2133	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2134	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2135	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2136
				2137	done:
				2138	return (PyObject *)v;
				2139
				2140	onError:
				2141	Py_XDECREF(v);
				2142	return NULL;
				2143	}
				2144
				2145	PyObject PyUnicode_Translate(PyObject str,
				2146	PyObject *mapping,
				2147	const char *errors)
				2148	{
				2149	PyObject *result;
				2150
				2151	str = PyUnicode_FromObject(str);
				2152	if (str == NULL)
				2153	goto onError;
				2154	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2155	PyUnicode_GET_SIZE(str),
				2156	mapping,
				2157	errors);
				2158	Py_DECREF(str);
				2159	return result;
				2160
				2161	onError:
				2162	Py_XDECREF(str);
				2163	return NULL;
				2164	}
				2165
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2166	/* --- Decimal Encoder ---------------------------------------------------- */
				2167
				2168	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2169	int length,
				2170	char *output,
				2171	const char *errors)
				2172	{
				2173	Py_UNICODE p, end;
				2174
				2175	if (output == NULL) {
				2176	PyErr_BadArgument();
				2177	return -1;
				2178	}
				2179
				2180	p = s;
				2181	end = s + length;
				2182	while (p < end) {
				2183	register Py_UNICODE ch = *p++;
				2184	int decimal;
				2185
				2186	if (Py_UNICODE_ISSPACE(ch)) {
				2187	*output++ = ' ';
				2188	continue;
				2189	}
				2190	decimal = Py_UNICODE_TODECIMAL(ch);
				2191	if (decimal >= 0) {
				2192	*output++ = '0' + decimal;
				2193	continue;
				2194	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2195	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2196	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2197	continue;
				2198	}
				2199	/* All other characters are considered invalid */
				2200	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2201	PyErr_SetString(PyExc_ValueError,
				2202	"invalid decimal Unicode string");
				2203	goto onError;
				2204	}
				2205	else if (strcmp(errors, "ignore") == 0)
				2206	continue;
				2207	else if (strcmp(errors, "replace") == 0) {
				2208	*output++ = '?';
				2209	continue;
				2210	}
				2211	}
				2212	/* 0-terminate the output string */
				2213	*output++ = '\0';
				2214	return 0;
				2215
				2216	onError:
				2217	return -1;
				2218	}
				2219
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2220	/* --- Helpers ------------------------------------------------------------ */
				2221
				2222	static
				2223	int count(PyUnicodeObject *self,
				2224	int start,
				2225	int end,
				2226	PyUnicodeObject *substring)
				2227	{
				2228	int count = 0;
				2229
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2230	if (substring->length == 0)
				2231	return (end - start + 1);
				2232
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2233	end -= substring->length;
				2234
				2235	while (start <= end)
				2236	if (Py_UNICODE_MATCH(self, start, substring)) {
				2237	count++;
				2238	start += substring->length;
				2239	} else
				2240	start++;
				2241
				2242	return count;
				2243	}
				2244
				2245	int PyUnicode_Count(PyObject *str,
				2246	PyObject *substr,
				2247	int start,
				2248	int end)
				2249	{
				2250	int result;
				2251
				2252	str = PyUnicode_FromObject(str);
				2253	if (str == NULL)
				2254	return -1;
				2255	substr = PyUnicode_FromObject(substr);
				2256	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2257	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2258	return -1;
				2259	}
				2260
				2261	result = count((PyUnicodeObject *)str,
				2262	start, end,
				2263	(PyUnicodeObject *)substr);
				2264
				2265	Py_DECREF(str);
				2266	Py_DECREF(substr);
				2267	return result;
				2268	}
				2269
				2270	static
				2271	int findstring(PyUnicodeObject *self,
				2272	PyUnicodeObject *substring,
				2273	int start,
				2274	int end,
				2275	int direction)
				2276	{
				2277	if (start < 0)
				2278	start += self->length;
				2279	if (start < 0)
				2280	start = 0;
				2281
				2282	if (substring->length == 0)
				2283	return start;
				2284
				2285	if (end > self->length)
				2286	end = self->length;
				2287	if (end < 0)
				2288	end += self->length;
				2289	if (end < 0)
				2290	end = 0;
				2291
				2292	end -= substring->length;
				2293
				2294	if (direction < 0) {
				2295	for (; end >= start; end--)
				2296	if (Py_UNICODE_MATCH(self, end, substring))
				2297	return end;
				2298	} else {
				2299	for (; start <= end; start++)
				2300	if (Py_UNICODE_MATCH(self, start, substring))
				2301	return start;
				2302	}
				2303
				2304	return -1;
				2305	}
				2306
				2307	int PyUnicode_Find(PyObject *str,
				2308	PyObject *substr,
				2309	int start,
				2310	int end,
				2311	int direction)
				2312	{
				2313	int result;
				2314
				2315	str = PyUnicode_FromObject(str);
				2316	if (str == NULL)
				2317	return -1;
				2318	substr = PyUnicode_FromObject(substr);
				2319	if (substr == NULL) {
				2320	Py_DECREF(substr);
				2321	return -1;
				2322	}
				2323
				2324	result = findstring((PyUnicodeObject *)str,
				2325	(PyUnicodeObject *)substr,
				2326	start, end, direction);
				2327	Py_DECREF(str);
				2328	Py_DECREF(substr);
				2329	return result;
				2330	}
				2331
				2332	static
				2333	int tailmatch(PyUnicodeObject *self,
				2334	PyUnicodeObject *substring,
				2335	int start,
				2336	int end,
				2337	int direction)
				2338	{
				2339	if (start < 0)
				2340	start += self->length;
				2341	if (start < 0)
				2342	start = 0;
				2343
				2344	if (substring->length == 0)
				2345	return 1;
				2346
				2347	if (end > self->length)
				2348	end = self->length;
				2349	if (end < 0)
				2350	end += self->length;
				2351	if (end < 0)
				2352	end = 0;
				2353
				2354	end -= substring->length;
				2355	if (end < start)
				2356	return 0;
				2357
				2358	if (direction > 0) {
				2359	if (Py_UNICODE_MATCH(self, end, substring))
				2360	return 1;
				2361	} else {
				2362	if (Py_UNICODE_MATCH(self, start, substring))
				2363	return 1;
				2364	}
				2365
				2366	return 0;
				2367	}
				2368
				2369	int PyUnicode_Tailmatch(PyObject *str,
				2370	PyObject *substr,
				2371	int start,
				2372	int end,
				2373	int direction)
				2374	{
				2375	int result;
				2376
				2377	str = PyUnicode_FromObject(str);
				2378	if (str == NULL)
				2379	return -1;
				2380	substr = PyUnicode_FromObject(substr);
				2381	if (substr == NULL) {
				2382	Py_DECREF(substr);
				2383	return -1;
				2384	}
				2385
				2386	result = tailmatch((PyUnicodeObject *)str,
				2387	(PyUnicodeObject *)substr,
				2388	start, end, direction);
				2389	Py_DECREF(str);
				2390	Py_DECREF(substr);
				2391	return result;
				2392	}
				2393
				2394	static
				2395	const Py_UNICODE findchar(const Py_UNICODE s,
				2396	int size,
				2397	Py_UNICODE ch)
				2398	{
				2399	/* like wcschr, but doesn't stop at NULL characters */
				2400
				2401	while (size-- > 0) {
				2402	if (*s == ch)
				2403	return s;
				2404	s++;
				2405	}
				2406
				2407	return NULL;
				2408	}
				2409
				2410	/* Apply fixfct filter to the Unicode object self and return a
				2411	reference to the modified object */
				2412
				2413	static
				2414	PyObject fixup(PyUnicodeObject self,
				2415	int (fixfct)(PyUnicodeObject s))
				2416	{
				2417
				2418	PyUnicodeObject *u;
				2419
				2420	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2421	self->length);
				2422	if (u == NULL)
				2423	return NULL;
				2424	if (!fixfct(u)) {
				2425	/* fixfct should return TRUE if it modified the buffer. If
				2426	FALSE, return a reference to the original buffer instead
				2427	(to save space, not time) */
				2428	Py_INCREF(self);
				2429	Py_DECREF(u);
				2430	return (PyObject*) self;
				2431	}
				2432	return (PyObject*) u;
				2433	}
				2434
				2435	static
				2436	int fixupper(PyUnicodeObject *self)
				2437	{
				2438	int len = self->length;
				2439	Py_UNICODE *s = self->str;
				2440	int status = 0;
				2441
				2442	while (len-- > 0) {
				2443	register Py_UNICODE ch;
				2444
				2445	ch = Py_UNICODE_TOUPPER(*s);
				2446	if (ch != *s) {
				2447	status = 1;
				2448	*s = ch;
				2449	}
				2450	s++;
				2451	}
				2452
				2453	return status;
				2454	}
				2455
				2456	static
				2457	int fixlower(PyUnicodeObject *self)
				2458	{
				2459	int len = self->length;
				2460	Py_UNICODE *s = self->str;
				2461	int status = 0;
				2462
				2463	while (len-- > 0) {
				2464	register Py_UNICODE ch;
				2465
				2466	ch = Py_UNICODE_TOLOWER(*s);
				2467	if (ch != *s) {
				2468	status = 1;
				2469	*s = ch;
				2470	}
				2471	s++;
				2472	}
				2473
				2474	return status;
				2475	}
				2476
				2477	static
				2478	int fixswapcase(PyUnicodeObject *self)
				2479	{
				2480	int len = self->length;
				2481	Py_UNICODE *s = self->str;
				2482	int status = 0;
				2483
				2484	while (len-- > 0) {
				2485	if (Py_UNICODE_ISUPPER(*s)) {
				2486	s = Py_UNICODE_TOLOWER(s);
				2487	status = 1;
				2488	} else if (Py_UNICODE_ISLOWER(*s)) {
				2489	s = Py_UNICODE_TOUPPER(s);
				2490	status = 1;
				2491	}
				2492	s++;
				2493	}
				2494
				2495	return status;
				2496	}
				2497
				2498	static
				2499	int fixcapitalize(PyUnicodeObject *self)
				2500	{
				2501	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2502	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2503	return 1;
				2504	}
				2505	return 0;
				2506	}
				2507
				2508	static
				2509	int fixtitle(PyUnicodeObject *self)
				2510	{
				2511	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2512	register Py_UNICODE *e;
				2513	int previous_is_cased;
				2514
				2515	/* Shortcut for single character strings */
				2516	if (PyUnicode_GET_SIZE(self) == 1) {
				2517	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2518	if (*p != ch) {
				2519	*p = ch;
				2520	return 1;
				2521	}
				2522	else
				2523	return 0;
				2524	}
				2525
				2526	e = p + PyUnicode_GET_SIZE(self);
				2527	previous_is_cased = 0;
				2528	for (; p < e; p++) {
				2529	register const Py_UNICODE ch = *p;
				2530
				2531	if (previous_is_cased)
				2532	*p = Py_UNICODE_TOLOWER(ch);
				2533	else
				2534	*p = Py_UNICODE_TOTITLE(ch);
				2535
				2536	if (Py_UNICODE_ISLOWER(ch) \|\|
				2537	Py_UNICODE_ISUPPER(ch) \|\|
				2538	Py_UNICODE_ISTITLE(ch))
				2539	previous_is_cased = 1;
				2540	else
				2541	previous_is_cased = 0;
				2542	}
				2543	return 1;
				2544	}
				2545
				2546	PyObject PyUnicode_Join(PyObject separator,
				2547	PyObject *seq)
				2548	{
				2549	Py_UNICODE *sep;
				2550	int seplen;
				2551	PyUnicodeObject *res = NULL;
				2552	int reslen = 0;
				2553	Py_UNICODE *p;
				2554	int seqlen = 0;
				2555	int sz = 100;
				2556	int i;
				2557
				2558	seqlen = PySequence_Length(seq);
				2559	if (seqlen < 0 && PyErr_Occurred())
				2560	return NULL;
				2561
				2562	if (separator == NULL) {
				2563	Py_UNICODE blank = ' ';
				2564	sep = &blank;
				2565	seplen = 1;
				2566	}
				2567	else {
				2568	separator = PyUnicode_FromObject(separator);
				2569	if (separator == NULL)
				2570	return NULL;
				2571	sep = PyUnicode_AS_UNICODE(separator);
				2572	seplen = PyUnicode_GET_SIZE(separator);
				2573	}
				2574
				2575	res = _PyUnicode_New(sz);
				2576	if (res == NULL)
				2577	goto onError;
				2578	p = PyUnicode_AS_UNICODE(res);
				2579	reslen = 0;
				2580
				2581	for (i = 0; i < seqlen; i++) {
				2582	int itemlen;
				2583	PyObject *item;
				2584
				2585	item = PySequence_GetItem(seq, i);
				2586	if (item == NULL)
				2587	goto onError;
				2588	if (!PyUnicode_Check(item)) {
				2589	PyObject *v;
				2590	v = PyUnicode_FromObject(item);
				2591	Py_DECREF(item);
				2592	item = v;
				2593	if (item == NULL)
				2594	goto onError;
				2595	}
				2596	itemlen = PyUnicode_GET_SIZE(item);
				2597	while (reslen + itemlen + seplen >= sz) {
				2598	if (_PyUnicode_Resize(res, sz*2))
				2599	goto onError;
				2600	sz *= 2;
				2601	p = PyUnicode_AS_UNICODE(res) + reslen;
				2602	}
				2603	if (i > 0) {
				2604	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2605	p += seplen;
				2606	reslen += seplen;
				2607	}
				2608	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2609	p += itemlen;
				2610	reslen += itemlen;
				2611	Py_DECREF(item);
				2612	}
				2613	if (_PyUnicode_Resize(res, reslen))
				2614	goto onError;
				2615
				2616	Py_XDECREF(separator);
				2617	return (PyObject *)res;
				2618
				2619	onError:
				2620	Py_XDECREF(separator);
				2621	Py_DECREF(res);
				2622	return NULL;
				2623	}
				2624
				2625	static
				2626	PyUnicodeObject pad(PyUnicodeObject self,
				2627	int left,
				2628	int right,
				2629	Py_UNICODE fill)
				2630	{
				2631	PyUnicodeObject *u;
				2632
				2633	if (left < 0)
				2634	left = 0;
				2635	if (right < 0)
				2636	right = 0;
				2637
				2638	if (left == 0 && right == 0) {
				2639	Py_INCREF(self);
				2640	return self;
				2641	}
				2642
				2643	u = _PyUnicode_New(left + self->length + right);
				2644	if (u) {
				2645	if (left)
				2646	Py_UNICODE_FILL(u->str, fill, left);
				2647	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2648	if (right)
				2649	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2650	}
				2651
				2652	return u;
				2653	}
				2654
				2655	#define SPLIT_APPEND(data, left, right) \
				2656	str = PyUnicode_FromUnicode(data + left, right - left); \
				2657	if (!str) \
				2658	goto onError; \
				2659	if (PyList_Append(list, str)) { \
				2660	Py_DECREF(str); \
				2661	goto onError; \
				2662	} \
				2663	else \
				2664	Py_DECREF(str);
				2665
				2666	static
				2667	PyObject split_whitespace(PyUnicodeObject self,
				2668	PyObject *list,
				2669	int maxcount)
				2670	{
				2671	register int i;
				2672	register int j;
				2673	int len = self->length;
				2674	PyObject *str;
				2675
				2676	for (i = j = 0; i < len; ) {
				2677	/* find a token */
				2678	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2679	i++;
				2680	j = i;
				2681	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2682	i++;
				2683	if (j < i) {
				2684	if (maxcount-- <= 0)
				2685	break;
				2686	SPLIT_APPEND(self->str, j, i);
				2687	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2688	i++;
				2689	j = i;
				2690	}
				2691	}
				2692	if (j < len) {
				2693	SPLIT_APPEND(self->str, j, len);
				2694	}
				2695	return list;
				2696
				2697	onError:
				2698	Py_DECREF(list);
				2699	return NULL;
				2700	}
				2701
				2702	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2703	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2704	{
				2705	register int i;
				2706	register int j;
				2707	int len;
				2708	PyObject *list;
				2709	PyObject *str;
				2710	Py_UNICODE *data;
				2711
				2712	string = PyUnicode_FromObject(string);
				2713	if (string == NULL)
				2714	return NULL;
				2715	data = PyUnicode_AS_UNICODE(string);
				2716	len = PyUnicode_GET_SIZE(string);
				2717
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2718	list = PyList_New(0);
				2719	if (!list)
				2720	goto onError;
				2721
				2722	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2723	int eol;
				2724
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2725	/* Find a line and append it */
				2726	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2727	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2728
				2729	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2730	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2731	if (i < len) {
				2732	if (data[i] == '\r' && i + 1 < len &&
				2733	data[i+1] == '\n')
				2734	i += 2;
				2735	else
				2736	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2737	if (keepends)
				2738	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2739	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2740	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2741	j = i;
				2742	}
				2743	if (j < len) {
				2744	SPLIT_APPEND(data, j, len);
				2745	}
				2746
				2747	Py_DECREF(string);
				2748	return list;
				2749
				2750	onError:
				2751	Py_DECREF(list);
				2752	Py_DECREF(string);
				2753	return NULL;
				2754	}
				2755
				2756	static
				2757	PyObject split_char(PyUnicodeObject self,
				2758	PyObject *list,
				2759	Py_UNICODE ch,
				2760	int maxcount)
				2761	{
				2762	register int i;
				2763	register int j;
				2764	int len = self->length;
				2765	PyObject *str;
				2766
				2767	for (i = j = 0; i < len; ) {
				2768	if (self->str[i] == ch) {
				2769	if (maxcount-- <= 0)
				2770	break;
				2771	SPLIT_APPEND(self->str, j, i);
				2772	i = j = i + 1;
				2773	} else
				2774	i++;
				2775	}
				2776	if (j <= len) {
				2777	SPLIT_APPEND(self->str, j, len);
				2778	}
				2779	return list;
				2780
				2781	onError:
				2782	Py_DECREF(list);
				2783	return NULL;
				2784	}
				2785
				2786	static
				2787	PyObject split_substring(PyUnicodeObject self,
				2788	PyObject *list,
				2789	PyUnicodeObject *substring,
				2790	int maxcount)
				2791	{
				2792	register int i;
				2793	register int j;
				2794	int len = self->length;
				2795	int sublen = substring->length;
				2796	PyObject *str;
				2797
				2798	for (i = j = 0; i < len - sublen; ) {
				2799	if (Py_UNICODE_MATCH(self, i, substring)) {
				2800	if (maxcount-- <= 0)
				2801	break;
				2802	SPLIT_APPEND(self->str, j, i);
				2803	i = j = i + sublen;
				2804	} else
				2805	i++;
				2806	}
				2807	if (j <= len) {
				2808	SPLIT_APPEND(self->str, j, len);
				2809	}
				2810	return list;
				2811
				2812	onError:
				2813	Py_DECREF(list);
				2814	return NULL;
				2815	}
				2816
				2817	#undef SPLIT_APPEND
				2818
				2819	static
				2820	PyObject split(PyUnicodeObject self,
				2821	PyUnicodeObject *substring,
				2822	int maxcount)
				2823	{
				2824	PyObject *list;
				2825
				2826	if (maxcount < 0)
				2827	maxcount = INT_MAX;
				2828
				2829	list = PyList_New(0);
				2830	if (!list)
				2831	return NULL;
				2832
				2833	if (substring == NULL)
				2834	return split_whitespace(self,list,maxcount);
				2835
				2836	else if (substring->length == 1)
				2837	return split_char(self,list,substring->str[0],maxcount);
				2838
				2839	else if (substring->length == 0) {
				2840	Py_DECREF(list);
				2841	PyErr_SetString(PyExc_ValueError, "empty separator");
				2842	return NULL;
				2843	}
				2844	else
				2845	return split_substring(self,list,substring,maxcount);
				2846	}
				2847
				2848	static
				2849	PyObject strip(PyUnicodeObject self,
				2850	int left,
				2851	int right)
				2852	{
				2853	Py_UNICODE *p = self->str;
				2854	int start = 0;
				2855	int end = self->length;
				2856
				2857	if (left)
				2858	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2859	start++;
				2860
				2861	if (right)
				2862	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2863	end--;
				2864
				2865	if (start == 0 && end == self->length) {
				2866	/* couldn't strip anything off, return original string */
				2867	Py_INCREF(self);
				2868	return (PyObject*) self;
				2869	}
				2870
				2871	return (PyObject*) PyUnicode_FromUnicode(
				2872	self->str + start,
				2873	end - start
				2874	);
				2875	}
				2876
				2877	static
				2878	PyObject replace(PyUnicodeObject self,
				2879	PyUnicodeObject *str1,
				2880	PyUnicodeObject *str2,
				2881	int maxcount)
				2882	{
				2883	PyUnicodeObject *u;
				2884
				2885	if (maxcount < 0)
				2886	maxcount = INT_MAX;
				2887
				2888	if (str1->length == 1 && str2->length == 1) {
				2889	int i;
				2890
				2891	/* replace characters */
				2892	if (!findchar(self->str, self->length, str1->str[0])) {
				2893	/* nothing to replace, return original string */
				2894	Py_INCREF(self);
				2895	u = self;
				2896	} else {
				2897	Py_UNICODE u1 = str1->str[0];
				2898	Py_UNICODE u2 = str2->str[0];
				2899
				2900	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2901	self->str,
				2902	self->length
				2903	);
				2904	if (u)
				2905	for (i = 0; i < u->length; i++)
				2906	if (u->str[i] == u1) {
				2907	if (--maxcount < 0)
				2908	break;
				2909	u->str[i] = u2;
				2910	}
				2911	}
				2912
				2913	} else {
				2914	int n, i;
				2915	Py_UNICODE *p;
				2916
				2917	/* replace strings */
				2918	n = count(self, 0, self->length, str1);
				2919	if (n > maxcount)
				2920	n = maxcount;
				2921	if (n == 0) {
				2922	/* nothing to replace, return original string */
				2923	Py_INCREF(self);
				2924	u = self;
				2925	} else {
				2926	u = _PyUnicode_New(
				2927	self->length + n * (str2->length - str1->length));
				2928	if (u) {
				2929	i = 0;
				2930	p = u->str;
				2931	while (i <= self->length - str1->length)
				2932	if (Py_UNICODE_MATCH(self, i, str1)) {
				2933	/* replace string segment */
				2934	Py_UNICODE_COPY(p, str2->str, str2->length);
				2935	p += str2->length;
				2936	i += str1->length;
				2937	if (--n <= 0) {
				2938	/* copy remaining part */
				2939	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2940	break;
				2941	}
				2942	} else
				2943	*p++ = self->str[i++];
				2944	}
				2945	}
				2946	}
				2947
				2948	return (PyObject *) u;
				2949	}
				2950
				2951	/* --- Unicode Object Methods --------------------------------------------- */
				2952
				2953	static char title__doc__[] =
				2954	"S.title() -> unicode\n\
				2955	\n\
				2956	Return a titlecased version of S, i.e. words start with title case\n\
				2957	characters, all remaining cased characters have lower case.";
				2958
				2959	static PyObject*
				2960	unicode_title(PyUnicodeObject self, PyObject args)
				2961	{
				2962	if (!PyArg_NoArgs(args))
				2963	return NULL;
				2964	return fixup(self, fixtitle);
				2965	}
				2966
				2967	static char capitalize__doc__[] =
				2968	"S.capitalize() -> unicode\n\
				2969	\n\
				2970	Return a capitalized version of S, i.e. make the first character\n\
				2971	have upper case.";
				2972
				2973	static PyObject*
				2974	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2975	{
				2976	if (!PyArg_NoArgs(args))
				2977	return NULL;
				2978	return fixup(self, fixcapitalize);
				2979	}
				2980
				2981	#if 0
				2982	static char capwords__doc__[] =
				2983	"S.capwords() -> unicode\n\
				2984	\n\
				2985	Apply .capitalize() to all words in S and return the result with\n\
				2986	normalized whitespace (all whitespace strings are replaced by ' ').";
				2987
				2988	static PyObject*
				2989	unicode_capwords(PyUnicodeObject self, PyObject args)
				2990	{
				2991	PyObject *list;
				2992	PyObject *item;
				2993	int i;
				2994
				2995	if (!PyArg_NoArgs(args))
				2996	return NULL;
				2997
				2998	/* Split into words */
				2999	list = split(self, NULL, -1);
				3000	if (!list)
				3001	return NULL;
				3002
				3003	/* Capitalize each word */
				3004	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3005	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3006	fixcapitalize);
				3007	if (item == NULL)
				3008	goto onError;
				3009	Py_DECREF(PyList_GET_ITEM(list, i));
				3010	PyList_SET_ITEM(list, i, item);
				3011	}
				3012
				3013	/* Join the words to form a new string */
				3014	item = PyUnicode_Join(NULL, list);
				3015
				3016	onError:
				3017	Py_DECREF(list);
				3018	return (PyObject *)item;
				3019	}
				3020	#endif
				3021
				3022	static char center__doc__[] =
				3023	"S.center(width) -> unicode\n\
				3024	\n\
				3025	Return S centered in a Unicode string of length width. Padding is done\n\
				3026	using spaces.";
				3027
				3028	static PyObject *
				3029	unicode_center(PyUnicodeObject self, PyObject args)
				3030	{
				3031	int marg, left;
				3032	int width;
				3033
				3034	if (!PyArg_ParseTuple(args, "i:center", &width))
				3035	return NULL;
				3036
				3037	if (self->length >= width) {
				3038	Py_INCREF(self);
				3039	return (PyObject*) self;
				3040	}
				3041
				3042	marg = width - self->length;
				3043	left = marg / 2 + (marg & width & 1);
				3044
				3045	return (PyObject*) pad(self, left, marg - left, ' ');
				3046	}
				3047
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3048	/* speedy UTF-16 code point order comparison */
				3049	/* gleaned from: */
				3050	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3051
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame^]	3052	static unsigned long utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3053	{
				3054	0, 0, 0, 0, 0, 0, 0, 0,
				3055	0, 0, 0, 0, 0, 0, 0, 0,
				3056	0, 0, 0, 0, 0, 0, 0, 0,
				3057	0, 0, 0, 0x2000, 0xf800, 0xf800, 0xf800, 0xf800
				3058	};
				3059
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3060	static int
				3061	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3062	{
				3063	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3064
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3065	Py_UNICODE *s1 = str1->str;
				3066	Py_UNICODE *s2 = str2->str;
				3067
				3068	len1 = str1->length;
				3069	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3070
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3071	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	449c325	2000-07-06 20:13:23 +0000	[diff] [blame^]	3072	unsigned long c1, c2;
				3073	long diff;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3074
				3075	c1 = *s1++;
				3076	c2 = *s2++;
				3077	if (c1 > (1<<11) * 26)
				3078	c1 += utf16Fixup[c1>>11];
				3079	if (c2 > (1<<11) * 26)
				3080	c2 += utf16Fixup[c2>>11];
				3081
				3082	/* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	3083	diff = (long)c1 - (long)c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3084	if (diff)
				3085	return (diff < 0) ? -1 : (diff != 0);
				3086	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3087	}
				3088
				3089	return (len1 < len2) ? -1 : (len1 != len2);
				3090	}
				3091
				3092	int PyUnicode_Compare(PyObject *left,
				3093	PyObject *right)
				3094	{
				3095	PyUnicodeObject u = NULL, v = NULL;
				3096	int result;
				3097
				3098	/* Coerce the two arguments */
				3099	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3100	if (u == NULL)
				3101	goto onError;
				3102	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3103	if (v == NULL)
				3104	goto onError;
				3105
				3106	/* Shortcut for emtpy or interned objects */
				3107	if (v == u) {
				3108	Py_DECREF(u);
				3109	Py_DECREF(v);
				3110	return 0;
				3111	}
				3112
				3113	result = unicode_compare(u, v);
				3114
				3115	Py_DECREF(u);
				3116	Py_DECREF(v);
				3117	return result;
				3118
				3119	onError:
				3120	Py_XDECREF(u);
				3121	Py_XDECREF(v);
				3122	return -1;
				3123	}
				3124
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3125	int PyUnicode_Contains(PyObject *container,
				3126	PyObject *element)
				3127	{
				3128	PyUnicodeObject u = NULL, v = NULL;
				3129	int result;
				3130	register const Py_UNICODE p, e;
				3131	register Py_UNICODE ch;
				3132
				3133	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3134	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3135	if (v == NULL) {
				3136	PyErr_SetString(PyExc_TypeError,
				3137	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3138	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3139	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3140	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3141	if (u == NULL) {
				3142	Py_DECREF(v);
				3143	goto onError;
				3144	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3145
				3146	/* Check v in u */
				3147	if (PyUnicode_GET_SIZE(v) != 1) {
				3148	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3149	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3150	goto onError;
				3151	}
				3152	ch = *PyUnicode_AS_UNICODE(v);
				3153	p = PyUnicode_AS_UNICODE(u);
				3154	e = p + PyUnicode_GET_SIZE(u);
				3155	result = 0;
				3156	while (p < e) {
				3157	if (*p++ == ch) {
				3158	result = 1;
				3159	break;
				3160	}
				3161	}
				3162
				3163	Py_DECREF(u);
				3164	Py_DECREF(v);
				3165	return result;
				3166
				3167	onError:
				3168	Py_XDECREF(u);
				3169	Py_XDECREF(v);
				3170	return -1;
				3171	}
				3172
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3173	/* Concat to string or Unicode object giving a new Unicode object. */
				3174
				3175	PyObject PyUnicode_Concat(PyObject left,
				3176	PyObject *right)
				3177	{
				3178	PyUnicodeObject u = NULL, v = NULL, *w;
				3179
				3180	/* Coerce the two arguments */
				3181	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3182	if (u == NULL)
				3183	goto onError;
				3184	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3185	if (v == NULL)
				3186	goto onError;
				3187
				3188	/* Shortcuts */
				3189	if (v == unicode_empty) {
				3190	Py_DECREF(v);
				3191	return (PyObject *)u;
				3192	}
				3193	if (u == unicode_empty) {
				3194	Py_DECREF(u);
				3195	return (PyObject *)v;
				3196	}
				3197
				3198	/* Concat the two Unicode strings */
				3199	w = _PyUnicode_New(u->length + v->length);
				3200	if (w == NULL)
				3201	goto onError;
				3202	Py_UNICODE_COPY(w->str, u->str, u->length);
				3203	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3204
				3205	Py_DECREF(u);
				3206	Py_DECREF(v);
				3207	return (PyObject *)w;
				3208
				3209	onError:
				3210	Py_XDECREF(u);
				3211	Py_XDECREF(v);
				3212	return NULL;
				3213	}
				3214
				3215	static char count__doc__[] =
				3216	"S.count(sub[, start[, end]]) -> int\n\
				3217	\n\
				3218	Return the number of occurrences of substring sub in Unicode string\n\
				3219	S[start:end]. Optional arguments start and end are\n\
				3220	interpreted as in slice notation.";
				3221
				3222	static PyObject *
				3223	unicode_count(PyUnicodeObject self, PyObject args)
				3224	{
				3225	PyUnicodeObject *substring;
				3226	int start = 0;
				3227	int end = INT_MAX;
				3228	PyObject *result;
				3229
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3230	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3231	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3232	return NULL;
				3233
				3234	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3235	(PyObject *)substring);
				3236	if (substring == NULL)
				3237	return NULL;
				3238
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3239	if (start < 0)
				3240	start += self->length;
				3241	if (start < 0)
				3242	start = 0;
				3243	if (end > self->length)
				3244	end = self->length;
				3245	if (end < 0)
				3246	end += self->length;
				3247	if (end < 0)
				3248	end = 0;
				3249
				3250	result = PyInt_FromLong((long) count(self, start, end, substring));
				3251
				3252	Py_DECREF(substring);
				3253	return result;
				3254	}
				3255
				3256	static char encode__doc__[] =
				3257	"S.encode([encoding[,errors]]) -> string\n\
				3258	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3259	Return an encoded string version of S. Default encoding is the current\n\
				3260	default string encoding. errors may be given to set a different error\n\
				3261	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3262	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3263
				3264	static PyObject *
				3265	unicode_encode(PyUnicodeObject self, PyObject args)
				3266	{
				3267	char *encoding = NULL;
				3268	char *errors = NULL;
				3269	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3270	return NULL;
				3271	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3272	}
				3273
				3274	static char expandtabs__doc__[] =
				3275	"S.expandtabs([tabsize]) -> unicode\n\
				3276	\n\
				3277	Return a copy of S where all tab characters are expanded using spaces.\n\
				3278	If tabsize is not given, a tab size of 8 characters is assumed.";
				3279
				3280	static PyObject*
				3281	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3282	{
				3283	Py_UNICODE *e;
				3284	Py_UNICODE *p;
				3285	Py_UNICODE *q;
				3286	int i, j;
				3287	PyUnicodeObject *u;
				3288	int tabsize = 8;
				3289
				3290	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3291	return NULL;
				3292
				3293	/* First pass: determine size of ouput string */
				3294	i = j = 0;
				3295	e = self->str + self->length;
				3296	for (p = self->str; p < e; p++)
				3297	if (*p == '\t') {
				3298	if (tabsize > 0)
				3299	j += tabsize - (j % tabsize);
				3300	}
				3301	else {
				3302	j++;
				3303	if (p == '\n' \|\| p == '\r') {
				3304	i += j;
				3305	j = 0;
				3306	}
				3307	}
				3308
				3309	/* Second pass: create output string and fill it */
				3310	u = _PyUnicode_New(i + j);
				3311	if (!u)
				3312	return NULL;
				3313
				3314	j = 0;
				3315	q = u->str;
				3316
				3317	for (p = self->str; p < e; p++)
				3318	if (*p == '\t') {
				3319	if (tabsize > 0) {
				3320	i = tabsize - (j % tabsize);
				3321	j += i;
				3322	while (i--)
				3323	*q++ = ' ';
				3324	}
				3325	}
				3326	else {
				3327	j++;
				3328	q++ = p;
				3329	if (p == '\n' \|\| p == '\r')
				3330	j = 0;
				3331	}
				3332
				3333	return (PyObject*) u;
				3334	}
				3335
				3336	static char find__doc__[] =
				3337	"S.find(sub [,start [,end]]) -> int\n\
				3338	\n\
				3339	Return the lowest index in S where substring sub is found,\n\
				3340	such that sub is contained within s[start,end]. Optional\n\
				3341	arguments start and end are interpreted as in slice notation.\n\
				3342	\n\
				3343	Return -1 on failure.";
				3344
				3345	static PyObject *
				3346	unicode_find(PyUnicodeObject self, PyObject args)
				3347	{
				3348	PyUnicodeObject *substring;
				3349	int start = 0;
				3350	int end = INT_MAX;
				3351	PyObject *result;
				3352
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3353	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3354	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3355	return NULL;
				3356	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3357	(PyObject *)substring);
				3358	if (substring == NULL)
				3359	return NULL;
				3360
				3361	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3362
				3363	Py_DECREF(substring);
				3364	return result;
				3365	}
				3366
				3367	static PyObject *
				3368	unicode_getitem(PyUnicodeObject *self, int index)
				3369	{
				3370	if (index < 0 \|\| index >= self->length) {
				3371	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3372	return NULL;
				3373	}
				3374
				3375	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3376	}
				3377
				3378	static long
				3379	unicode_hash(PyUnicodeObject *self)
				3380	{
				3381	long hash;
				3382	PyObject *utf8;
				3383
				3384	/* Since Unicode objects compare equal to their UTF-8 string
				3385	counterparts, they should also use the UTF-8 strings as basis
				3386	for their hash value. This is needed to assure that strings and
				3387	Unicode objects behave in the same way as dictionary
				3388	keys. Unfortunately, this costs some performance and also some
				3389	memory if the cached UTF-8 representation is not used later
				3390	on. */
				3391	if (self->hash != -1)
				3392	return self->hash;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	3393	utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3394	if (utf8 == NULL)
				3395	return -1;
				3396	hash = PyObject_Hash(utf8);
				3397	if (hash == -1)
				3398	return -1;
				3399	self->hash = hash;
				3400	return hash;
				3401	}
				3402
				3403	static char index__doc__[] =
				3404	"S.index(sub [,start [,end]]) -> int\n\
				3405	\n\
				3406	Like S.find() but raise ValueError when the substring is not found.";
				3407
				3408	static PyObject *
				3409	unicode_index(PyUnicodeObject self, PyObject args)
				3410	{
				3411	int result;
				3412	PyUnicodeObject *substring;
				3413	int start = 0;
				3414	int end = INT_MAX;
				3415
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3416	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3417	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3418	return NULL;
				3419
				3420	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3421	(PyObject *)substring);
				3422	if (substring == NULL)
				3423	return NULL;
				3424
				3425	result = findstring(self, substring, start, end, 1);
				3426
				3427	Py_DECREF(substring);
				3428	if (result < 0) {
				3429	PyErr_SetString(PyExc_ValueError, "substring not found");
				3430	return NULL;
				3431	}
				3432	return PyInt_FromLong(result);
				3433	}
				3434
				3435	static char islower__doc__[] =
				3436	"S.islower() -> int\n\
				3437	\n\
				3438	Return 1 if all cased characters in S are lowercase and there is\n\
				3439	at least one cased character in S, 0 otherwise.";
				3440
				3441	static PyObject*
				3442	unicode_islower(PyUnicodeObject self, PyObject args)
				3443	{
				3444	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3445	register const Py_UNICODE *e;
				3446	int cased;
				3447
				3448	if (!PyArg_NoArgs(args))
				3449	return NULL;
				3450
				3451	/* Shortcut for single character strings */
				3452	if (PyUnicode_GET_SIZE(self) == 1)
				3453	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3454
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3455	/* Special case for empty strings */
				3456	if (PyString_GET_SIZE(self) == 0)
				3457	return PyInt_FromLong(0);
				3458
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3459	e = p + PyUnicode_GET_SIZE(self);
				3460	cased = 0;
				3461	for (; p < e; p++) {
				3462	register const Py_UNICODE ch = *p;
				3463
				3464	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3465	return PyInt_FromLong(0);
				3466	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3467	cased = 1;
				3468	}
				3469	return PyInt_FromLong(cased);
				3470	}
				3471
				3472	static char isupper__doc__[] =
				3473	"S.isupper() -> int\n\
				3474	\n\
				3475	Return 1 if all cased characters in S are uppercase and there is\n\
				3476	at least one cased character in S, 0 otherwise.";
				3477
				3478	static PyObject*
				3479	unicode_isupper(PyUnicodeObject self, PyObject args)
				3480	{
				3481	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3482	register const Py_UNICODE *e;
				3483	int cased;
				3484
				3485	if (!PyArg_NoArgs(args))
				3486	return NULL;
				3487
				3488	/* Shortcut for single character strings */
				3489	if (PyUnicode_GET_SIZE(self) == 1)
				3490	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3491
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3492	/* Special case for empty strings */
				3493	if (PyString_GET_SIZE(self) == 0)
				3494	return PyInt_FromLong(0);
				3495
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3496	e = p + PyUnicode_GET_SIZE(self);
				3497	cased = 0;
				3498	for (; p < e; p++) {
				3499	register const Py_UNICODE ch = *p;
				3500
				3501	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3502	return PyInt_FromLong(0);
				3503	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3504	cased = 1;
				3505	}
				3506	return PyInt_FromLong(cased);
				3507	}
				3508
				3509	static char istitle__doc__[] =
				3510	"S.istitle() -> int\n\
				3511	\n\
				3512	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3513	may only follow uncased characters and lowercase characters only cased\n\
				3514	ones. Return 0 otherwise.";
				3515
				3516	static PyObject*
				3517	unicode_istitle(PyUnicodeObject self, PyObject args)
				3518	{
				3519	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3520	register const Py_UNICODE *e;
				3521	int cased, previous_is_cased;
				3522
				3523	if (!PyArg_NoArgs(args))
				3524	return NULL;
				3525
				3526	/* Shortcut for single character strings */
				3527	if (PyUnicode_GET_SIZE(self) == 1)
				3528	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3529	(Py_UNICODE_ISUPPER(*p) != 0));
				3530
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3531	/* Special case for empty strings */
				3532	if (PyString_GET_SIZE(self) == 0)
				3533	return PyInt_FromLong(0);
				3534
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3535	e = p + PyUnicode_GET_SIZE(self);
				3536	cased = 0;
				3537	previous_is_cased = 0;
				3538	for (; p < e; p++) {
				3539	register const Py_UNICODE ch = *p;
				3540
				3541	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3542	if (previous_is_cased)
				3543	return PyInt_FromLong(0);
				3544	previous_is_cased = 1;
				3545	cased = 1;
				3546	}
				3547	else if (Py_UNICODE_ISLOWER(ch)) {
				3548	if (!previous_is_cased)
				3549	return PyInt_FromLong(0);
				3550	previous_is_cased = 1;
				3551	cased = 1;
				3552	}
				3553	else
				3554	previous_is_cased = 0;
				3555	}
				3556	return PyInt_FromLong(cased);
				3557	}
				3558
				3559	static char isspace__doc__[] =
				3560	"S.isspace() -> int\n\
				3561	\n\
				3562	Return 1 if there are only whitespace characters in S,\n\
				3563	0 otherwise.";
				3564
				3565	static PyObject*
				3566	unicode_isspace(PyUnicodeObject self, PyObject args)
				3567	{
				3568	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3569	register const Py_UNICODE *e;
				3570
				3571	if (!PyArg_NoArgs(args))
				3572	return NULL;
				3573
				3574	/* Shortcut for single character strings */
				3575	if (PyUnicode_GET_SIZE(self) == 1 &&
				3576	Py_UNICODE_ISSPACE(*p))
				3577	return PyInt_FromLong(1);
				3578
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3579	/* Special case for empty strings */
				3580	if (PyString_GET_SIZE(self) == 0)
				3581	return PyInt_FromLong(0);
				3582
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3583	e = p + PyUnicode_GET_SIZE(self);
				3584	for (; p < e; p++) {
				3585	if (!Py_UNICODE_ISSPACE(*p))
				3586	return PyInt_FromLong(0);
				3587	}
				3588	return PyInt_FromLong(1);
				3589	}
				3590
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3591	static char isalpha__doc__[] =
				3592	"S.isalpha() -> int\n\
				3593	\n\
				3594	Return 1 if all characters in S are alphabetic\n\
				3595	and there is at least one character in S, 0 otherwise.";
				3596
				3597	static PyObject*
				3598	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3599	{
				3600	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3601	register const Py_UNICODE *e;
				3602
				3603	if (!PyArg_NoArgs(args))
				3604	return NULL;
				3605
				3606	/* Shortcut for single character strings */
				3607	if (PyUnicode_GET_SIZE(self) == 1 &&
				3608	Py_UNICODE_ISALPHA(*p))
				3609	return PyInt_FromLong(1);
				3610
				3611	/* Special case for empty strings */
				3612	if (PyString_GET_SIZE(self) == 0)
				3613	return PyInt_FromLong(0);
				3614
				3615	e = p + PyUnicode_GET_SIZE(self);
				3616	for (; p < e; p++) {
				3617	if (!Py_UNICODE_ISALPHA(*p))
				3618	return PyInt_FromLong(0);
				3619	}
				3620	return PyInt_FromLong(1);
				3621	}
				3622
				3623	static char isalnum__doc__[] =
				3624	"S.isalnum() -> int\n\
				3625	\n\
				3626	Return 1 if all characters in S are alphanumeric\n\
				3627	and there is at least one character in S, 0 otherwise.";
				3628
				3629	static PyObject*
				3630	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3631	{
				3632	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3633	register const Py_UNICODE *e;
				3634
				3635	if (!PyArg_NoArgs(args))
				3636	return NULL;
				3637
				3638	/* Shortcut for single character strings */
				3639	if (PyUnicode_GET_SIZE(self) == 1 &&
				3640	Py_UNICODE_ISALNUM(*p))
				3641	return PyInt_FromLong(1);
				3642
				3643	/* Special case for empty strings */
				3644	if (PyString_GET_SIZE(self) == 0)
				3645	return PyInt_FromLong(0);
				3646
				3647	e = p + PyUnicode_GET_SIZE(self);
				3648	for (; p < e; p++) {
				3649	if (!Py_UNICODE_ISALNUM(*p))
				3650	return PyInt_FromLong(0);
				3651	}
				3652	return PyInt_FromLong(1);
				3653	}
				3654
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3655	static char isdecimal__doc__[] =
				3656	"S.isdecimal() -> int\n\
				3657	\n\
				3658	Return 1 if there are only decimal characters in S,\n\
				3659	0 otherwise.";
				3660
				3661	static PyObject*
				3662	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3663	{
				3664	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3665	register const Py_UNICODE *e;
				3666
				3667	if (!PyArg_NoArgs(args))
				3668	return NULL;
				3669
				3670	/* Shortcut for single character strings */
				3671	if (PyUnicode_GET_SIZE(self) == 1 &&
				3672	Py_UNICODE_ISDECIMAL(*p))
				3673	return PyInt_FromLong(1);
				3674
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3675	/* Special case for empty strings */
				3676	if (PyString_GET_SIZE(self) == 0)
				3677	return PyInt_FromLong(0);
				3678
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3679	e = p + PyUnicode_GET_SIZE(self);
				3680	for (; p < e; p++) {
				3681	if (!Py_UNICODE_ISDECIMAL(*p))
				3682	return PyInt_FromLong(0);
				3683	}
				3684	return PyInt_FromLong(1);
				3685	}
				3686
				3687	static char isdigit__doc__[] =
				3688	"S.isdigit() -> int\n\
				3689	\n\
				3690	Return 1 if there are only digit characters in S,\n\
				3691	0 otherwise.";
				3692
				3693	static PyObject*
				3694	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3695	{
				3696	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3697	register const Py_UNICODE *e;
				3698
				3699	if (!PyArg_NoArgs(args))
				3700	return NULL;
				3701
				3702	/* Shortcut for single character strings */
				3703	if (PyUnicode_GET_SIZE(self) == 1 &&
				3704	Py_UNICODE_ISDIGIT(*p))
				3705	return PyInt_FromLong(1);
				3706
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3707	/* Special case for empty strings */
				3708	if (PyString_GET_SIZE(self) == 0)
				3709	return PyInt_FromLong(0);
				3710
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3711	e = p + PyUnicode_GET_SIZE(self);
				3712	for (; p < e; p++) {
				3713	if (!Py_UNICODE_ISDIGIT(*p))
				3714	return PyInt_FromLong(0);
				3715	}
				3716	return PyInt_FromLong(1);
				3717	}
				3718
				3719	static char isnumeric__doc__[] =
				3720	"S.isnumeric() -> int\n\
				3721	\n\
				3722	Return 1 if there are only numeric characters in S,\n\
				3723	0 otherwise.";
				3724
				3725	static PyObject*
				3726	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3727	{
				3728	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3729	register const Py_UNICODE *e;
				3730
				3731	if (!PyArg_NoArgs(args))
				3732	return NULL;
				3733
				3734	/* Shortcut for single character strings */
				3735	if (PyUnicode_GET_SIZE(self) == 1 &&
				3736	Py_UNICODE_ISNUMERIC(*p))
				3737	return PyInt_FromLong(1);
				3738
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3739	/* Special case for empty strings */
				3740	if (PyString_GET_SIZE(self) == 0)
				3741	return PyInt_FromLong(0);
				3742
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3743	e = p + PyUnicode_GET_SIZE(self);
				3744	for (; p < e; p++) {
				3745	if (!Py_UNICODE_ISNUMERIC(*p))
				3746	return PyInt_FromLong(0);
				3747	}
				3748	return PyInt_FromLong(1);
				3749	}
				3750
				3751	static char join__doc__[] =
				3752	"S.join(sequence) -> unicode\n\
				3753	\n\
				3754	Return a string which is the concatenation of the strings in the\n\
				3755	sequence. The separator between elements is S.";
				3756
				3757	static PyObject*
				3758	unicode_join(PyUnicodeObject self, PyObject args)
				3759	{
				3760	PyObject *data;
				3761	if (!PyArg_ParseTuple(args, "O:join", &data))
				3762	return NULL;
				3763
				3764	return PyUnicode_Join((PyObject *)self, data);
				3765	}
				3766
				3767	static int
				3768	unicode_length(PyUnicodeObject *self)
				3769	{
				3770	return self->length;
				3771	}
				3772
				3773	static char ljust__doc__[] =
				3774	"S.ljust(width) -> unicode\n\
				3775	\n\
				3776	Return S left justified in a Unicode string of length width. Padding is\n\
				3777	done using spaces.";
				3778
				3779	static PyObject *
				3780	unicode_ljust(PyUnicodeObject self, PyObject args)
				3781	{
				3782	int width;
				3783	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3784	return NULL;
				3785
				3786	if (self->length >= width) {
				3787	Py_INCREF(self);
				3788	return (PyObject*) self;
				3789	}
				3790
				3791	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3792	}
				3793
				3794	static char lower__doc__[] =
				3795	"S.lower() -> unicode\n\
				3796	\n\
				3797	Return a copy of the string S converted to lowercase.";
				3798
				3799	static PyObject*
				3800	unicode_lower(PyUnicodeObject self, PyObject args)
				3801	{
				3802	if (!PyArg_NoArgs(args))
				3803	return NULL;
				3804	return fixup(self, fixlower);
				3805	}
				3806
				3807	static char lstrip__doc__[] =
				3808	"S.lstrip() -> unicode\n\
				3809	\n\
				3810	Return a copy of the string S with leading whitespace removed.";
				3811
				3812	static PyObject *
				3813	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3814	{
				3815	if (!PyArg_NoArgs(args))
				3816	return NULL;
				3817	return strip(self, 1, 0);
				3818	}
				3819
				3820	static PyObject*
				3821	unicode_repeat(PyUnicodeObject *str, int len)
				3822	{
				3823	PyUnicodeObject *u;
				3824	Py_UNICODE *p;
				3825
				3826	if (len < 0)
				3827	len = 0;
				3828
				3829	if (len == 1) {
				3830	/* no repeat, return original string */
				3831	Py_INCREF(str);
				3832	return (PyObject*) str;
				3833	}
				3834
				3835	u = _PyUnicode_New(len * str->length);
				3836	if (!u)
				3837	return NULL;
				3838
				3839	p = u->str;
				3840
				3841	while (len-- > 0) {
				3842	Py_UNICODE_COPY(p, str->str, str->length);
				3843	p += str->length;
				3844	}
				3845
				3846	return (PyObject*) u;
				3847	}
				3848
				3849	PyObject PyUnicode_Replace(PyObject obj,
				3850	PyObject *subobj,
				3851	PyObject *replobj,
				3852	int maxcount)
				3853	{
				3854	PyObject *self;
				3855	PyObject *str1;
				3856	PyObject *str2;
				3857	PyObject *result;
				3858
				3859	self = PyUnicode_FromObject(obj);
				3860	if (self == NULL)
				3861	return NULL;
				3862	str1 = PyUnicode_FromObject(subobj);
				3863	if (str1 == NULL) {
				3864	Py_DECREF(self);
				3865	return NULL;
				3866	}
				3867	str2 = PyUnicode_FromObject(replobj);
				3868	if (str2 == NULL) {
				3869	Py_DECREF(self);
				3870	Py_DECREF(str1);
				3871	return NULL;
				3872	}
				3873	result = replace((PyUnicodeObject *)self,
				3874	(PyUnicodeObject *)str1,
				3875	(PyUnicodeObject *)str2,
				3876	maxcount);
				3877	Py_DECREF(self);
				3878	Py_DECREF(str1);
				3879	Py_DECREF(str2);
				3880	return result;
				3881	}
				3882
				3883	static char replace__doc__[] =
				3884	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3885	\n\
				3886	Return a copy of S with all occurrences of substring\n\
				3887	old replaced by new. If the optional argument maxsplit is\n\
				3888	given, only the first maxsplit occurrences are replaced.";
				3889
				3890	static PyObject*
				3891	unicode_replace(PyUnicodeObject self, PyObject args)
				3892	{
				3893	PyUnicodeObject *str1;
				3894	PyUnicodeObject *str2;
				3895	int maxcount = -1;
				3896	PyObject *result;
				3897
				3898	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3899	return NULL;
				3900	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3901	if (str1 == NULL)
				3902	return NULL;
				3903	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3904	if (str2 == NULL)
				3905	return NULL;
				3906
				3907	result = replace(self, str1, str2, maxcount);
				3908
				3909	Py_DECREF(str1);
				3910	Py_DECREF(str2);
				3911	return result;
				3912	}
				3913
				3914	static
				3915	PyObject unicode_repr(PyObject unicode)
				3916	{
				3917	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3918	PyUnicode_GET_SIZE(unicode),
				3919	1);
				3920	}
				3921
				3922	static char rfind__doc__[] =
				3923	"S.rfind(sub [,start [,end]]) -> int\n\
				3924	\n\
				3925	Return the highest index in S where substring sub is found,\n\
				3926	such that sub is contained within s[start,end]. Optional\n\
				3927	arguments start and end are interpreted as in slice notation.\n\
				3928	\n\
				3929	Return -1 on failure.";
				3930
				3931	static PyObject *
				3932	unicode_rfind(PyUnicodeObject self, PyObject args)
				3933	{
				3934	PyUnicodeObject *substring;
				3935	int start = 0;
				3936	int end = INT_MAX;
				3937	PyObject *result;
				3938
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3939	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				3940	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3941	return NULL;
				3942	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3943	(PyObject *)substring);
				3944	if (substring == NULL)
				3945	return NULL;
				3946
				3947	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3948
				3949	Py_DECREF(substring);
				3950	return result;
				3951	}
				3952
				3953	static char rindex__doc__[] =
				3954	"S.rindex(sub [,start [,end]]) -> int\n\
				3955	\n\
				3956	Like S.rfind() but raise ValueError when the substring is not found.";
				3957
				3958	static PyObject *
				3959	unicode_rindex(PyUnicodeObject self, PyObject args)
				3960	{
				3961	int result;
				3962	PyUnicodeObject *substring;
				3963	int start = 0;
				3964	int end = INT_MAX;
				3965
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3966	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				3967	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3968	return NULL;
				3969	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3970	(PyObject *)substring);
				3971	if (substring == NULL)
				3972	return NULL;
				3973
				3974	result = findstring(self, substring, start, end, -1);
				3975
				3976	Py_DECREF(substring);
				3977	if (result < 0) {
				3978	PyErr_SetString(PyExc_ValueError, "substring not found");
				3979	return NULL;
				3980	}
				3981	return PyInt_FromLong(result);
				3982	}
				3983
				3984	static char rjust__doc__[] =
				3985	"S.rjust(width) -> unicode\n\
				3986	\n\
				3987	Return S right justified in a Unicode string of length width. Padding is\n\
				3988	done using spaces.";
				3989
				3990	static PyObject *
				3991	unicode_rjust(PyUnicodeObject self, PyObject args)
				3992	{
				3993	int width;
				3994	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3995	return NULL;
				3996
				3997	if (self->length >= width) {
				3998	Py_INCREF(self);
				3999	return (PyObject*) self;
				4000	}
				4001
				4002	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4003	}
				4004
				4005	static char rstrip__doc__[] =
				4006	"S.rstrip() -> unicode\n\
				4007	\n\
				4008	Return a copy of the string S with trailing whitespace removed.";
				4009
				4010	static PyObject *
				4011	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4012	{
				4013	if (!PyArg_NoArgs(args))
				4014	return NULL;
				4015	return strip(self, 0, 1);
				4016	}
				4017
				4018	static PyObject*
				4019	unicode_slice(PyUnicodeObject *self, int start, int end)
				4020	{
				4021	/* standard clamping */
				4022	if (start < 0)
				4023	start = 0;
				4024	if (end < 0)
				4025	end = 0;
				4026	if (end > self->length)
				4027	end = self->length;
				4028	if (start == 0 && end == self->length) {
				4029	/* full slice, return original string */
				4030	Py_INCREF(self);
				4031	return (PyObject*) self;
				4032	}
				4033	if (start > end)
				4034	start = end;
				4035	/* copy slice */
				4036	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4037	end - start);
				4038	}
				4039
				4040	PyObject PyUnicode_Split(PyObject s,
				4041	PyObject *sep,
				4042	int maxsplit)
				4043	{
				4044	PyObject *result;
				4045
				4046	s = PyUnicode_FromObject(s);
				4047	if (s == NULL)
				4048	return NULL;
				4049	if (sep != NULL) {
				4050	sep = PyUnicode_FromObject(sep);
				4051	if (sep == NULL) {
				4052	Py_DECREF(s);
				4053	return NULL;
				4054	}
				4055	}
				4056
				4057	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4058
				4059	Py_DECREF(s);
				4060	Py_XDECREF(sep);
				4061	return result;
				4062	}
				4063
				4064	static char split__doc__[] =
				4065	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4066	\n\
				4067	Return a list of the words in S, using sep as the\n\
				4068	delimiter string. If maxsplit is given, at most maxsplit\n\
				4069	splits are done. If sep is not specified, any whitespace string\n\
				4070	is a separator.";
				4071
				4072	static PyObject*
				4073	unicode_split(PyUnicodeObject self, PyObject args)
				4074	{
				4075	PyObject *substring = Py_None;
				4076	int maxcount = -1;
				4077
				4078	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4079	return NULL;
				4080
				4081	if (substring == Py_None)
				4082	return split(self, NULL, maxcount);
				4083	else if (PyUnicode_Check(substring))
				4084	return split(self, (PyUnicodeObject *)substring, maxcount);
				4085	else
				4086	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4087	}
				4088
				4089	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4090	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4091	\n\
				4092	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4093	Line breaks are not included in the resulting list unless keepends\n\
				4094	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4095
				4096	static PyObject*
				4097	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4098	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4099	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4100
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4101	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4102	return NULL;
				4103
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4104	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4105	}
				4106
				4107	static
				4108	PyObject unicode_str(PyUnicodeObject self)
				4109	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4110	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4111	}
				4112
				4113	static char strip__doc__[] =
				4114	"S.strip() -> unicode\n\
				4115	\n\
				4116	Return a copy of S with leading and trailing whitespace removed.";
				4117
				4118	static PyObject *
				4119	unicode_strip(PyUnicodeObject self, PyObject args)
				4120	{
				4121	if (!PyArg_NoArgs(args))
				4122	return NULL;
				4123	return strip(self, 1, 1);
				4124	}
				4125
				4126	static char swapcase__doc__[] =
				4127	"S.swapcase() -> unicode\n\
				4128	\n\
				4129	Return a copy of S with uppercase characters converted to lowercase\n\
				4130	and vice versa.";
				4131
				4132	static PyObject*
				4133	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4134	{
				4135	if (!PyArg_NoArgs(args))
				4136	return NULL;
				4137	return fixup(self, fixswapcase);
				4138	}
				4139
				4140	static char translate__doc__[] =
				4141	"S.translate(table) -> unicode\n\
				4142	\n\
				4143	Return a copy of the string S, where all characters have been mapped\n\
				4144	through the given translation table, which must be a mapping of\n\
				4145	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4146	are left untouched. Characters mapped to None are deleted.";
				4147
				4148	static PyObject*
				4149	unicode_translate(PyUnicodeObject self, PyObject args)
				4150	{
				4151	PyObject *table;
				4152
				4153	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4154	return NULL;
				4155	return PyUnicode_TranslateCharmap(self->str,
				4156	self->length,
				4157	table,
				4158	"ignore");
				4159	}
				4160
				4161	static char upper__doc__[] =
				4162	"S.upper() -> unicode\n\
				4163	\n\
				4164	Return a copy of S converted to uppercase.";
				4165
				4166	static PyObject*
				4167	unicode_upper(PyUnicodeObject self, PyObject args)
				4168	{
				4169	if (!PyArg_NoArgs(args))
				4170	return NULL;
				4171	return fixup(self, fixupper);
				4172	}
				4173
				4174	#if 0
				4175	static char zfill__doc__[] =
				4176	"S.zfill(width) -> unicode\n\
				4177	\n\
				4178	Pad a numeric string x with zeros on the left, to fill a field\n\
				4179	of the specified width. The string x is never truncated.";
				4180
				4181	static PyObject *
				4182	unicode_zfill(PyUnicodeObject self, PyObject args)
				4183	{
				4184	int fill;
				4185	PyUnicodeObject *u;
				4186
				4187	int width;
				4188	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4189	return NULL;
				4190
				4191	if (self->length >= width) {
				4192	Py_INCREF(self);
				4193	return (PyObject*) self;
				4194	}
				4195
				4196	fill = width - self->length;
				4197
				4198	u = pad(self, fill, 0, '0');
				4199
				4200	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4201	/* move sign to beginning of string */
				4202	u->str[0] = u->str[fill];
				4203	u->str[fill] = '0';
				4204	}
				4205
				4206	return (PyObject*) u;
				4207	}
				4208	#endif
				4209
				4210	#if 0
				4211	static PyObject*
				4212	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4213	{
				4214	if (!PyArg_NoArgs(args))
				4215	return NULL;
				4216	return PyInt_FromLong(unicode_freelist_size);
				4217	}
				4218	#endif
				4219
				4220	static char startswith__doc__[] =
				4221	"S.startswith(prefix[, start[, end]]) -> int\n\
				4222	\n\
				4223	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4224	optional start, test S beginning at that position. With optional end, stop\n\
				4225	comparing S at that position.";
				4226
				4227	static PyObject *
				4228	unicode_startswith(PyUnicodeObject *self,
				4229	PyObject *args)
				4230	{
				4231	PyUnicodeObject *substring;
				4232	int start = 0;
				4233	int end = INT_MAX;
				4234	PyObject *result;
				4235
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4236	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4237	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4238	return NULL;
				4239	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4240	(PyObject *)substring);
				4241	if (substring == NULL)
				4242	return NULL;
				4243
				4244	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4245
				4246	Py_DECREF(substring);
				4247	return result;
				4248	}
				4249
				4250
				4251	static char endswith__doc__[] =
				4252	"S.endswith(suffix[, start[, end]]) -> int\n\
				4253	\n\
				4254	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4255	optional start, test S beginning at that position. With optional end, stop\n\
				4256	comparing S at that position.";
				4257
				4258	static PyObject *
				4259	unicode_endswith(PyUnicodeObject *self,
				4260	PyObject *args)
				4261	{
				4262	PyUnicodeObject *substring;
				4263	int start = 0;
				4264	int end = INT_MAX;
				4265	PyObject *result;
				4266
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4267	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4268	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4269	return NULL;
				4270	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4271	(PyObject *)substring);
				4272	if (substring == NULL)
				4273	return NULL;
				4274
				4275	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4276
				4277	Py_DECREF(substring);
				4278	return result;
				4279	}
				4280
				4281
				4282	static PyMethodDef unicode_methods[] = {
				4283
				4284	/* Order is according to common usage: often used methods should
				4285	appear first, since lookup is done sequentially. */
				4286
				4287	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4288	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4289	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4290	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4291	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4292	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4293	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4294	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4295	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4296	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4297	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4298	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4299	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4300	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4301	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4302	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4303	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4304	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4305	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4306	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4307	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4308	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4309	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4310	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4311	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4312	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4313	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4314	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4315	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4316	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4317	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4318	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4319	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4320	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4321	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4322	#if 0
				4323	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4324	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4325	#endif
				4326
				4327	#if 0
				4328	/* This one is just used for debugging the implementation. */
				4329	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4330	#endif
				4331
				4332	{NULL, NULL}
				4333	};
				4334
				4335	static PyObject *
				4336	unicode_getattr(PyUnicodeObject self, char name)
				4337	{
				4338	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4339	}
				4340
				4341	static PySequenceMethods unicode_as_sequence = {
				4342	(inquiry) unicode_length, /* sq_length */
				4343	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4344	(intargfunc) unicode_repeat, /* sq_repeat */
				4345	(intargfunc) unicode_getitem, /* sq_item */
				4346	(intintargfunc) unicode_slice, /* sq_slice */
				4347	0, /* sq_ass_item */
				4348	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4349	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4350	};
				4351
				4352	static int
				4353	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4354	int index,
				4355	const void **ptr)
				4356	{
				4357	if (index != 0) {
				4358	PyErr_SetString(PyExc_SystemError,
				4359	"accessing non-existent unicode segment");
				4360	return -1;
				4361	}
				4362	ptr = (void ) self->str;
				4363	return PyUnicode_GET_DATA_SIZE(self);
				4364	}
				4365
				4366	static int
				4367	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4368	const void **ptr)
				4369	{
				4370	PyErr_SetString(PyExc_TypeError,
				4371	"cannot use unicode as modifyable buffer");
				4372	return -1;
				4373	}
				4374
				4375	static int
				4376	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4377	int *lenp)
				4378	{
				4379	if (lenp)
				4380	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4381	return 1;
				4382	}
				4383
				4384	static int
				4385	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4386	int index,
				4387	const void **ptr)
				4388	{
				4389	PyObject *str;
				4390
				4391	if (index != 0) {
				4392	PyErr_SetString(PyExc_SystemError,
				4393	"accessing non-existent unicode segment");
				4394	return -1;
				4395	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4396	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4397	if (str == NULL)
				4398	return -1;
				4399	ptr = (void ) PyString_AS_STRING(str);
				4400	return PyString_GET_SIZE(str);
				4401	}
				4402
				4403	/* Helpers for PyUnicode_Format() */
				4404
				4405	static PyObject *
				4406	getnextarg(args, arglen, p_argidx)
				4407	PyObject *args;
				4408	int arglen;
				4409	int *p_argidx;
				4410	{
				4411	int argidx = *p_argidx;
				4412	if (argidx < arglen) {
				4413	(*p_argidx)++;
				4414	if (arglen < 0)
				4415	return args;
				4416	else
				4417	return PyTuple_GetItem(args, argidx);
				4418	}
				4419	PyErr_SetString(PyExc_TypeError,
				4420	"not enough arguments for format string");
				4421	return NULL;
				4422	}
				4423
				4424	#define F_LJUST (1<<0)
				4425	#define F_SIGN (1<<1)
				4426	#define F_BLANK (1<<2)
				4427	#define F_ALT (1<<3)
				4428	#define F_ZERO (1<<4)
				4429
				4430	static
				4431	#ifdef HAVE_STDARG_PROTOTYPES
				4432	int usprintf(register Py_UNICODE buffer, char format, ...)
				4433	#else
				4434	int usprintf(va_alist) va_dcl
				4435	#endif
				4436	{
				4437	register int i;
				4438	int len;
				4439	va_list va;
				4440	char *charbuffer;
				4441	#ifdef HAVE_STDARG_PROTOTYPES
				4442	va_start(va, format);
				4443	#else
				4444	Py_UNICODE *args;
				4445	char *format;
				4446
				4447	va_start(va);
				4448	buffer = va_arg(va, Py_UNICODE *);
				4449	format = va_arg(va, char *);
				4450	#endif
				4451
				4452	/* First, format the string as char array, then expand to Py_UNICODE
				4453	array. */
				4454	charbuffer = (char *)buffer;
				4455	len = vsprintf(charbuffer, format, va);
				4456	for (i = len - 1; i >= 0; i--)
				4457	buffer[i] = (Py_UNICODE) charbuffer[i];
				4458
				4459	va_end(va);
				4460	return len;
				4461	}
				4462
				4463	static int
				4464	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4465	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4466	int flags,
				4467	int prec,
				4468	int type,
				4469	PyObject *v)
				4470	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4471	/* fmt = '%#.' + `prec` + `type`
				4472	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4473	char fmt[20];
				4474	double x;
				4475
				4476	x = PyFloat_AsDouble(v);
				4477	if (x == -1.0 && PyErr_Occurred())
				4478	return -1;
				4479	if (prec < 0)
				4480	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4481	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4482	type = 'g';
				4483	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4484	/* worst case length calc to ensure no buffer overrun:
				4485	fmt = %#.<prec>g
				4486	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4487	for any double rep.)
				4488	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4489	If prec=0 the effective precision is 1 (the leading digit is
				4490	always given), therefore increase by one to 10+prec. */
				4491	if (buflen <= (size_t)10 + (size_t)prec) {
				4492	PyErr_SetString(PyExc_OverflowError,
				4493	"formatted float is too long (precision too long?)");
				4494	return -1;
				4495	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4496	return usprintf(buf, fmt, x);
				4497	}
				4498
				4499	static int
				4500	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4501	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4502	int flags,
				4503	int prec,
				4504	int type,
				4505	PyObject *v)
				4506	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4507	/* fmt = '%#.' + `prec` + 'l' + `type`
				4508	worst case length = 3 + 10 (len of INT_MAX) + 1 + 1 = 15 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4509	char fmt[20];
				4510	long x;
				4511
				4512	x = PyInt_AsLong(v);
				4513	if (x == -1 && PyErr_Occurred())
				4514	return -1;
				4515	if (prec < 0)
				4516	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4517	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4518	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4519	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4520	PyErr_SetString(PyExc_OverflowError,
				4521	"formatted integer is too long (precision too long?)");
				4522	return -1;
				4523	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4524	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4525	return usprintf(buf, fmt, x);
				4526	}
				4527
				4528	static int
				4529	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4530	size_t buflen,
				4531	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4532	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4533	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4534	if (PyUnicode_Check(v)) {
				4535	if (PyUnicode_GET_SIZE(v) != 1)
				4536	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4537	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4538	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4539
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4540	else if (PyString_Check(v)) {
				4541	if (PyString_GET_SIZE(v) != 1)
				4542	goto onError;
				4543	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4544	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4545
				4546	else {
				4547	/* Integer input truncated to a character */
				4548	long x;
				4549	x = PyInt_AsLong(v);
				4550	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4551	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4552	buf[0] = (char) x;
				4553	}
				4554	buf[1] = '\0';
				4555	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4556
				4557	onError:
				4558	PyErr_SetString(PyExc_TypeError,
				4559	"%c requires int or char");
				4560	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4561	}
				4562
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4563	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4564
				4565	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4566	chars are formatted. XXX This is a magic number. Each formatting
				4567	routine does bounds checking to ensure no overflow, but a better
				4568	solution may be to malloc a buffer of appropriate size for each
				4569	format. For now, the current solution is sufficient.
				4570	*/
				4571	#define FORMATBUFLEN (size_t)120
				4572
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4573	PyObject PyUnicode_Format(PyObject format,
				4574	PyObject *args)
				4575	{
				4576	Py_UNICODE fmt, res;
				4577	int fmtcnt, rescnt, reslen, arglen, argidx;
				4578	int args_owned = 0;
				4579	PyUnicodeObject *result = NULL;
				4580	PyObject *dict = NULL;
				4581	PyObject *uformat;
				4582
				4583	if (format == NULL \|\| args == NULL) {
				4584	PyErr_BadInternalCall();
				4585	return NULL;
				4586	}
				4587	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4588	if (uformat == NULL)
				4589	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4590	fmt = PyUnicode_AS_UNICODE(uformat);
				4591	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4592
				4593	reslen = rescnt = fmtcnt + 100;
				4594	result = _PyUnicode_New(reslen);
				4595	if (result == NULL)
				4596	goto onError;
				4597	res = PyUnicode_AS_UNICODE(result);
				4598
				4599	if (PyTuple_Check(args)) {
				4600	arglen = PyTuple_Size(args);
				4601	argidx = 0;
				4602	}
				4603	else {
				4604	arglen = -1;
				4605	argidx = -2;
				4606	}
				4607	if (args->ob_type->tp_as_mapping)
				4608	dict = args;
				4609
				4610	while (--fmtcnt >= 0) {
				4611	if (*fmt != '%') {
				4612	if (--rescnt < 0) {
				4613	rescnt = fmtcnt + 100;
				4614	reslen += rescnt;
				4615	if (_PyUnicode_Resize(result, reslen) < 0)
				4616	return NULL;
				4617	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4618	--rescnt;
				4619	}
				4620	res++ = fmt++;
				4621	}
				4622	else {
				4623	/* Got a format specifier */
				4624	int flags = 0;
				4625	int width = -1;
				4626	int prec = -1;
				4627	int size = 0;
				4628	Py_UNICODE c = '\0';
				4629	Py_UNICODE fill;
				4630	PyObject *v = NULL;
				4631	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4632	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4633	Py_UNICODE sign;
				4634	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4635	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4636
				4637	fmt++;
				4638	if (*fmt == '(') {
				4639	Py_UNICODE *keystart;
				4640	int keylen;
				4641	PyObject *key;
				4642	int pcount = 1;
				4643
				4644	if (dict == NULL) {
				4645	PyErr_SetString(PyExc_TypeError,
				4646	"format requires a mapping");
				4647	goto onError;
				4648	}
				4649	++fmt;
				4650	--fmtcnt;
				4651	keystart = fmt;
				4652	/* Skip over balanced parentheses */
				4653	while (pcount > 0 && --fmtcnt >= 0) {
				4654	if (*fmt == ')')
				4655	--pcount;
				4656	else if (*fmt == '(')
				4657	++pcount;
				4658	fmt++;
				4659	}
				4660	keylen = fmt - keystart - 1;
				4661	if (fmtcnt < 0 \|\| pcount > 0) {
				4662	PyErr_SetString(PyExc_ValueError,
				4663	"incomplete format key");
				4664	goto onError;
				4665	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4666	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4667	then looked up since Python uses strings to hold
				4668	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4669	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4670	key = PyUnicode_EncodeUTF8(keystart,
				4671	keylen,
				4672	NULL);
				4673	if (key == NULL)
				4674	goto onError;
				4675	if (args_owned) {
				4676	Py_DECREF(args);
				4677	args_owned = 0;
				4678	}
				4679	args = PyObject_GetItem(dict, key);
				4680	Py_DECREF(key);
				4681	if (args == NULL) {
				4682	goto onError;
				4683	}
				4684	args_owned = 1;
				4685	arglen = -1;
				4686	argidx = -2;
				4687	}
				4688	while (--fmtcnt >= 0) {
				4689	switch (c = *fmt++) {
				4690	case '-': flags \|= F_LJUST; continue;
				4691	case '+': flags \|= F_SIGN; continue;
				4692	case ' ': flags \|= F_BLANK; continue;
				4693	case '#': flags \|= F_ALT; continue;
				4694	case '0': flags \|= F_ZERO; continue;
				4695	}
				4696	break;
				4697	}
				4698	if (c == '*') {
				4699	v = getnextarg(args, arglen, &argidx);
				4700	if (v == NULL)
				4701	goto onError;
				4702	if (!PyInt_Check(v)) {
				4703	PyErr_SetString(PyExc_TypeError,
				4704	"* wants int");
				4705	goto onError;
				4706	}
				4707	width = PyInt_AsLong(v);
				4708	if (width < 0) {
				4709	flags \|= F_LJUST;
				4710	width = -width;
				4711	}
				4712	if (--fmtcnt >= 0)
				4713	c = *fmt++;
				4714	}
				4715	else if (c >= '0' && c <= '9') {
				4716	width = c - '0';
				4717	while (--fmtcnt >= 0) {
				4718	c = *fmt++;
				4719	if (c < '0' \|\| c > '9')
				4720	break;
				4721	if ((width*10) / 10 != width) {
				4722	PyErr_SetString(PyExc_ValueError,
				4723	"width too big");
				4724	goto onError;
				4725	}
				4726	width = width*10 + (c - '0');
				4727	}
				4728	}
				4729	if (c == '.') {
				4730	prec = 0;
				4731	if (--fmtcnt >= 0)
				4732	c = *fmt++;
				4733	if (c == '*') {
				4734	v = getnextarg(args, arglen, &argidx);
				4735	if (v == NULL)
				4736	goto onError;
				4737	if (!PyInt_Check(v)) {
				4738	PyErr_SetString(PyExc_TypeError,
				4739	"* wants int");
				4740	goto onError;
				4741	}
				4742	prec = PyInt_AsLong(v);
				4743	if (prec < 0)
				4744	prec = 0;
				4745	if (--fmtcnt >= 0)
				4746	c = *fmt++;
				4747	}
				4748	else if (c >= '0' && c <= '9') {
				4749	prec = c - '0';
				4750	while (--fmtcnt >= 0) {
				4751	c = Py_CHARMASK(*fmt++);
				4752	if (c < '0' \|\| c > '9')
				4753	break;
				4754	if ((prec*10) / 10 != prec) {
				4755	PyErr_SetString(PyExc_ValueError,
				4756	"prec too big");
				4757	goto onError;
				4758	}
				4759	prec = prec*10 + (c - '0');
				4760	}
				4761	}
				4762	} /* prec */
				4763	if (fmtcnt >= 0) {
				4764	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4765	size = c;
				4766	if (--fmtcnt >= 0)
				4767	c = *fmt++;
				4768	}
				4769	}
				4770	if (fmtcnt < 0) {
				4771	PyErr_SetString(PyExc_ValueError,
				4772	"incomplete format");
				4773	goto onError;
				4774	}
				4775	if (c != '%') {
				4776	v = getnextarg(args, arglen, &argidx);
				4777	if (v == NULL)
				4778	goto onError;
				4779	}
				4780	sign = 0;
				4781	fill = ' ';
				4782	switch (c) {
				4783
				4784	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4785	pbuf = formatbuf;
				4786	/* presume that buffer length is at least 1 */
				4787	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4788	len = 1;
				4789	break;
				4790
				4791	case 's':
				4792	case 'r':
				4793	if (PyUnicode_Check(v) && c == 's') {
				4794	temp = v;
				4795	Py_INCREF(temp);
				4796	}
				4797	else {
				4798	PyObject *unicode;
				4799	if (c == 's')
				4800	temp = PyObject_Str(v);
				4801	else
				4802	temp = PyObject_Repr(v);
				4803	if (temp == NULL)
				4804	goto onError;
				4805	if (!PyString_Check(temp)) {
				4806	/* XXX Note: this should never happen, since
				4807	PyObject_Repr() and PyObject_Str() assure
				4808	this */
				4809	Py_DECREF(temp);
				4810	PyErr_SetString(PyExc_TypeError,
				4811	"%s argument has non-string str()");
				4812	goto onError;
				4813	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4814	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4815	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4816	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4817	"strict");
				4818	Py_DECREF(temp);
				4819	temp = unicode;
				4820	if (temp == NULL)
				4821	goto onError;
				4822	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4823	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4824	len = PyUnicode_GET_SIZE(temp);
				4825	if (prec >= 0 && len > prec)
				4826	len = prec;
				4827	break;
				4828
				4829	case 'i':
				4830	case 'd':
				4831	case 'u':
				4832	case 'o':
				4833	case 'x':
				4834	case 'X':
				4835	if (c == 'i')
				4836	c = 'd';
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4837	pbuf = formatbuf;
				4838	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4839	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4840	if (len < 0)
				4841	goto onError;
				4842	sign = (c == 'd');
				4843	if (flags & F_ZERO) {
				4844	fill = '0';
				4845	if ((flags&F_ALT) &&
				4846	(c == 'x' \|\| c == 'X') &&
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4847	pbuf[0] == '0' && pbuf[1] == c) {
				4848	res++ = pbuf++;
				4849	res++ = pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4850	rescnt -= 2;
				4851	len -= 2;
				4852	width -= 2;
				4853	if (width < 0)
				4854	width = 0;
				4855	}
				4856	}
				4857	break;
				4858
				4859	case 'e':
				4860	case 'E':
				4861	case 'f':
				4862	case 'g':
				4863	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4864	pbuf = formatbuf;
				4865	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				4866	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4867	if (len < 0)
				4868	goto onError;
				4869	sign = 1;
				4870	if (flags&F_ZERO)
				4871	fill = '0';
				4872	break;
				4873
				4874	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4875	pbuf = formatbuf;
				4876	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4877	if (len < 0)
				4878	goto onError;
				4879	break;
				4880
				4881	default:
				4882	PyErr_Format(PyExc_ValueError,
				4883	"unsupported format character '%c' (0x%x)",
				4884	c, c);
				4885	goto onError;
				4886	}
				4887	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4888	if (pbuf == '-' \|\| pbuf == '+') {
				4889	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4890	len--;
				4891	}
				4892	else if (flags & F_SIGN)
				4893	sign = '+';
				4894	else if (flags & F_BLANK)
				4895	sign = ' ';
				4896	else
				4897	sign = 0;
				4898	}
				4899	if (width < len)
				4900	width = len;
				4901	if (rescnt < width + (sign != 0)) {
				4902	reslen -= rescnt;
				4903	rescnt = width + fmtcnt + 100;
				4904	reslen += rescnt;
				4905	if (_PyUnicode_Resize(result, reslen) < 0)
				4906	return NULL;
				4907	res = PyUnicode_AS_UNICODE(result)
				4908	+ reslen - rescnt;
				4909	}
				4910	if (sign) {
				4911	if (fill != ' ')
				4912	*res++ = sign;
				4913	rescnt--;
				4914	if (width > len)
				4915	width--;
				4916	}
				4917	if (width > len && !(flags & F_LJUST)) {
				4918	do {
				4919	--rescnt;
				4920	*res++ = fill;
				4921	} while (--width > len);
				4922	}
				4923	if (sign && fill == ' ')
				4924	*res++ = sign;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4925	memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4926	res += len;
				4927	rescnt -= len;
				4928	while (--width >= len) {
				4929	--rescnt;
				4930	*res++ = ' ';
				4931	}
				4932	if (dict && (argidx < arglen) && c != '%') {
				4933	PyErr_SetString(PyExc_TypeError,
				4934	"not all arguments converted");
				4935	goto onError;
				4936	}
				4937	Py_XDECREF(temp);
				4938	} /* '%' */
				4939	} /* until end */
				4940	if (argidx < arglen && !dict) {
				4941	PyErr_SetString(PyExc_TypeError,
				4942	"not all arguments converted");
				4943	goto onError;
				4944	}
				4945
				4946	if (args_owned) {
				4947	Py_DECREF(args);
				4948	}
				4949	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4950	if (_PyUnicode_Resize(result, reslen - rescnt))
				4951	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4952	return (PyObject *)result;
				4953
				4954	onError:
				4955	Py_XDECREF(result);
				4956	Py_DECREF(uformat);
				4957	if (args_owned) {
				4958	Py_DECREF(args);
				4959	}
				4960	return NULL;
				4961	}
				4962
				4963	static PyBufferProcs unicode_as_buffer = {
				4964	(getreadbufferproc) unicode_buffer_getreadbuf,
				4965	(getwritebufferproc) unicode_buffer_getwritebuf,
				4966	(getsegcountproc) unicode_buffer_getsegcount,
				4967	(getcharbufferproc) unicode_buffer_getcharbuf,
				4968	};
				4969
				4970	PyTypeObject PyUnicode_Type = {
				4971	PyObject_HEAD_INIT(&PyType_Type)
				4972	0, /* ob_size */
				4973	"unicode", /* tp_name */
				4974	sizeof(PyUnicodeObject), /* tp_size */
				4975	0, /* tp_itemsize */
				4976	/* Slots */
				4977	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4978	0, /* tp_print */
				4979	(getattrfunc)unicode_getattr, /* tp_getattr */
				4980	0, /* tp_setattr */
				4981	(cmpfunc) unicode_compare, /* tp_compare */
				4982	(reprfunc) unicode_repr, /* tp_repr */
				4983	0, /* tp_as_number */
				4984	&unicode_as_sequence, /* tp_as_sequence */
				4985	0, /* tp_as_mapping */
				4986	(hashfunc) unicode_hash, /* tp_hash*/
				4987	0, /* tp_call*/
				4988	(reprfunc) unicode_str, /* tp_str */
				4989	(getattrofunc) NULL, /* tp_getattro */
				4990	(setattrofunc) NULL, /* tp_setattro */
				4991	&unicode_as_buffer, /* tp_as_buffer */
				4992	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4993	};
				4994
				4995	/* Initialize the Unicode implementation */
				4996
				4997	void _PyUnicode_Init()
				4998	{
				4999	/* Doublecheck the configuration... */
				5000	if (sizeof(Py_UNICODE) != 2)
				5001	Py_FatalError("Unicode configuration error: "
				5002	"sizeof(Py_UNICODE) != 2 bytes");
				5003
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5004	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5005	unicode_freelist = NULL;
				5006	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5007	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5008	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5009	}
				5010
				5011	/* Finalize the Unicode implementation */
				5012
				5013	void
				5014	_PyUnicode_Fini()
				5015	{
				5016	PyUnicodeObject *u = unicode_freelist;
				5017
				5018	while (u != NULL) {
				5019	PyUnicodeObject *v = u;
				5020	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5021	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5022	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5023	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5024	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5025	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5026	unicode_freelist = NULL;
				5027	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5028	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5029	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5030	}