Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: bfc59dd97a1c0b28582b5e2c085bf5f459ea3c17 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
				168	if (unicode->utf8str) {
				169	Py_DECREF(unicode->utf8str);
				170	unicode->utf8str = NULL;
				171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
				216	unicode_freelist = (PyUnicodeObject *)unicode_freelist;
				217	unicode_freelist_size--;
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	218	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	219	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	220	/* Keep-Alive optimization: we only upsize the buffer,
				221	never downsize it. */
				222	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	223	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	224	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	225	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	226	}
				227	}
				228	else
				229	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				230	}
				231	else {
				232	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				233	if (unicode == NULL)
				234	return NULL;
				235	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				236	}
				237
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	238	if (!unicode->str) {
				239	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	240	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	241	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	242	unicode->str[length] = 0;
				243	unicode->length = length;
				244	unicode->hash = -1;
				245	unicode->utf8str = NULL;
				246	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	247
				248	onError:
				249	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	250	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	251	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	252	}
				253
				254	static
				255	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				256	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	257	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	258	/* Keep-Alive optimization */
				259	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	260	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	261	unicode->str = NULL;
				262	unicode->length = 0;
				263	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	264	if (unicode->utf8str) {
				265	Py_DECREF(unicode->utf8str);
				266	unicode->utf8str = NULL;
				267	}
				268	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	269	(PyUnicodeObject *)unicode = unicode_freelist;
				270	unicode_freelist = unicode;
				271	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	272	}
				273	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	274	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	275	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	276	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	277	}
				278	}
				279
				280	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				281	int size)
				282	{
				283	PyUnicodeObject *unicode;
				284
				285	unicode = _PyUnicode_New(size);
				286	if (!unicode)
				287	return NULL;
				288
				289	/* Copy the Unicode data into the new object */
				290	if (u != NULL)
				291	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				292
				293	return (PyObject *)unicode;
				294	}
				295
				296	#ifdef HAVE_WCHAR_H
				297
				298	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				299	int size)
				300	{
				301	PyUnicodeObject *unicode;
				302
				303	if (w == NULL) {
				304	PyErr_BadInternalCall();
				305	return NULL;
				306	}
				307
				308	unicode = _PyUnicode_New(size);
				309	if (!unicode)
				310	return NULL;
				311
				312	/* Copy the wchar_t data into the new object */
				313	#ifdef HAVE_USABLE_WCHAR_T
				314	memcpy(unicode->str, w, size * sizeof(wchar_t));
				315	#else
				316	{
				317	register Py_UNICODE *u;
				318	register int i;
				319	u = PyUnicode_AS_UNICODE(unicode);
				320	for (i = size; i >= 0; i--)
				321	u++ = w++;
				322	}
				323	#endif
				324
				325	return (PyObject *)unicode;
				326	}
				327
				328	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				329	register wchar_t *w,
				330	int size)
				331	{
				332	if (unicode == NULL) {
				333	PyErr_BadInternalCall();
				334	return -1;
				335	}
				336	if (size > PyUnicode_GET_SIZE(unicode))
				337	size = PyUnicode_GET_SIZE(unicode);
				338	#ifdef HAVE_USABLE_WCHAR_T
				339	memcpy(w, unicode->str, size * sizeof(wchar_t));
				340	#else
				341	{
				342	register Py_UNICODE *u;
				343	register int i;
				344	u = PyUnicode_AS_UNICODE(unicode);
				345	for (i = size; i >= 0; i--)
				346	w++ = u++;
				347	}
				348	#endif
				349
				350	return size;
				351	}
				352
				353	#endif
				354
				355	PyObject PyUnicode_FromObject(register PyObject obj)
				356	{
				357	const char *s;
				358	int len;
				359
				360	if (obj == NULL) {
				361	PyErr_BadInternalCall();
				362	return NULL;
				363	}
				364	else if (PyUnicode_Check(obj)) {
				365	Py_INCREF(obj);
				366	return obj;
				367	}
				368	else if (PyString_Check(obj)) {
				369	s = PyString_AS_STRING(obj);
				370	len = PyString_GET_SIZE(obj);
				371	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	372	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				373	/* Overwrite the error message with something more useful in
				374	case of a TypeError. */
				375	if (PyErr_ExceptionMatches(PyExc_TypeError))
				376	PyErr_SetString(PyExc_TypeError,
				377	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	378	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	379	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	380	if (len == 0) {
				381	Py_INCREF(unicode_empty);
				382	return (PyObject *)unicode_empty;
				383	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	384	return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	385	}
				386
				387	PyObject PyUnicode_Decode(const char s,
				388	int size,
				389	const char *encoding,
				390	const char *errors)
				391	{
				392	PyObject buffer = NULL, unicode;
				393
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	394	if (encoding == NULL)
				395	encoding = PyUnicode_GetDefaultEncoding();
				396
				397	/* Shortcuts for common default encodings */
				398	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	399	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	400	else if (strcmp(encoding, "latin-1") == 0)
				401	return PyUnicode_DecodeLatin1(s, size, errors);
				402	else if (strcmp(encoding, "ascii") == 0)
				403	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	404
				405	/* Decode via the codec registry */
				406	buffer = PyBuffer_FromMemory((void *)s, size);
				407	if (buffer == NULL)
				408	goto onError;
				409	unicode = PyCodec_Decode(buffer, encoding, errors);
				410	if (unicode == NULL)
				411	goto onError;
				412	if (!PyUnicode_Check(unicode)) {
				413	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	414	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	415	unicode->ob_type->tp_name);
				416	Py_DECREF(unicode);
				417	goto onError;
				418	}
				419	Py_DECREF(buffer);
				420	return unicode;
				421
				422	onError:
				423	Py_XDECREF(buffer);
				424	return NULL;
				425	}
				426
				427	PyObject PyUnicode_Encode(const Py_UNICODE s,
				428	int size,
				429	const char *encoding,
				430	const char *errors)
				431	{
				432	PyObject v, unicode;
				433
				434	unicode = PyUnicode_FromUnicode(s, size);
				435	if (unicode == NULL)
				436	return NULL;
				437	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				438	Py_DECREF(unicode);
				439	return v;
				440	}
				441
				442	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				443	const char *encoding,
				444	const char *errors)
				445	{
				446	PyObject *v;
				447
				448	if (!PyUnicode_Check(unicode)) {
				449	PyErr_BadArgument();
				450	goto onError;
				451	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	452
				453	if (encoding == NULL)
				454	encoding = PyUnicode_GetDefaultEncoding();
				455
				456	/* Shortcuts for common default encodings */
				457	if (errors == NULL) {
				458	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	459	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	460	else if (strcmp(encoding, "latin-1") == 0)
				461	return PyUnicode_AsLatin1String(unicode);
				462	else if (strcmp(encoding, "ascii") == 0)
				463	return PyUnicode_AsASCIIString(unicode);
				464	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	465
				466	/* Encode via the codec registry */
				467	v = PyCodec_Encode(unicode, encoding, errors);
				468	if (v == NULL)
				469	goto onError;
				470	/* XXX Should we really enforce this ? */
				471	if (!PyString_Check(v)) {
				472	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	473	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	474	v->ob_type->tp_name);
				475	Py_DECREF(v);
				476	goto onError;
				477	}
				478	return v;
				479
				480	onError:
				481	return NULL;
				482	}
				483
				484	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				485	{
				486	if (!PyUnicode_Check(unicode)) {
				487	PyErr_BadArgument();
				488	goto onError;
				489	}
				490	return PyUnicode_AS_UNICODE(unicode);
				491
				492	onError:
				493	return NULL;
				494	}
				495
				496	int PyUnicode_GetSize(PyObject *unicode)
				497	{
				498	if (!PyUnicode_Check(unicode)) {
				499	PyErr_BadArgument();
				500	goto onError;
				501	}
				502	return PyUnicode_GET_SIZE(unicode);
				503
				504	onError:
				505	return -1;
				506	}
				507
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	508	const char *PyUnicode_GetDefaultEncoding()
				509	{
				510	return unicode_default_encoding;
				511	}
				512
				513	int PyUnicode_SetDefaultEncoding(const char *encoding)
				514	{
				515	PyObject *v;
				516
				517	/* Make sure the encoding is valid. As side effect, this also
				518	loads the encoding into the codec registry cache. */
				519	v = _PyCodec_Lookup(encoding);
				520	if (v == NULL)
				521	goto onError;
				522	Py_DECREF(v);
				523	strncpy(unicode_default_encoding,
				524	encoding,
				525	sizeof(unicode_default_encoding));
				526	return 0;
				527
				528	onError:
				529	return -1;
				530	}
				531
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	532	/* --- UTF-8 Codec -------------------------------------------------------- */
				533
				534	static
				535	char utf8_code_length[256] = {
				536	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				537	illegal prefix. see RFC 2279 for details */
				538	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				539	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				540	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				541	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				542	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				543	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				544	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				545	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				546	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				547	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				548	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				549	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				550	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				551	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				552	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				553	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				554	};
				555
				556	static
				557	int utf8_decoding_error(const char **source,
				558	Py_UNICODE **dest,
				559	const char *errors,
				560	const char *details)
				561	{
				562	if ((errors == NULL) \|\|
				563	(strcmp(errors,"strict") == 0)) {
				564	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	565	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	566	details);
				567	return -1;
				568	}
				569	else if (strcmp(errors,"ignore") == 0) {
				570	(*source)++;
				571	return 0;
				572	}
				573	else if (strcmp(errors,"replace") == 0) {
				574	(*source)++;
				575	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				576	(*dest)++;
				577	return 0;
				578	}
				579	else {
				580	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	581	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	582	errors);
				583	return -1;
				584	}
				585	}
				586
				587	#define UTF8_ERROR(details) do { \
				588	if (utf8_decoding_error(&s, &p, errors, details)) \
				589	goto onError; \
				590	continue; \
				591	} while (0)
				592
				593	PyObject PyUnicode_DecodeUTF8(const char s,
				594	int size,
				595	const char *errors)
				596	{
				597	int n;
				598	const char *e;
				599	PyUnicodeObject *unicode;
				600	Py_UNICODE *p;
				601
				602	/* Note: size will always be longer than the resulting Unicode
				603	character count */
				604	unicode = _PyUnicode_New(size);
				605	if (!unicode)
				606	return NULL;
				607	if (size == 0)
				608	return (PyObject *)unicode;
				609
				610	/* Unpack UTF-8 encoded data */
				611	p = unicode->str;
				612	e = s + size;
				613
				614	while (s < e) {
				615	register Py_UNICODE ch = (unsigned char)*s;
				616
				617	if (ch < 0x80) {
				618	*p++ = ch;
				619	s++;
				620	continue;
				621	}
				622
				623	n = utf8_code_length[ch];
				624
				625	if (s + n > e)
				626	UTF8_ERROR("unexpected end of data");
				627
				628	switch (n) {
				629
				630	case 0:
				631	UTF8_ERROR("unexpected code byte");
				632	break;
				633
				634	case 1:
				635	UTF8_ERROR("internal error");
				636	break;
				637
				638	case 2:
				639	if ((s[1] & 0xc0) != 0x80)
				640	UTF8_ERROR("invalid data");
				641	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				642	if (ch < 0x80)
				643	UTF8_ERROR("illegal encoding");
				644	else
				645	*p++ = ch;
				646	break;
				647
				648	case 3:
				649	if ((s[1] & 0xc0) != 0x80 \|\|
				650	(s[2] & 0xc0) != 0x80)
				651	UTF8_ERROR("invalid data");
				652	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				653	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				654	UTF8_ERROR("illegal encoding");
				655	else
				656	*p++ = ch;
				657	break;
				658
				659	default:
				660	/* Other sizes are only needed for UCS-4 */
				661	UTF8_ERROR("unsupported Unicode code range");
				662	}
				663	s += n;
				664	}
				665
				666	/* Adjust length */
				667	if (_PyUnicode_Resize(unicode, p - unicode->str))
				668	goto onError;
				669
				670	return (PyObject *)unicode;
				671
				672	onError:
				673	Py_DECREF(unicode);
				674	return NULL;
				675	}
				676
				677	#undef UTF8_ERROR
				678
				679	static
				680	int utf8_encoding_error(const Py_UNICODE **source,
				681	char **dest,
				682	const char *errors,
				683	const char *details)
				684	{
				685	if ((errors == NULL) \|\|
				686	(strcmp(errors,"strict") == 0)) {
				687	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	688	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	689	details);
				690	return -1;
				691	}
				692	else if (strcmp(errors,"ignore") == 0) {
				693	return 0;
				694	}
				695	else if (strcmp(errors,"replace") == 0) {
				696	**dest = '?';
				697	(*dest)++;
				698	return 0;
				699	}
				700	else {
				701	PyErr_Format(PyExc_ValueError,
				702	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	703	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	704	errors);
				705	return -1;
				706	}
				707	}
				708
				709	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				710	int size,
				711	const char *errors)
				712	{
				713	PyObject *v;
				714	char *p;
				715	char *q;
				716
				717	v = PyString_FromStringAndSize(NULL, 3 * size);
				718	if (v == NULL)
				719	return NULL;
				720	if (size == 0)
				721	goto done;
				722
				723	p = q = PyString_AS_STRING(v);
				724	while (size-- > 0) {
				725	Py_UNICODE ch = *s++;
				726	if (ch < 0x80)
				727	*p++ = (char) ch;
				728	else if (ch < 0x0800) {
				729	*p++ = 0xc0 \| (ch >> 6);
				730	*p++ = 0x80 \| (ch & 0x3f);
				731	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				732	/* These byte ranges are reserved for UTF-16 surrogate
				733	bytes which the Python implementation currently does
				734	not support. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	735	if (utf8_encoding_error(&s, &p, errors,
				736	"unsupported code range"))
				737	goto onError;
				738	} else {
				739	*p++ = 0xe0 \| (ch >> 12);
				740	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				741	*p++ = 0x80 \| (ch & 0x3f);
				742	}
				743	}
				744	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	745	if (_PyString_Resize(&v, p - q))
				746	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	747
				748	done:
				749	return v;
				750
				751	onError:
				752	Py_DECREF(v);
				753	return NULL;
				754	}
				755
				756	/* Return a Python string holding the UTF-8 encoded value of the
				757	Unicode object.
				758
				759	The resulting string is cached in the Unicode object for subsequent
				760	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	761	the character buffer interface and will live (at least) as long as
				762	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	763
				764	The refcount of the string is not incremented.
				765
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	766	* Exported for internal use by the interpreter only !!! *
				767
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	768	*/
				769
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	770	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	771	const char *errors)
				772	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	773	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	774
				775	if (v)
				776	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	777	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				778	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	779	errors);
				780	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	781	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	782	return v;
				783	}
				784
				785	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				786	{
				787	PyObject *str;
				788
				789	if (!PyUnicode_Check(unicode)) {
				790	PyErr_BadArgument();
				791	return NULL;
				792	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	793	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	794	if (str == NULL)
				795	return NULL;
				796	Py_INCREF(str);
				797	return str;
				798	}
				799
				800	/* --- UTF-16 Codec ------------------------------------------------------- */
				801
				802	static
				803	int utf16_decoding_error(const Py_UNICODE **source,
				804	Py_UNICODE **dest,
				805	const char *errors,
				806	const char *details)
				807	{
				808	if ((errors == NULL) \|\|
				809	(strcmp(errors,"strict") == 0)) {
				810	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	811	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	812	details);
				813	return -1;
				814	}
				815	else if (strcmp(errors,"ignore") == 0) {
				816	return 0;
				817	}
				818	else if (strcmp(errors,"replace") == 0) {
				819	if (dest) {
				820	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				821	(*dest)++;
				822	}
				823	return 0;
				824	}
				825	else {
				826	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	827	"UTF-16 decoding error; "
				828	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	829	errors);
				830	return -1;
				831	}
				832	}
				833
				834	#define UTF16_ERROR(details) do { \
				835	if (utf16_decoding_error(&q, &p, errors, details)) \
				836	goto onError; \
				837	continue; \
				838	} while(0)
				839
				840	PyObject PyUnicode_DecodeUTF16(const char s,
				841	int size,
				842	const char *errors,
				843	int *byteorder)
				844	{
				845	PyUnicodeObject *unicode;
				846	Py_UNICODE *p;
				847	const Py_UNICODE q, e;
				848	int bo = 0;
				849
				850	/* size should be an even number */
				851	if (size % sizeof(Py_UNICODE) != 0) {
				852	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				853	return NULL;
				854	/* The remaining input chars are ignored if we fall through
				855	here... */
				856	}
				857
				858	/* Note: size will always be longer than the resulting Unicode
				859	character count */
				860	unicode = _PyUnicode_New(size);
				861	if (!unicode)
				862	return NULL;
				863	if (size == 0)
				864	return (PyObject *)unicode;
				865
				866	/* Unpack UTF-16 encoded data */
				867	p = unicode->str;
				868	q = (Py_UNICODE *)s;
				869	e = q + (size / sizeof(Py_UNICODE));
				870
				871	if (byteorder)
				872	bo = *byteorder;
				873
				874	while (q < e) {
				875	register Py_UNICODE ch = *q++;
				876
				877	/* Check for BOM marks (U+FEFF) in the input and adjust
				878	current byte order setting accordingly. Swap input
				879	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				880	!) */
				881	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				882	if (ch == 0xFEFF) {
				883	bo = -1;
				884	continue;
				885	} else if (ch == 0xFFFE) {
				886	bo = 1;
				887	continue;
				888	}
				889	if (bo == 1)
				890	ch = (ch >> 8) \| (ch << 8);
				891	#else
				892	if (ch == 0xFEFF) {
				893	bo = 1;
				894	continue;
				895	} else if (ch == 0xFFFE) {
				896	bo = -1;
				897	continue;
				898	}
				899	if (bo == -1)
				900	ch = (ch >> 8) \| (ch << 8);
				901	#endif
				902	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				903	*p++ = ch;
				904	continue;
				905	}
				906
				907	/* UTF-16 code pair: */
				908	if (q >= e)
				909	UTF16_ERROR("unexpected end of data");
				910	if (0xDC00 <= q && q <= 0xDFFF) {
				911	q++;
				912	if (0xD800 <= q && q <= 0xDBFF)
				913	/* This is valid data (a UTF-16 surrogate pair), but
				914	we are not able to store this information since our
				915	Py_UNICODE type only has 16 bits... this might
				916	change someday, even though it's unlikely. */
				917	UTF16_ERROR("code pairs are not supported");
				918	else
				919	continue;
				920	}
				921	UTF16_ERROR("illegal encoding");
				922	}
				923
				924	if (byteorder)
				925	*byteorder = bo;
				926
				927	/* Adjust length */
				928	if (_PyUnicode_Resize(unicode, p - unicode->str))
				929	goto onError;
				930
				931	return (PyObject *)unicode;
				932
				933	onError:
				934	Py_DECREF(unicode);
				935	return NULL;
				936	}
				937
				938	#undef UTF16_ERROR
				939
				940	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				941	int size,
				942	const char *errors,
				943	int byteorder)
				944	{
				945	PyObject *v;
				946	Py_UNICODE *p;
				947	char *q;
				948
				949	/* We don't create UTF-16 pairs... */
				950	v = PyString_FromStringAndSize(NULL,
				951	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				952	if (v == NULL)
				953	return NULL;
				954	if (size == 0)
				955	goto done;
				956
				957	q = PyString_AS_STRING(v);
				958	p = (Py_UNICODE *)q;
				959
				960	if (byteorder == 0)
				961	*p++ = 0xFEFF;
				962	if (byteorder == 0 \|\|
				963	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				964	byteorder == -1
				965	#else
				966	byteorder == 1
				967	#endif
				968	)
				969	memcpy(p, s, size * sizeof(Py_UNICODE));
				970	else
				971	while (size-- > 0) {
				972	Py_UNICODE ch = *s++;
				973	*p++ = (ch >> 8) \| (ch << 8);
				974	}
				975	done:
				976	return v;
				977	}
				978
				979	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				980	{
				981	if (!PyUnicode_Check(unicode)) {
				982	PyErr_BadArgument();
				983	return NULL;
				984	}
				985	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				986	PyUnicode_GET_SIZE(unicode),
				987	NULL,
				988	0);
				989	}
				990
				991	/* --- Unicode Escape Codec ----------------------------------------------- */
				992
				993	static
				994	int unicodeescape_decoding_error(const char **source,
				995	unsigned int *x,
				996	const char *errors,
				997	const char *details)
				998	{
				999	if ((errors == NULL) \|\|
				1000	(strcmp(errors,"strict") == 0)) {
				1001	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1002	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1003	details);
				1004	return -1;
				1005	}
				1006	else if (strcmp(errors,"ignore") == 0) {
				1007	return 0;
				1008	}
				1009	else if (strcmp(errors,"replace") == 0) {
				1010	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				1011	return 0;
				1012	}
				1013	else {
				1014	PyErr_Format(PyExc_ValueError,
				1015	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1016	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1017	errors);
				1018	return -1;
				1019	}
				1020	}
				1021
				1022	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1023	int size,
				1024	const char *errors)
				1025	{
				1026	PyUnicodeObject *v;
				1027	Py_UNICODE p = NULL, buf = NULL;
				1028	const char *end;
				1029
				1030	/* Escaped strings will always be longer than the resulting
				1031	Unicode string, so we start with size here and then reduce the
				1032	length after conversion to the true value. */
				1033	v = _PyUnicode_New(size);
				1034	if (v == NULL)
				1035	goto onError;
				1036	if (size == 0)
				1037	return (PyObject *)v;
				1038	p = buf = PyUnicode_AS_UNICODE(v);
				1039	end = s + size;
				1040	while (s < end) {
				1041	unsigned char c;
				1042	unsigned int x;
				1043	int i;
				1044
				1045	/* Non-escape characters are interpreted as Unicode ordinals */
				1046	if (*s != '\\') {
				1047	p++ = (unsigned char)s++;
				1048	continue;
				1049	}
				1050
				1051	/* \ - Escapes */
				1052	s++;
				1053	switch (*s++) {
				1054
				1055	/* \x escapes */
				1056	case '\n': break;
				1057	case '\\': *p++ = '\\'; break;
				1058	case '\'': *p++ = '\''; break;
				1059	case '\"': *p++ = '\"'; break;
				1060	case 'b': *p++ = '\b'; break;
				1061	case 'f': p++ = '\014'; break; / FF */
				1062	case 't': *p++ = '\t'; break;
				1063	case 'n': *p++ = '\n'; break;
				1064	case 'r': *p++ = '\r'; break;
				1065	case 'v': p++ = '\013'; break; / VT */
				1066	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1067
				1068	/* \OOO (octal) escapes */
				1069	case '0': case '1': case '2': case '3':
				1070	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1071	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1072	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1073	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1074	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1075	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1076	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1077	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1078	break;
				1079
				1080	/* \xXXXX escape with 0-4 hex digits */
				1081	case 'x':
				1082	x = 0;
				1083	c = (unsigned char)*s;
				1084	if (isxdigit(c)) {
				1085	do {
				1086	x = (x<<4) & ~0xF;
				1087	if ('0' <= c && c <= '9')
				1088	x += c - '0';
				1089	else if ('a' <= c && c <= 'f')
				1090	x += 10 + c - 'a';
				1091	else
				1092	x += 10 + c - 'A';
				1093	c = (unsigned char)*++s;
				1094	} while (isxdigit(c));
				1095	*p++ = x;
				1096	} else {
				1097	*p++ = '\\';
				1098	*p++ = (unsigned char)s[-1];
				1099	}
				1100	break;
				1101
				1102	/* \uXXXX with 4 hex digits */
				1103	case 'u':
				1104	for (x = 0, i = 0; i < 4; i++) {
				1105	c = (unsigned char)s[i];
				1106	if (!isxdigit(c)) {
				1107	if (unicodeescape_decoding_error(&s, &x, errors,
				1108	"truncated \\uXXXX"))
				1109	goto onError;
				1110	i++;
				1111	break;
				1112	}
				1113	x = (x<<4) & ~0xF;
				1114	if (c >= '0' && c <= '9')
				1115	x += c - '0';
				1116	else if (c >= 'a' && c <= 'f')
				1117	x += 10 + c - 'a';
				1118	else
				1119	x += 10 + c - 'A';
				1120	}
				1121	s += i;
				1122	*p++ = x;
				1123	break;
				1124
				1125	default:
				1126	*p++ = '\\';
				1127	*p++ = (unsigned char)s[-1];
				1128	break;
				1129	}
				1130	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1131	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1132	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1133	return (PyObject *)v;
				1134
				1135	onError:
				1136	Py_XDECREF(v);
				1137	return NULL;
				1138	}
				1139
				1140	/* Return a Unicode-Escape string version of the Unicode object.
				1141
				1142	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1143	appropriate.
				1144
				1145	*/
				1146
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1147	static const Py_UNICODE findchar(const Py_UNICODE s,
				1148	int size,
				1149	Py_UNICODE ch);
				1150
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1151	static
				1152	PyObject unicodeescape_string(const Py_UNICODE s,
				1153	int size,
				1154	int quotes)
				1155	{
				1156	PyObject *repr;
				1157	char *p;
				1158	char *q;
				1159
				1160	static const char *hexdigit = "0123456789ABCDEF";
				1161
				1162	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1163	if (repr == NULL)
				1164	return NULL;
				1165
				1166	p = q = PyString_AS_STRING(repr);
				1167
				1168	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1169	*p++ = 'u';
				1170	*p++ = (findchar(s, size, '\'') &&
				1171	!findchar(s, size, '"')) ? '"' : '\'';
				1172	}
				1173	while (size-- > 0) {
				1174	Py_UNICODE ch = *s++;
				1175	/* Escape quotes */
				1176	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1177	*p++ = '\\';
				1178	*p++ = (char) ch;
				1179	}
				1180	/* Map 16-bit characters to '\uxxxx' */
				1181	else if (ch >= 256) {
				1182	*p++ = '\\';
				1183	*p++ = 'u';
				1184	*p++ = hexdigit[(ch >> 12) & 0xf];
				1185	*p++ = hexdigit[(ch >> 8) & 0xf];
				1186	*p++ = hexdigit[(ch >> 4) & 0xf];
				1187	*p++ = hexdigit[ch & 15];
				1188	}
				1189	/* Map non-printable US ASCII to '\ooo' */
				1190	else if (ch < ' ' \|\| ch >= 128) {
				1191	*p++ = '\\';
				1192	*p++ = hexdigit[(ch >> 6) & 7];
				1193	*p++ = hexdigit[(ch >> 3) & 7];
				1194	*p++ = hexdigit[ch & 7];
				1195	}
				1196	/* Copy everything else as-is */
				1197	else
				1198	*p++ = (char) ch;
				1199	}
				1200	if (quotes)
				1201	*p++ = q[1];
				1202
				1203	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1204	if (_PyString_Resize(&repr, p - q))
				1205	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1206
				1207	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1208
				1209	onError:
				1210	Py_DECREF(repr);
				1211	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1212	}
				1213
				1214	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1215	int size)
				1216	{
				1217	return unicodeescape_string(s, size, 0);
				1218	}
				1219
				1220	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1221	{
				1222	if (!PyUnicode_Check(unicode)) {
				1223	PyErr_BadArgument();
				1224	return NULL;
				1225	}
				1226	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1227	PyUnicode_GET_SIZE(unicode));
				1228	}
				1229
				1230	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1231
				1232	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1233	int size,
				1234	const char *errors)
				1235	{
				1236	PyUnicodeObject *v;
				1237	Py_UNICODE p, buf;
				1238	const char *end;
				1239	const char *bs;
				1240
				1241	/* Escaped strings will always be longer than the resulting
				1242	Unicode string, so we start with size here and then reduce the
				1243	length after conversion to the true value. */
				1244	v = _PyUnicode_New(size);
				1245	if (v == NULL)
				1246	goto onError;
				1247	if (size == 0)
				1248	return (PyObject *)v;
				1249	p = buf = PyUnicode_AS_UNICODE(v);
				1250	end = s + size;
				1251	while (s < end) {
				1252	unsigned char c;
				1253	unsigned int x;
				1254	int i;
				1255
				1256	/* Non-escape characters are interpreted as Unicode ordinals */
				1257	if (*s != '\\') {
				1258	p++ = (unsigned char)s++;
				1259	continue;
				1260	}
				1261
				1262	/* \u-escapes are only interpreted iff the number of leading
				1263	backslashes if odd */
				1264	bs = s;
				1265	for (;s < end;) {
				1266	if (*s != '\\')
				1267	break;
				1268	p++ = (unsigned char)s++;
				1269	}
				1270	if (((s - bs) & 1) == 0 \|\|
				1271	s >= end \|\|
				1272	*s != 'u') {
				1273	continue;
				1274	}
				1275	p--;
				1276	s++;
				1277
				1278	/* \uXXXX with 4 hex digits */
				1279	for (x = 0, i = 0; i < 4; i++) {
				1280	c = (unsigned char)s[i];
				1281	if (!isxdigit(c)) {
				1282	if (unicodeescape_decoding_error(&s, &x, errors,
				1283	"truncated \\uXXXX"))
				1284	goto onError;
				1285	i++;
				1286	break;
				1287	}
				1288	x = (x<<4) & ~0xF;
				1289	if (c >= '0' && c <= '9')
				1290	x += c - '0';
				1291	else if (c >= 'a' && c <= 'f')
				1292	x += 10 + c - 'a';
				1293	else
				1294	x += 10 + c - 'A';
				1295	}
				1296	s += i;
				1297	*p++ = x;
				1298	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1299	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1300	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1301	return (PyObject *)v;
				1302
				1303	onError:
				1304	Py_XDECREF(v);
				1305	return NULL;
				1306	}
				1307
				1308	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1309	int size)
				1310	{
				1311	PyObject *repr;
				1312	char *p;
				1313	char *q;
				1314
				1315	static const char *hexdigit = "0123456789ABCDEF";
				1316
				1317	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1318	if (repr == NULL)
				1319	return NULL;
				1320
				1321	p = q = PyString_AS_STRING(repr);
				1322	while (size-- > 0) {
				1323	Py_UNICODE ch = *s++;
				1324	/* Map 16-bit characters to '\uxxxx' */
				1325	if (ch >= 256) {
				1326	*p++ = '\\';
				1327	*p++ = 'u';
				1328	*p++ = hexdigit[(ch >> 12) & 0xf];
				1329	*p++ = hexdigit[(ch >> 8) & 0xf];
				1330	*p++ = hexdigit[(ch >> 4) & 0xf];
				1331	*p++ = hexdigit[ch & 15];
				1332	}
				1333	/* Copy everything else as-is */
				1334	else
				1335	*p++ = (char) ch;
				1336	}
				1337	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1338	if (_PyString_Resize(&repr, p - q))
				1339	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1340
				1341	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1342
				1343	onError:
				1344	Py_DECREF(repr);
				1345	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1346	}
				1347
				1348	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1349	{
				1350	if (!PyUnicode_Check(unicode)) {
				1351	PyErr_BadArgument();
				1352	return NULL;
				1353	}
				1354	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1355	PyUnicode_GET_SIZE(unicode));
				1356	}
				1357
				1358	/* --- Latin-1 Codec ------------------------------------------------------ */
				1359
				1360	PyObject PyUnicode_DecodeLatin1(const char s,
				1361	int size,
				1362	const char *errors)
				1363	{
				1364	PyUnicodeObject *v;
				1365	Py_UNICODE *p;
				1366
				1367	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1368	v = _PyUnicode_New(size);
				1369	if (v == NULL)
				1370	goto onError;
				1371	if (size == 0)
				1372	return (PyObject *)v;
				1373	p = PyUnicode_AS_UNICODE(v);
				1374	while (size-- > 0)
				1375	p++ = (unsigned char)s++;
				1376	return (PyObject *)v;
				1377
				1378	onError:
				1379	Py_XDECREF(v);
				1380	return NULL;
				1381	}
				1382
				1383	static
				1384	int latin1_encoding_error(const Py_UNICODE **source,
				1385	char **dest,
				1386	const char *errors,
				1387	const char *details)
				1388	{
				1389	if ((errors == NULL) \|\|
				1390	(strcmp(errors,"strict") == 0)) {
				1391	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1392	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1393	details);
				1394	return -1;
				1395	}
				1396	else if (strcmp(errors,"ignore") == 0) {
				1397	return 0;
				1398	}
				1399	else if (strcmp(errors,"replace") == 0) {
				1400	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1401	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1402	return 0;
				1403	}
				1404	else {
				1405	PyErr_Format(PyExc_ValueError,
				1406	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1407	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1408	errors);
				1409	return -1;
				1410	}
				1411	}
				1412
				1413	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1414	int size,
				1415	const char *errors)
				1416	{
				1417	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1418	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1419	repr = PyString_FromStringAndSize(NULL, size);
				1420	if (repr == NULL)
				1421	return NULL;
				1422
				1423	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1424	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1425	while (size-- > 0) {
				1426	Py_UNICODE ch = *p++;
				1427	if (ch >= 256) {
				1428	if (latin1_encoding_error(&p, &s, errors,
				1429	"ordinal not in range(256)"))
				1430	goto onError;
				1431	}
				1432	else
				1433	*s++ = (char)ch;
				1434	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1435	/* Resize if error handling skipped some characters */
				1436	if (s - start < PyString_GET_SIZE(repr))
				1437	if (_PyString_Resize(&repr, s - start))
				1438	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1439	return repr;
				1440
				1441	onError:
				1442	Py_DECREF(repr);
				1443	return NULL;
				1444	}
				1445
				1446	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1447	{
				1448	if (!PyUnicode_Check(unicode)) {
				1449	PyErr_BadArgument();
				1450	return NULL;
				1451	}
				1452	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1453	PyUnicode_GET_SIZE(unicode),
				1454	NULL);
				1455	}
				1456
				1457	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1458
				1459	static
				1460	int ascii_decoding_error(const char **source,
				1461	Py_UNICODE **dest,
				1462	const char *errors,
				1463	const char *details)
				1464	{
				1465	if ((errors == NULL) \|\|
				1466	(strcmp(errors,"strict") == 0)) {
				1467	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1468	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1469	details);
				1470	return -1;
				1471	}
				1472	else if (strcmp(errors,"ignore") == 0) {
				1473	return 0;
				1474	}
				1475	else if (strcmp(errors,"replace") == 0) {
				1476	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1477	(*dest)++;
				1478	return 0;
				1479	}
				1480	else {
				1481	PyErr_Format(PyExc_ValueError,
				1482	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1483	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1484	errors);
				1485	return -1;
				1486	}
				1487	}
				1488
				1489	PyObject PyUnicode_DecodeASCII(const char s,
				1490	int size,
				1491	const char *errors)
				1492	{
				1493	PyUnicodeObject *v;
				1494	Py_UNICODE *p;
				1495
				1496	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1497	v = _PyUnicode_New(size);
				1498	if (v == NULL)
				1499	goto onError;
				1500	if (size == 0)
				1501	return (PyObject *)v;
				1502	p = PyUnicode_AS_UNICODE(v);
				1503	while (size-- > 0) {
				1504	register unsigned char c;
				1505
				1506	c = (unsigned char)*s++;
				1507	if (c < 128)
				1508	*p++ = c;
				1509	else if (ascii_decoding_error(&s, &p, errors,
				1510	"ordinal not in range(128)"))
				1511	goto onError;
				1512	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1513	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1514	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1515	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1516	return (PyObject *)v;
				1517
				1518	onError:
				1519	Py_XDECREF(v);
				1520	return NULL;
				1521	}
				1522
				1523	static
				1524	int ascii_encoding_error(const Py_UNICODE **source,
				1525	char **dest,
				1526	const char *errors,
				1527	const char *details)
				1528	{
				1529	if ((errors == NULL) \|\|
				1530	(strcmp(errors,"strict") == 0)) {
				1531	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1532	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1533	details);
				1534	return -1;
				1535	}
				1536	else if (strcmp(errors,"ignore") == 0) {
				1537	return 0;
				1538	}
				1539	else if (strcmp(errors,"replace") == 0) {
				1540	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1541	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1542	return 0;
				1543	}
				1544	else {
				1545	PyErr_Format(PyExc_ValueError,
				1546	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1547	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1548	errors);
				1549	return -1;
				1550	}
				1551	}
				1552
				1553	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1554	int size,
				1555	const char *errors)
				1556	{
				1557	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1558	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1559	repr = PyString_FromStringAndSize(NULL, size);
				1560	if (repr == NULL)
				1561	return NULL;
				1562
				1563	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1564	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1565	while (size-- > 0) {
				1566	Py_UNICODE ch = *p++;
				1567	if (ch >= 128) {
				1568	if (ascii_encoding_error(&p, &s, errors,
				1569	"ordinal not in range(128)"))
				1570	goto onError;
				1571	}
				1572	else
				1573	*s++ = (char)ch;
				1574	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1575	/* Resize if error handling skipped some characters */
				1576	if (s - start < PyString_GET_SIZE(repr))
				1577	if (_PyString_Resize(&repr, s - start))
				1578	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1579	return repr;
				1580
				1581	onError:
				1582	Py_DECREF(repr);
				1583	return NULL;
				1584	}
				1585
				1586	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1587	{
				1588	if (!PyUnicode_Check(unicode)) {
				1589	PyErr_BadArgument();
				1590	return NULL;
				1591	}
				1592	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1593	PyUnicode_GET_SIZE(unicode),
				1594	NULL);
				1595	}
				1596
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1597	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1598
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1599	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1600
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1601	PyObject PyUnicode_DecodeMBCS(const char s,
				1602	int size,
				1603	const char *errors)
				1604	{
				1605	PyUnicodeObject *v;
				1606	Py_UNICODE *p;
				1607
				1608	/* First get the size of the result */
				1609	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1610	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1611	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1612
				1613	v = _PyUnicode_New(usize);
				1614	if (v == NULL)
				1615	return NULL;
				1616	if (usize == 0)
				1617	return (PyObject *)v;
				1618	p = PyUnicode_AS_UNICODE(v);
				1619	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1620	Py_DECREF(v);
				1621	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1622	}
				1623
				1624	return (PyObject *)v;
				1625	}
				1626
				1627	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1628	int size,
				1629	const char *errors)
				1630	{
				1631	PyObject *repr;
				1632	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1633	DWORD mbcssize;
				1634
				1635	/* If there are no characters, bail now! */
				1636	if (size==0)
				1637	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1638
				1639	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1640	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1641	if (mbcssize==0)
				1642	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1643
				1644	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1645	if (repr == NULL)
				1646	return NULL;
				1647	if (mbcssize==0)
				1648	return repr;
				1649
				1650	/* Do the conversion */
				1651	s = PyString_AS_STRING(repr);
				1652	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1653	Py_DECREF(repr);
				1654	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1655	}
				1656	return repr;
				1657	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1658
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1659	#endif /* MS_WIN32 */
				1660
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1661	/* --- Character Mapping Codec -------------------------------------------- */
				1662
				1663	static
				1664	int charmap_decoding_error(const char **source,
				1665	Py_UNICODE **dest,
				1666	const char *errors,
				1667	const char *details)
				1668	{
				1669	if ((errors == NULL) \|\|
				1670	(strcmp(errors,"strict") == 0)) {
				1671	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1672	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1673	details);
				1674	return -1;
				1675	}
				1676	else if (strcmp(errors,"ignore") == 0) {
				1677	return 0;
				1678	}
				1679	else if (strcmp(errors,"replace") == 0) {
				1680	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1681	(*dest)++;
				1682	return 0;
				1683	}
				1684	else {
				1685	PyErr_Format(PyExc_ValueError,
				1686	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1687	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1688	errors);
				1689	return -1;
				1690	}
				1691	}
				1692
				1693	PyObject PyUnicode_DecodeCharmap(const char s,
				1694	int size,
				1695	PyObject *mapping,
				1696	const char *errors)
				1697	{
				1698	PyUnicodeObject *v;
				1699	Py_UNICODE *p;
				1700
				1701	/* Default to Latin-1 */
				1702	if (mapping == NULL)
				1703	return PyUnicode_DecodeLatin1(s, size, errors);
				1704
				1705	v = _PyUnicode_New(size);
				1706	if (v == NULL)
				1707	goto onError;
				1708	if (size == 0)
				1709	return (PyObject *)v;
				1710	p = PyUnicode_AS_UNICODE(v);
				1711	while (size-- > 0) {
				1712	unsigned char ch = *s++;
				1713	PyObject w, x;
				1714
				1715	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1716	w = PyInt_FromLong((long)ch);
				1717	if (w == NULL)
				1718	goto onError;
				1719	x = PyObject_GetItem(mapping, w);
				1720	Py_DECREF(w);
				1721	if (x == NULL) {
				1722	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1723	/* No mapping found: default to Latin-1 mapping */
				1724	PyErr_Clear();
				1725	*p++ = (Py_UNICODE)ch;
				1726	continue;
				1727	}
				1728	goto onError;
				1729	}
				1730
				1731	/* Apply mapping */
				1732	if (PyInt_Check(x)) {
				1733	int value = PyInt_AS_LONG(x);
				1734	if (value < 0 \|\| value > 65535) {
				1735	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1736	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1737	Py_DECREF(x);
				1738	goto onError;
				1739	}
				1740	*p++ = (Py_UNICODE)value;
				1741	}
				1742	else if (x == Py_None) {
				1743	/* undefined mapping */
				1744	if (charmap_decoding_error(&s, &p, errors,
				1745	"character maps to <undefined>")) {
				1746	Py_DECREF(x);
				1747	goto onError;
				1748	}
				1749	}
				1750	else if (PyUnicode_Check(x)) {
				1751	if (PyUnicode_GET_SIZE(x) != 1) {
				1752	/* 1-n mapping */
				1753	PyErr_SetString(PyExc_NotImplementedError,
				1754	"1-n mappings are currently not implemented");
				1755	Py_DECREF(x);
				1756	goto onError;
				1757	}
				1758	p++ = PyUnicode_AS_UNICODE(x);
				1759	}
				1760	else {
				1761	/* wrong return value */
				1762	PyErr_SetString(PyExc_TypeError,
				1763	"character mapping must return integer, None or unicode");
				1764	Py_DECREF(x);
				1765	goto onError;
				1766	}
				1767	Py_DECREF(x);
				1768	}
				1769	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1770	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1771	goto onError;
				1772	return (PyObject *)v;
				1773
				1774	onError:
				1775	Py_XDECREF(v);
				1776	return NULL;
				1777	}
				1778
				1779	static
				1780	int charmap_encoding_error(const Py_UNICODE **source,
				1781	char **dest,
				1782	const char *errors,
				1783	const char *details)
				1784	{
				1785	if ((errors == NULL) \|\|
				1786	(strcmp(errors,"strict") == 0)) {
				1787	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1788	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1789	details);
				1790	return -1;
				1791	}
				1792	else if (strcmp(errors,"ignore") == 0) {
				1793	return 0;
				1794	}
				1795	else if (strcmp(errors,"replace") == 0) {
				1796	**dest = '?';
				1797	(*dest)++;
				1798	return 0;
				1799	}
				1800	else {
				1801	PyErr_Format(PyExc_ValueError,
				1802	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1803	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1804	errors);
				1805	return -1;
				1806	}
				1807	}
				1808
				1809	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1810	int size,
				1811	PyObject *mapping,
				1812	const char *errors)
				1813	{
				1814	PyObject *v;
				1815	char *s;
				1816
				1817	/* Default to Latin-1 */
				1818	if (mapping == NULL)
				1819	return PyUnicode_EncodeLatin1(p, size, errors);
				1820
				1821	v = PyString_FromStringAndSize(NULL, size);
				1822	if (v == NULL)
				1823	return NULL;
				1824	s = PyString_AS_STRING(v);
				1825	while (size-- > 0) {
				1826	Py_UNICODE ch = *p++;
				1827	PyObject w, x;
				1828
				1829	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1830	w = PyInt_FromLong((long)ch);
				1831	if (w == NULL)
				1832	goto onError;
				1833	x = PyObject_GetItem(mapping, w);
				1834	Py_DECREF(w);
				1835	if (x == NULL) {
				1836	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1837	/* No mapping found: default to Latin-1 mapping if possible */
				1838	PyErr_Clear();
				1839	if (ch < 256) {
				1840	*s++ = (char)ch;
				1841	continue;
				1842	}
				1843	else if (!charmap_encoding_error(&p, &s, errors,
				1844	"missing character mapping"))
				1845	continue;
				1846	}
				1847	goto onError;
				1848	}
				1849
				1850	/* Apply mapping */
				1851	if (PyInt_Check(x)) {
				1852	int value = PyInt_AS_LONG(x);
				1853	if (value < 0 \|\| value > 255) {
				1854	PyErr_SetString(PyExc_TypeError,
				1855	"character mapping must be in range(256)");
				1856	Py_DECREF(x);
				1857	goto onError;
				1858	}
				1859	*s++ = (char)value;
				1860	}
				1861	else if (x == Py_None) {
				1862	/* undefined mapping */
				1863	if (charmap_encoding_error(&p, &s, errors,
				1864	"character maps to <undefined>")) {
				1865	Py_DECREF(x);
				1866	goto onError;
				1867	}
				1868	}
				1869	else if (PyString_Check(x)) {
				1870	if (PyString_GET_SIZE(x) != 1) {
				1871	/* 1-n mapping */
				1872	PyErr_SetString(PyExc_NotImplementedError,
				1873	"1-n mappings are currently not implemented");
				1874	Py_DECREF(x);
				1875	goto onError;
				1876	}
				1877	s++ = PyString_AS_STRING(x);
				1878	}
				1879	else {
				1880	/* wrong return value */
				1881	PyErr_SetString(PyExc_TypeError,
				1882	"character mapping must return integer, None or unicode");
				1883	Py_DECREF(x);
				1884	goto onError;
				1885	}
				1886	Py_DECREF(x);
				1887	}
				1888	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1889	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1890	goto onError;
				1891	return v;
				1892
				1893	onError:
				1894	Py_DECREF(v);
				1895	return NULL;
				1896	}
				1897
				1898	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1899	PyObject *mapping)
				1900	{
				1901	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1902	PyErr_BadArgument();
				1903	return NULL;
				1904	}
				1905	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1906	PyUnicode_GET_SIZE(unicode),
				1907	mapping,
				1908	NULL);
				1909	}
				1910
				1911	static
				1912	int translate_error(const Py_UNICODE **source,
				1913	Py_UNICODE **dest,
				1914	const char *errors,
				1915	const char *details)
				1916	{
				1917	if ((errors == NULL) \|\|
				1918	(strcmp(errors,"strict") == 0)) {
				1919	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1920	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1921	details);
				1922	return -1;
				1923	}
				1924	else if (strcmp(errors,"ignore") == 0) {
				1925	return 0;
				1926	}
				1927	else if (strcmp(errors,"replace") == 0) {
				1928	**dest = '?';
				1929	(*dest)++;
				1930	return 0;
				1931	}
				1932	else {
				1933	PyErr_Format(PyExc_ValueError,
				1934	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1935	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1936	errors);
				1937	return -1;
				1938	}
				1939	}
				1940
				1941	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1942	int size,
				1943	PyObject *mapping,
				1944	const char *errors)
				1945	{
				1946	PyUnicodeObject *v;
				1947	Py_UNICODE *p;
				1948
				1949	if (mapping == NULL) {
				1950	PyErr_BadArgument();
				1951	return NULL;
				1952	}
				1953
				1954	/* Output will never be longer than input */
				1955	v = _PyUnicode_New(size);
				1956	if (v == NULL)
				1957	goto onError;
				1958	if (size == 0)
				1959	goto done;
				1960	p = PyUnicode_AS_UNICODE(v);
				1961	while (size-- > 0) {
				1962	Py_UNICODE ch = *s++;
				1963	PyObject w, x;
				1964
				1965	/* Get mapping */
				1966	w = PyInt_FromLong(ch);
				1967	if (w == NULL)
				1968	goto onError;
				1969	x = PyObject_GetItem(mapping, w);
				1970	Py_DECREF(w);
				1971	if (x == NULL) {
				1972	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1973	/* No mapping found: default to 1-1 mapping */
				1974	PyErr_Clear();
				1975	*p++ = ch;
				1976	continue;
				1977	}
				1978	goto onError;
				1979	}
				1980
				1981	/* Apply mapping */
				1982	if (PyInt_Check(x))
				1983	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1984	else if (x == Py_None) {
				1985	/* undefined mapping */
				1986	if (translate_error(&s, &p, errors,
				1987	"character maps to <undefined>")) {
				1988	Py_DECREF(x);
				1989	goto onError;
				1990	}
				1991	}
				1992	else if (PyUnicode_Check(x)) {
				1993	if (PyUnicode_GET_SIZE(x) != 1) {
				1994	/* 1-n mapping */
				1995	PyErr_SetString(PyExc_NotImplementedError,
				1996	"1-n mappings are currently not implemented");
				1997	Py_DECREF(x);
				1998	goto onError;
				1999	}
				2000	p++ = PyUnicode_AS_UNICODE(x);
				2001	}
				2002	else {
				2003	/* wrong return value */
				2004	PyErr_SetString(PyExc_TypeError,
				2005	"translate mapping must return integer, None or unicode");
				2006	Py_DECREF(x);
				2007	goto onError;
				2008	}
				2009	Py_DECREF(x);
				2010	}
				2011	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2012	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2013	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2014
				2015	done:
				2016	return (PyObject *)v;
				2017
				2018	onError:
				2019	Py_XDECREF(v);
				2020	return NULL;
				2021	}
				2022
				2023	PyObject PyUnicode_Translate(PyObject str,
				2024	PyObject *mapping,
				2025	const char *errors)
				2026	{
				2027	PyObject *result;
				2028
				2029	str = PyUnicode_FromObject(str);
				2030	if (str == NULL)
				2031	goto onError;
				2032	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2033	PyUnicode_GET_SIZE(str),
				2034	mapping,
				2035	errors);
				2036	Py_DECREF(str);
				2037	return result;
				2038
				2039	onError:
				2040	Py_XDECREF(str);
				2041	return NULL;
				2042	}
				2043
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2044	/* --- Decimal Encoder ---------------------------------------------------- */
				2045
				2046	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2047	int length,
				2048	char *output,
				2049	const char *errors)
				2050	{
				2051	Py_UNICODE p, end;
				2052
				2053	if (output == NULL) {
				2054	PyErr_BadArgument();
				2055	return -1;
				2056	}
				2057
				2058	p = s;
				2059	end = s + length;
				2060	while (p < end) {
				2061	register Py_UNICODE ch = *p++;
				2062	int decimal;
				2063
				2064	if (Py_UNICODE_ISSPACE(ch)) {
				2065	*output++ = ' ';
				2066	continue;
				2067	}
				2068	decimal = Py_UNICODE_TODECIMAL(ch);
				2069	if (decimal >= 0) {
				2070	*output++ = '0' + decimal;
				2071	continue;
				2072	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2073	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2074	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2075	continue;
				2076	}
				2077	/* All other characters are considered invalid */
				2078	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2079	PyErr_SetString(PyExc_ValueError,
				2080	"invalid decimal Unicode string");
				2081	goto onError;
				2082	}
				2083	else if (strcmp(errors, "ignore") == 0)
				2084	continue;
				2085	else if (strcmp(errors, "replace") == 0) {
				2086	*output++ = '?';
				2087	continue;
				2088	}
				2089	}
				2090	/* 0-terminate the output string */
				2091	*output++ = '\0';
				2092	return 0;
				2093
				2094	onError:
				2095	return -1;
				2096	}
				2097
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2098	/* --- Helpers ------------------------------------------------------------ */
				2099
				2100	static
				2101	int count(PyUnicodeObject *self,
				2102	int start,
				2103	int end,
				2104	PyUnicodeObject *substring)
				2105	{
				2106	int count = 0;
				2107
				2108	end -= substring->length;
				2109
				2110	while (start <= end)
				2111	if (Py_UNICODE_MATCH(self, start, substring)) {
				2112	count++;
				2113	start += substring->length;
				2114	} else
				2115	start++;
				2116
				2117	return count;
				2118	}
				2119
				2120	int PyUnicode_Count(PyObject *str,
				2121	PyObject *substr,
				2122	int start,
				2123	int end)
				2124	{
				2125	int result;
				2126
				2127	str = PyUnicode_FromObject(str);
				2128	if (str == NULL)
				2129	return -1;
				2130	substr = PyUnicode_FromObject(substr);
				2131	if (substr == NULL) {
				2132	Py_DECREF(substr);
				2133	return -1;
				2134	}
				2135
				2136	result = count((PyUnicodeObject *)str,
				2137	start, end,
				2138	(PyUnicodeObject *)substr);
				2139
				2140	Py_DECREF(str);
				2141	Py_DECREF(substr);
				2142	return result;
				2143	}
				2144
				2145	static
				2146	int findstring(PyUnicodeObject *self,
				2147	PyUnicodeObject *substring,
				2148	int start,
				2149	int end,
				2150	int direction)
				2151	{
				2152	if (start < 0)
				2153	start += self->length;
				2154	if (start < 0)
				2155	start = 0;
				2156
				2157	if (substring->length == 0)
				2158	return start;
				2159
				2160	if (end > self->length)
				2161	end = self->length;
				2162	if (end < 0)
				2163	end += self->length;
				2164	if (end < 0)
				2165	end = 0;
				2166
				2167	end -= substring->length;
				2168
				2169	if (direction < 0) {
				2170	for (; end >= start; end--)
				2171	if (Py_UNICODE_MATCH(self, end, substring))
				2172	return end;
				2173	} else {
				2174	for (; start <= end; start++)
				2175	if (Py_UNICODE_MATCH(self, start, substring))
				2176	return start;
				2177	}
				2178
				2179	return -1;
				2180	}
				2181
				2182	int PyUnicode_Find(PyObject *str,
				2183	PyObject *substr,
				2184	int start,
				2185	int end,
				2186	int direction)
				2187	{
				2188	int result;
				2189
				2190	str = PyUnicode_FromObject(str);
				2191	if (str == NULL)
				2192	return -1;
				2193	substr = PyUnicode_FromObject(substr);
				2194	if (substr == NULL) {
				2195	Py_DECREF(substr);
				2196	return -1;
				2197	}
				2198
				2199	result = findstring((PyUnicodeObject *)str,
				2200	(PyUnicodeObject *)substr,
				2201	start, end, direction);
				2202	Py_DECREF(str);
				2203	Py_DECREF(substr);
				2204	return result;
				2205	}
				2206
				2207	static
				2208	int tailmatch(PyUnicodeObject *self,
				2209	PyUnicodeObject *substring,
				2210	int start,
				2211	int end,
				2212	int direction)
				2213	{
				2214	if (start < 0)
				2215	start += self->length;
				2216	if (start < 0)
				2217	start = 0;
				2218
				2219	if (substring->length == 0)
				2220	return 1;
				2221
				2222	if (end > self->length)
				2223	end = self->length;
				2224	if (end < 0)
				2225	end += self->length;
				2226	if (end < 0)
				2227	end = 0;
				2228
				2229	end -= substring->length;
				2230	if (end < start)
				2231	return 0;
				2232
				2233	if (direction > 0) {
				2234	if (Py_UNICODE_MATCH(self, end, substring))
				2235	return 1;
				2236	} else {
				2237	if (Py_UNICODE_MATCH(self, start, substring))
				2238	return 1;
				2239	}
				2240
				2241	return 0;
				2242	}
				2243
				2244	int PyUnicode_Tailmatch(PyObject *str,
				2245	PyObject *substr,
				2246	int start,
				2247	int end,
				2248	int direction)
				2249	{
				2250	int result;
				2251
				2252	str = PyUnicode_FromObject(str);
				2253	if (str == NULL)
				2254	return -1;
				2255	substr = PyUnicode_FromObject(substr);
				2256	if (substr == NULL) {
				2257	Py_DECREF(substr);
				2258	return -1;
				2259	}
				2260
				2261	result = tailmatch((PyUnicodeObject *)str,
				2262	(PyUnicodeObject *)substr,
				2263	start, end, direction);
				2264	Py_DECREF(str);
				2265	Py_DECREF(substr);
				2266	return result;
				2267	}
				2268
				2269	static
				2270	const Py_UNICODE findchar(const Py_UNICODE s,
				2271	int size,
				2272	Py_UNICODE ch)
				2273	{
				2274	/* like wcschr, but doesn't stop at NULL characters */
				2275
				2276	while (size-- > 0) {
				2277	if (*s == ch)
				2278	return s;
				2279	s++;
				2280	}
				2281
				2282	return NULL;
				2283	}
				2284
				2285	/* Apply fixfct filter to the Unicode object self and return a
				2286	reference to the modified object */
				2287
				2288	static
				2289	PyObject fixup(PyUnicodeObject self,
				2290	int (fixfct)(PyUnicodeObject s))
				2291	{
				2292
				2293	PyUnicodeObject *u;
				2294
				2295	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2296	self->length);
				2297	if (u == NULL)
				2298	return NULL;
				2299	if (!fixfct(u)) {
				2300	/* fixfct should return TRUE if it modified the buffer. If
				2301	FALSE, return a reference to the original buffer instead
				2302	(to save space, not time) */
				2303	Py_INCREF(self);
				2304	Py_DECREF(u);
				2305	return (PyObject*) self;
				2306	}
				2307	return (PyObject*) u;
				2308	}
				2309
				2310	static
				2311	int fixupper(PyUnicodeObject *self)
				2312	{
				2313	int len = self->length;
				2314	Py_UNICODE *s = self->str;
				2315	int status = 0;
				2316
				2317	while (len-- > 0) {
				2318	register Py_UNICODE ch;
				2319
				2320	ch = Py_UNICODE_TOUPPER(*s);
				2321	if (ch != *s) {
				2322	status = 1;
				2323	*s = ch;
				2324	}
				2325	s++;
				2326	}
				2327
				2328	return status;
				2329	}
				2330
				2331	static
				2332	int fixlower(PyUnicodeObject *self)
				2333	{
				2334	int len = self->length;
				2335	Py_UNICODE *s = self->str;
				2336	int status = 0;
				2337
				2338	while (len-- > 0) {
				2339	register Py_UNICODE ch;
				2340
				2341	ch = Py_UNICODE_TOLOWER(*s);
				2342	if (ch != *s) {
				2343	status = 1;
				2344	*s = ch;
				2345	}
				2346	s++;
				2347	}
				2348
				2349	return status;
				2350	}
				2351
				2352	static
				2353	int fixswapcase(PyUnicodeObject *self)
				2354	{
				2355	int len = self->length;
				2356	Py_UNICODE *s = self->str;
				2357	int status = 0;
				2358
				2359	while (len-- > 0) {
				2360	if (Py_UNICODE_ISUPPER(*s)) {
				2361	s = Py_UNICODE_TOLOWER(s);
				2362	status = 1;
				2363	} else if (Py_UNICODE_ISLOWER(*s)) {
				2364	s = Py_UNICODE_TOUPPER(s);
				2365	status = 1;
				2366	}
				2367	s++;
				2368	}
				2369
				2370	return status;
				2371	}
				2372
				2373	static
				2374	int fixcapitalize(PyUnicodeObject *self)
				2375	{
				2376	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2377	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2378	return 1;
				2379	}
				2380	return 0;
				2381	}
				2382
				2383	static
				2384	int fixtitle(PyUnicodeObject *self)
				2385	{
				2386	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2387	register Py_UNICODE *e;
				2388	int previous_is_cased;
				2389
				2390	/* Shortcut for single character strings */
				2391	if (PyUnicode_GET_SIZE(self) == 1) {
				2392	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2393	if (*p != ch) {
				2394	*p = ch;
				2395	return 1;
				2396	}
				2397	else
				2398	return 0;
				2399	}
				2400
				2401	e = p + PyUnicode_GET_SIZE(self);
				2402	previous_is_cased = 0;
				2403	for (; p < e; p++) {
				2404	register const Py_UNICODE ch = *p;
				2405
				2406	if (previous_is_cased)
				2407	*p = Py_UNICODE_TOLOWER(ch);
				2408	else
				2409	*p = Py_UNICODE_TOTITLE(ch);
				2410
				2411	if (Py_UNICODE_ISLOWER(ch) \|\|
				2412	Py_UNICODE_ISUPPER(ch) \|\|
				2413	Py_UNICODE_ISTITLE(ch))
				2414	previous_is_cased = 1;
				2415	else
				2416	previous_is_cased = 0;
				2417	}
				2418	return 1;
				2419	}
				2420
				2421	PyObject PyUnicode_Join(PyObject separator,
				2422	PyObject *seq)
				2423	{
				2424	Py_UNICODE *sep;
				2425	int seplen;
				2426	PyUnicodeObject *res = NULL;
				2427	int reslen = 0;
				2428	Py_UNICODE *p;
				2429	int seqlen = 0;
				2430	int sz = 100;
				2431	int i;
				2432
				2433	seqlen = PySequence_Length(seq);
				2434	if (seqlen < 0 && PyErr_Occurred())
				2435	return NULL;
				2436
				2437	if (separator == NULL) {
				2438	Py_UNICODE blank = ' ';
				2439	sep = &blank;
				2440	seplen = 1;
				2441	}
				2442	else {
				2443	separator = PyUnicode_FromObject(separator);
				2444	if (separator == NULL)
				2445	return NULL;
				2446	sep = PyUnicode_AS_UNICODE(separator);
				2447	seplen = PyUnicode_GET_SIZE(separator);
				2448	}
				2449
				2450	res = _PyUnicode_New(sz);
				2451	if (res == NULL)
				2452	goto onError;
				2453	p = PyUnicode_AS_UNICODE(res);
				2454	reslen = 0;
				2455
				2456	for (i = 0; i < seqlen; i++) {
				2457	int itemlen;
				2458	PyObject *item;
				2459
				2460	item = PySequence_GetItem(seq, i);
				2461	if (item == NULL)
				2462	goto onError;
				2463	if (!PyUnicode_Check(item)) {
				2464	PyObject *v;
				2465	v = PyUnicode_FromObject(item);
				2466	Py_DECREF(item);
				2467	item = v;
				2468	if (item == NULL)
				2469	goto onError;
				2470	}
				2471	itemlen = PyUnicode_GET_SIZE(item);
				2472	while (reslen + itemlen + seplen >= sz) {
				2473	if (_PyUnicode_Resize(res, sz*2))
				2474	goto onError;
				2475	sz *= 2;
				2476	p = PyUnicode_AS_UNICODE(res) + reslen;
				2477	}
				2478	if (i > 0) {
				2479	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2480	p += seplen;
				2481	reslen += seplen;
				2482	}
				2483	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2484	p += itemlen;
				2485	reslen += itemlen;
				2486	Py_DECREF(item);
				2487	}
				2488	if (_PyUnicode_Resize(res, reslen))
				2489	goto onError;
				2490
				2491	Py_XDECREF(separator);
				2492	return (PyObject *)res;
				2493
				2494	onError:
				2495	Py_XDECREF(separator);
				2496	Py_DECREF(res);
				2497	return NULL;
				2498	}
				2499
				2500	static
				2501	PyUnicodeObject pad(PyUnicodeObject self,
				2502	int left,
				2503	int right,
				2504	Py_UNICODE fill)
				2505	{
				2506	PyUnicodeObject *u;
				2507
				2508	if (left < 0)
				2509	left = 0;
				2510	if (right < 0)
				2511	right = 0;
				2512
				2513	if (left == 0 && right == 0) {
				2514	Py_INCREF(self);
				2515	return self;
				2516	}
				2517
				2518	u = _PyUnicode_New(left + self->length + right);
				2519	if (u) {
				2520	if (left)
				2521	Py_UNICODE_FILL(u->str, fill, left);
				2522	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2523	if (right)
				2524	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2525	}
				2526
				2527	return u;
				2528	}
				2529
				2530	#define SPLIT_APPEND(data, left, right) \
				2531	str = PyUnicode_FromUnicode(data + left, right - left); \
				2532	if (!str) \
				2533	goto onError; \
				2534	if (PyList_Append(list, str)) { \
				2535	Py_DECREF(str); \
				2536	goto onError; \
				2537	} \
				2538	else \
				2539	Py_DECREF(str);
				2540
				2541	static
				2542	PyObject split_whitespace(PyUnicodeObject self,
				2543	PyObject *list,
				2544	int maxcount)
				2545	{
				2546	register int i;
				2547	register int j;
				2548	int len = self->length;
				2549	PyObject *str;
				2550
				2551	for (i = j = 0; i < len; ) {
				2552	/* find a token */
				2553	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2554	i++;
				2555	j = i;
				2556	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2557	i++;
				2558	if (j < i) {
				2559	if (maxcount-- <= 0)
				2560	break;
				2561	SPLIT_APPEND(self->str, j, i);
				2562	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2563	i++;
				2564	j = i;
				2565	}
				2566	}
				2567	if (j < len) {
				2568	SPLIT_APPEND(self->str, j, len);
				2569	}
				2570	return list;
				2571
				2572	onError:
				2573	Py_DECREF(list);
				2574	return NULL;
				2575	}
				2576
				2577	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2578	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2579	{
				2580	register int i;
				2581	register int j;
				2582	int len;
				2583	PyObject *list;
				2584	PyObject *str;
				2585	Py_UNICODE *data;
				2586
				2587	string = PyUnicode_FromObject(string);
				2588	if (string == NULL)
				2589	return NULL;
				2590	data = PyUnicode_AS_UNICODE(string);
				2591	len = PyUnicode_GET_SIZE(string);
				2592
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2593	list = PyList_New(0);
				2594	if (!list)
				2595	goto onError;
				2596
				2597	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2598	int eol;
				2599
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2600	/* Find a line and append it */
				2601	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2602	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2603
				2604	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2605	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2606	if (i < len) {
				2607	if (data[i] == '\r' && i + 1 < len &&
				2608	data[i+1] == '\n')
				2609	i += 2;
				2610	else
				2611	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2612	if (keepends)
				2613	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2614	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2615	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2616	j = i;
				2617	}
				2618	if (j < len) {
				2619	SPLIT_APPEND(data, j, len);
				2620	}
				2621
				2622	Py_DECREF(string);
				2623	return list;
				2624
				2625	onError:
				2626	Py_DECREF(list);
				2627	Py_DECREF(string);
				2628	return NULL;
				2629	}
				2630
				2631	static
				2632	PyObject split_char(PyUnicodeObject self,
				2633	PyObject *list,
				2634	Py_UNICODE ch,
				2635	int maxcount)
				2636	{
				2637	register int i;
				2638	register int j;
				2639	int len = self->length;
				2640	PyObject *str;
				2641
				2642	for (i = j = 0; i < len; ) {
				2643	if (self->str[i] == ch) {
				2644	if (maxcount-- <= 0)
				2645	break;
				2646	SPLIT_APPEND(self->str, j, i);
				2647	i = j = i + 1;
				2648	} else
				2649	i++;
				2650	}
				2651	if (j <= len) {
				2652	SPLIT_APPEND(self->str, j, len);
				2653	}
				2654	return list;
				2655
				2656	onError:
				2657	Py_DECREF(list);
				2658	return NULL;
				2659	}
				2660
				2661	static
				2662	PyObject split_substring(PyUnicodeObject self,
				2663	PyObject *list,
				2664	PyUnicodeObject *substring,
				2665	int maxcount)
				2666	{
				2667	register int i;
				2668	register int j;
				2669	int len = self->length;
				2670	int sublen = substring->length;
				2671	PyObject *str;
				2672
				2673	for (i = j = 0; i < len - sublen; ) {
				2674	if (Py_UNICODE_MATCH(self, i, substring)) {
				2675	if (maxcount-- <= 0)
				2676	break;
				2677	SPLIT_APPEND(self->str, j, i);
				2678	i = j = i + sublen;
				2679	} else
				2680	i++;
				2681	}
				2682	if (j <= len) {
				2683	SPLIT_APPEND(self->str, j, len);
				2684	}
				2685	return list;
				2686
				2687	onError:
				2688	Py_DECREF(list);
				2689	return NULL;
				2690	}
				2691
				2692	#undef SPLIT_APPEND
				2693
				2694	static
				2695	PyObject split(PyUnicodeObject self,
				2696	PyUnicodeObject *substring,
				2697	int maxcount)
				2698	{
				2699	PyObject *list;
				2700
				2701	if (maxcount < 0)
				2702	maxcount = INT_MAX;
				2703
				2704	list = PyList_New(0);
				2705	if (!list)
				2706	return NULL;
				2707
				2708	if (substring == NULL)
				2709	return split_whitespace(self,list,maxcount);
				2710
				2711	else if (substring->length == 1)
				2712	return split_char(self,list,substring->str[0],maxcount);
				2713
				2714	else if (substring->length == 0) {
				2715	Py_DECREF(list);
				2716	PyErr_SetString(PyExc_ValueError, "empty separator");
				2717	return NULL;
				2718	}
				2719	else
				2720	return split_substring(self,list,substring,maxcount);
				2721	}
				2722
				2723	static
				2724	PyObject strip(PyUnicodeObject self,
				2725	int left,
				2726	int right)
				2727	{
				2728	Py_UNICODE *p = self->str;
				2729	int start = 0;
				2730	int end = self->length;
				2731
				2732	if (left)
				2733	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2734	start++;
				2735
				2736	if (right)
				2737	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2738	end--;
				2739
				2740	if (start == 0 && end == self->length) {
				2741	/* couldn't strip anything off, return original string */
				2742	Py_INCREF(self);
				2743	return (PyObject*) self;
				2744	}
				2745
				2746	return (PyObject*) PyUnicode_FromUnicode(
				2747	self->str + start,
				2748	end - start
				2749	);
				2750	}
				2751
				2752	static
				2753	PyObject replace(PyUnicodeObject self,
				2754	PyUnicodeObject *str1,
				2755	PyUnicodeObject *str2,
				2756	int maxcount)
				2757	{
				2758	PyUnicodeObject *u;
				2759
				2760	if (maxcount < 0)
				2761	maxcount = INT_MAX;
				2762
				2763	if (str1->length == 1 && str2->length == 1) {
				2764	int i;
				2765
				2766	/* replace characters */
				2767	if (!findchar(self->str, self->length, str1->str[0])) {
				2768	/* nothing to replace, return original string */
				2769	Py_INCREF(self);
				2770	u = self;
				2771	} else {
				2772	Py_UNICODE u1 = str1->str[0];
				2773	Py_UNICODE u2 = str2->str[0];
				2774
				2775	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2776	self->str,
				2777	self->length
				2778	);
				2779	if (u)
				2780	for (i = 0; i < u->length; i++)
				2781	if (u->str[i] == u1) {
				2782	if (--maxcount < 0)
				2783	break;
				2784	u->str[i] = u2;
				2785	}
				2786	}
				2787
				2788	} else {
				2789	int n, i;
				2790	Py_UNICODE *p;
				2791
				2792	/* replace strings */
				2793	n = count(self, 0, self->length, str1);
				2794	if (n > maxcount)
				2795	n = maxcount;
				2796	if (n == 0) {
				2797	/* nothing to replace, return original string */
				2798	Py_INCREF(self);
				2799	u = self;
				2800	} else {
				2801	u = _PyUnicode_New(
				2802	self->length + n * (str2->length - str1->length));
				2803	if (u) {
				2804	i = 0;
				2805	p = u->str;
				2806	while (i <= self->length - str1->length)
				2807	if (Py_UNICODE_MATCH(self, i, str1)) {
				2808	/* replace string segment */
				2809	Py_UNICODE_COPY(p, str2->str, str2->length);
				2810	p += str2->length;
				2811	i += str1->length;
				2812	if (--n <= 0) {
				2813	/* copy remaining part */
				2814	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2815	break;
				2816	}
				2817	} else
				2818	*p++ = self->str[i++];
				2819	}
				2820	}
				2821	}
				2822
				2823	return (PyObject *) u;
				2824	}
				2825
				2826	/* --- Unicode Object Methods --------------------------------------------- */
				2827
				2828	static char title__doc__[] =
				2829	"S.title() -> unicode\n\
				2830	\n\
				2831	Return a titlecased version of S, i.e. words start with title case\n\
				2832	characters, all remaining cased characters have lower case.";
				2833
				2834	static PyObject*
				2835	unicode_title(PyUnicodeObject self, PyObject args)
				2836	{
				2837	if (!PyArg_NoArgs(args))
				2838	return NULL;
				2839	return fixup(self, fixtitle);
				2840	}
				2841
				2842	static char capitalize__doc__[] =
				2843	"S.capitalize() -> unicode\n\
				2844	\n\
				2845	Return a capitalized version of S, i.e. make the first character\n\
				2846	have upper case.";
				2847
				2848	static PyObject*
				2849	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2850	{
				2851	if (!PyArg_NoArgs(args))
				2852	return NULL;
				2853	return fixup(self, fixcapitalize);
				2854	}
				2855
				2856	#if 0
				2857	static char capwords__doc__[] =
				2858	"S.capwords() -> unicode\n\
				2859	\n\
				2860	Apply .capitalize() to all words in S and return the result with\n\
				2861	normalized whitespace (all whitespace strings are replaced by ' ').";
				2862
				2863	static PyObject*
				2864	unicode_capwords(PyUnicodeObject self, PyObject args)
				2865	{
				2866	PyObject *list;
				2867	PyObject *item;
				2868	int i;
				2869
				2870	if (!PyArg_NoArgs(args))
				2871	return NULL;
				2872
				2873	/* Split into words */
				2874	list = split(self, NULL, -1);
				2875	if (!list)
				2876	return NULL;
				2877
				2878	/* Capitalize each word */
				2879	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2880	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2881	fixcapitalize);
				2882	if (item == NULL)
				2883	goto onError;
				2884	Py_DECREF(PyList_GET_ITEM(list, i));
				2885	PyList_SET_ITEM(list, i, item);
				2886	}
				2887
				2888	/* Join the words to form a new string */
				2889	item = PyUnicode_Join(NULL, list);
				2890
				2891	onError:
				2892	Py_DECREF(list);
				2893	return (PyObject *)item;
				2894	}
				2895	#endif
				2896
				2897	static char center__doc__[] =
				2898	"S.center(width) -> unicode\n\
				2899	\n\
				2900	Return S centered in a Unicode string of length width. Padding is done\n\
				2901	using spaces.";
				2902
				2903	static PyObject *
				2904	unicode_center(PyUnicodeObject self, PyObject args)
				2905	{
				2906	int marg, left;
				2907	int width;
				2908
				2909	if (!PyArg_ParseTuple(args, "i:center", &width))
				2910	return NULL;
				2911
				2912	if (self->length >= width) {
				2913	Py_INCREF(self);
				2914	return (PyObject*) self;
				2915	}
				2916
				2917	marg = width - self->length;
				2918	left = marg / 2 + (marg & width & 1);
				2919
				2920	return (PyObject*) pad(self, left, marg - left, ' ');
				2921	}
				2922
				2923	static int
				2924	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2925	{
				2926	int len1, len2;
				2927	Py_UNICODE *s1 = str1->str;
				2928	Py_UNICODE *s2 = str2->str;
				2929
				2930	len1 = str1->length;
				2931	len2 = str2->length;
				2932
				2933	while (len1 > 0 && len2 > 0) {
				2934	int cmp = (s1++) - (s2++);
				2935	if (cmp)
				2936	/* This should make Christian happy! */
				2937	return (cmp < 0) ? -1 : (cmp != 0);
				2938	len1--, len2--;
				2939	}
				2940
				2941	return (len1 < len2) ? -1 : (len1 != len2);
				2942	}
				2943
				2944	int PyUnicode_Compare(PyObject *left,
				2945	PyObject *right)
				2946	{
				2947	PyUnicodeObject u = NULL, v = NULL;
				2948	int result;
				2949
				2950	/* Coerce the two arguments */
				2951	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2952	if (u == NULL)
				2953	goto onError;
				2954	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2955	if (v == NULL)
				2956	goto onError;
				2957
				2958	/* Shortcut for emtpy or interned objects */
				2959	if (v == u) {
				2960	Py_DECREF(u);
				2961	Py_DECREF(v);
				2962	return 0;
				2963	}
				2964
				2965	result = unicode_compare(u, v);
				2966
				2967	Py_DECREF(u);
				2968	Py_DECREF(v);
				2969	return result;
				2970
				2971	onError:
				2972	Py_XDECREF(u);
				2973	Py_XDECREF(v);
				2974	return -1;
				2975	}
				2976
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2977	int PyUnicode_Contains(PyObject *container,
				2978	PyObject *element)
				2979	{
				2980	PyUnicodeObject u = NULL, v = NULL;
				2981	int result;
				2982	register const Py_UNICODE p, e;
				2983	register Py_UNICODE ch;
				2984
				2985	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2986	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2987	if (v == NULL)
				2988	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2989	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2990	if (u == NULL) {
				2991	Py_DECREF(v);
				2992	goto onError;
				2993	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2994
				2995	/* Check v in u */
				2996	if (PyUnicode_GET_SIZE(v) != 1) {
				2997	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	2998	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2999	goto onError;
				3000	}
				3001	ch = *PyUnicode_AS_UNICODE(v);
				3002	p = PyUnicode_AS_UNICODE(u);
				3003	e = p + PyUnicode_GET_SIZE(u);
				3004	result = 0;
				3005	while (p < e) {
				3006	if (*p++ == ch) {
				3007	result = 1;
				3008	break;
				3009	}
				3010	}
				3011
				3012	Py_DECREF(u);
				3013	Py_DECREF(v);
				3014	return result;
				3015
				3016	onError:
				3017	Py_XDECREF(u);
				3018	Py_XDECREF(v);
				3019	return -1;
				3020	}
				3021
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3022	/* Concat to string or Unicode object giving a new Unicode object. */
				3023
				3024	PyObject PyUnicode_Concat(PyObject left,
				3025	PyObject *right)
				3026	{
				3027	PyUnicodeObject u = NULL, v = NULL, *w;
				3028
				3029	/* Coerce the two arguments */
				3030	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3031	if (u == NULL)
				3032	goto onError;
				3033	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3034	if (v == NULL)
				3035	goto onError;
				3036
				3037	/* Shortcuts */
				3038	if (v == unicode_empty) {
				3039	Py_DECREF(v);
				3040	return (PyObject *)u;
				3041	}
				3042	if (u == unicode_empty) {
				3043	Py_DECREF(u);
				3044	return (PyObject *)v;
				3045	}
				3046
				3047	/* Concat the two Unicode strings */
				3048	w = _PyUnicode_New(u->length + v->length);
				3049	if (w == NULL)
				3050	goto onError;
				3051	Py_UNICODE_COPY(w->str, u->str, u->length);
				3052	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3053
				3054	Py_DECREF(u);
				3055	Py_DECREF(v);
				3056	return (PyObject *)w;
				3057
				3058	onError:
				3059	Py_XDECREF(u);
				3060	Py_XDECREF(v);
				3061	return NULL;
				3062	}
				3063
				3064	static char count__doc__[] =
				3065	"S.count(sub[, start[, end]]) -> int\n\
				3066	\n\
				3067	Return the number of occurrences of substring sub in Unicode string\n\
				3068	S[start:end]. Optional arguments start and end are\n\
				3069	interpreted as in slice notation.";
				3070
				3071	static PyObject *
				3072	unicode_count(PyUnicodeObject self, PyObject args)
				3073	{
				3074	PyUnicodeObject *substring;
				3075	int start = 0;
				3076	int end = INT_MAX;
				3077	PyObject *result;
				3078
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3079	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3080	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3081	return NULL;
				3082
				3083	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3084	(PyObject *)substring);
				3085	if (substring == NULL)
				3086	return NULL;
				3087
				3088	if (substring->length == 0) {
				3089	Py_DECREF(substring);
				3090	return PyInt_FromLong((long) 0);
				3091	}
				3092
				3093	if (start < 0)
				3094	start += self->length;
				3095	if (start < 0)
				3096	start = 0;
				3097	if (end > self->length)
				3098	end = self->length;
				3099	if (end < 0)
				3100	end += self->length;
				3101	if (end < 0)
				3102	end = 0;
				3103
				3104	result = PyInt_FromLong((long) count(self, start, end, substring));
				3105
				3106	Py_DECREF(substring);
				3107	return result;
				3108	}
				3109
				3110	static char encode__doc__[] =
				3111	"S.encode([encoding[,errors]]) -> string\n\
				3112	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3113	Return an encoded string version of S. Default encoding is the current\n\
				3114	default string encoding. errors may be given to set a different error\n\
				3115	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3116	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3117
				3118	static PyObject *
				3119	unicode_encode(PyUnicodeObject self, PyObject args)
				3120	{
				3121	char *encoding = NULL;
				3122	char *errors = NULL;
				3123	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3124	return NULL;
				3125	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3126	}
				3127
				3128	static char expandtabs__doc__[] =
				3129	"S.expandtabs([tabsize]) -> unicode\n\
				3130	\n\
				3131	Return a copy of S where all tab characters are expanded using spaces.\n\
				3132	If tabsize is not given, a tab size of 8 characters is assumed.";
				3133
				3134	static PyObject*
				3135	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3136	{
				3137	Py_UNICODE *e;
				3138	Py_UNICODE *p;
				3139	Py_UNICODE *q;
				3140	int i, j;
				3141	PyUnicodeObject *u;
				3142	int tabsize = 8;
				3143
				3144	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3145	return NULL;
				3146
				3147	/* First pass: determine size of ouput string */
				3148	i = j = 0;
				3149	e = self->str + self->length;
				3150	for (p = self->str; p < e; p++)
				3151	if (*p == '\t') {
				3152	if (tabsize > 0)
				3153	j += tabsize - (j % tabsize);
				3154	}
				3155	else {
				3156	j++;
				3157	if (p == '\n' \|\| p == '\r') {
				3158	i += j;
				3159	j = 0;
				3160	}
				3161	}
				3162
				3163	/* Second pass: create output string and fill it */
				3164	u = _PyUnicode_New(i + j);
				3165	if (!u)
				3166	return NULL;
				3167
				3168	j = 0;
				3169	q = u->str;
				3170
				3171	for (p = self->str; p < e; p++)
				3172	if (*p == '\t') {
				3173	if (tabsize > 0) {
				3174	i = tabsize - (j % tabsize);
				3175	j += i;
				3176	while (i--)
				3177	*q++ = ' ';
				3178	}
				3179	}
				3180	else {
				3181	j++;
				3182	q++ = p;
				3183	if (p == '\n' \|\| p == '\r')
				3184	j = 0;
				3185	}
				3186
				3187	return (PyObject*) u;
				3188	}
				3189
				3190	static char find__doc__[] =
				3191	"S.find(sub [,start [,end]]) -> int\n\
				3192	\n\
				3193	Return the lowest index in S where substring sub is found,\n\
				3194	such that sub is contained within s[start,end]. Optional\n\
				3195	arguments start and end are interpreted as in slice notation.\n\
				3196	\n\
				3197	Return -1 on failure.";
				3198
				3199	static PyObject *
				3200	unicode_find(PyUnicodeObject self, PyObject args)
				3201	{
				3202	PyUnicodeObject *substring;
				3203	int start = 0;
				3204	int end = INT_MAX;
				3205	PyObject *result;
				3206
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3207	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3208	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3209	return NULL;
				3210	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3211	(PyObject *)substring);
				3212	if (substring == NULL)
				3213	return NULL;
				3214
				3215	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3216
				3217	Py_DECREF(substring);
				3218	return result;
				3219	}
				3220
				3221	static PyObject *
				3222	unicode_getitem(PyUnicodeObject *self, int index)
				3223	{
				3224	if (index < 0 \|\| index >= self->length) {
				3225	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3226	return NULL;
				3227	}
				3228
				3229	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3230	}
				3231
				3232	static long
				3233	unicode_hash(PyUnicodeObject *self)
				3234	{
				3235	long hash;
				3236	PyObject *utf8;
				3237
				3238	/* Since Unicode objects compare equal to their UTF-8 string
				3239	counterparts, they should also use the UTF-8 strings as basis
				3240	for their hash value. This is needed to assure that strings and
				3241	Unicode objects behave in the same way as dictionary
				3242	keys. Unfortunately, this costs some performance and also some
				3243	memory if the cached UTF-8 representation is not used later
				3244	on. */
				3245	if (self->hash != -1)
				3246	return self->hash;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	3247	utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3248	if (utf8 == NULL)
				3249	return -1;
				3250	hash = PyObject_Hash(utf8);
				3251	if (hash == -1)
				3252	return -1;
				3253	self->hash = hash;
				3254	return hash;
				3255	}
				3256
				3257	static char index__doc__[] =
				3258	"S.index(sub [,start [,end]]) -> int\n\
				3259	\n\
				3260	Like S.find() but raise ValueError when the substring is not found.";
				3261
				3262	static PyObject *
				3263	unicode_index(PyUnicodeObject self, PyObject args)
				3264	{
				3265	int result;
				3266	PyUnicodeObject *substring;
				3267	int start = 0;
				3268	int end = INT_MAX;
				3269
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3270	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3271	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3272	return NULL;
				3273
				3274	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3275	(PyObject *)substring);
				3276	if (substring == NULL)
				3277	return NULL;
				3278
				3279	result = findstring(self, substring, start, end, 1);
				3280
				3281	Py_DECREF(substring);
				3282	if (result < 0) {
				3283	PyErr_SetString(PyExc_ValueError, "substring not found");
				3284	return NULL;
				3285	}
				3286	return PyInt_FromLong(result);
				3287	}
				3288
				3289	static char islower__doc__[] =
				3290	"S.islower() -> int\n\
				3291	\n\
				3292	Return 1 if all cased characters in S are lowercase and there is\n\
				3293	at least one cased character in S, 0 otherwise.";
				3294
				3295	static PyObject*
				3296	unicode_islower(PyUnicodeObject self, PyObject args)
				3297	{
				3298	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3299	register const Py_UNICODE *e;
				3300	int cased;
				3301
				3302	if (!PyArg_NoArgs(args))
				3303	return NULL;
				3304
				3305	/* Shortcut for single character strings */
				3306	if (PyUnicode_GET_SIZE(self) == 1)
				3307	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3308
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame^]	3309	/* Special case for empty strings */
				3310	if (PyString_GET_SIZE(self) == 0)
				3311	return PyInt_FromLong(0);
				3312
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3313	e = p + PyUnicode_GET_SIZE(self);
				3314	cased = 0;
				3315	for (; p < e; p++) {
				3316	register const Py_UNICODE ch = *p;
				3317
				3318	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3319	return PyInt_FromLong(0);
				3320	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3321	cased = 1;
				3322	}
				3323	return PyInt_FromLong(cased);
				3324	}
				3325
				3326	static char isupper__doc__[] =
				3327	"S.isupper() -> int\n\
				3328	\n\
				3329	Return 1 if all cased characters in S are uppercase and there is\n\
				3330	at least one cased character in S, 0 otherwise.";
				3331
				3332	static PyObject*
				3333	unicode_isupper(PyUnicodeObject self, PyObject args)
				3334	{
				3335	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3336	register const Py_UNICODE *e;
				3337	int cased;
				3338
				3339	if (!PyArg_NoArgs(args))
				3340	return NULL;
				3341
				3342	/* Shortcut for single character strings */
				3343	if (PyUnicode_GET_SIZE(self) == 1)
				3344	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3345
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame^]	3346	/* Special case for empty strings */
				3347	if (PyString_GET_SIZE(self) == 0)
				3348	return PyInt_FromLong(0);
				3349
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3350	e = p + PyUnicode_GET_SIZE(self);
				3351	cased = 0;
				3352	for (; p < e; p++) {
				3353	register const Py_UNICODE ch = *p;
				3354
				3355	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3356	return PyInt_FromLong(0);
				3357	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3358	cased = 1;
				3359	}
				3360	return PyInt_FromLong(cased);
				3361	}
				3362
				3363	static char istitle__doc__[] =
				3364	"S.istitle() -> int\n\
				3365	\n\
				3366	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3367	may only follow uncased characters and lowercase characters only cased\n\
				3368	ones. Return 0 otherwise.";
				3369
				3370	static PyObject*
				3371	unicode_istitle(PyUnicodeObject self, PyObject args)
				3372	{
				3373	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3374	register const Py_UNICODE *e;
				3375	int cased, previous_is_cased;
				3376
				3377	if (!PyArg_NoArgs(args))
				3378	return NULL;
				3379
				3380	/* Shortcut for single character strings */
				3381	if (PyUnicode_GET_SIZE(self) == 1)
				3382	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3383	(Py_UNICODE_ISUPPER(*p) != 0));
				3384
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame^]	3385	/* Special case for empty strings */
				3386	if (PyString_GET_SIZE(self) == 0)
				3387	return PyInt_FromLong(0);
				3388
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3389	e = p + PyUnicode_GET_SIZE(self);
				3390	cased = 0;
				3391	previous_is_cased = 0;
				3392	for (; p < e; p++) {
				3393	register const Py_UNICODE ch = *p;
				3394
				3395	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3396	if (previous_is_cased)
				3397	return PyInt_FromLong(0);
				3398	previous_is_cased = 1;
				3399	cased = 1;
				3400	}
				3401	else if (Py_UNICODE_ISLOWER(ch)) {
				3402	if (!previous_is_cased)
				3403	return PyInt_FromLong(0);
				3404	previous_is_cased = 1;
				3405	cased = 1;
				3406	}
				3407	else
				3408	previous_is_cased = 0;
				3409	}
				3410	return PyInt_FromLong(cased);
				3411	}
				3412
				3413	static char isspace__doc__[] =
				3414	"S.isspace() -> int\n\
				3415	\n\
				3416	Return 1 if there are only whitespace characters in S,\n\
				3417	0 otherwise.";
				3418
				3419	static PyObject*
				3420	unicode_isspace(PyUnicodeObject self, PyObject args)
				3421	{
				3422	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3423	register const Py_UNICODE *e;
				3424
				3425	if (!PyArg_NoArgs(args))
				3426	return NULL;
				3427
				3428	/* Shortcut for single character strings */
				3429	if (PyUnicode_GET_SIZE(self) == 1 &&
				3430	Py_UNICODE_ISSPACE(*p))
				3431	return PyInt_FromLong(1);
				3432
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame^]	3433	/* Special case for empty strings */
				3434	if (PyString_GET_SIZE(self) == 0)
				3435	return PyInt_FromLong(0);
				3436
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3437	e = p + PyUnicode_GET_SIZE(self);
				3438	for (; p < e; p++) {
				3439	if (!Py_UNICODE_ISSPACE(*p))
				3440	return PyInt_FromLong(0);
				3441	}
				3442	return PyInt_FromLong(1);
				3443	}
				3444
				3445	static char isdecimal__doc__[] =
				3446	"S.isdecimal() -> int\n\
				3447	\n\
				3448	Return 1 if there are only decimal characters in S,\n\
				3449	0 otherwise.";
				3450
				3451	static PyObject*
				3452	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3453	{
				3454	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3455	register const Py_UNICODE *e;
				3456
				3457	if (!PyArg_NoArgs(args))
				3458	return NULL;
				3459
				3460	/* Shortcut for single character strings */
				3461	if (PyUnicode_GET_SIZE(self) == 1 &&
				3462	Py_UNICODE_ISDECIMAL(*p))
				3463	return PyInt_FromLong(1);
				3464
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame^]	3465	/* Special case for empty strings */
				3466	if (PyString_GET_SIZE(self) == 0)
				3467	return PyInt_FromLong(0);
				3468
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3469	e = p + PyUnicode_GET_SIZE(self);
				3470	for (; p < e; p++) {
				3471	if (!Py_UNICODE_ISDECIMAL(*p))
				3472	return PyInt_FromLong(0);
				3473	}
				3474	return PyInt_FromLong(1);
				3475	}
				3476
				3477	static char isdigit__doc__[] =
				3478	"S.isdigit() -> int\n\
				3479	\n\
				3480	Return 1 if there are only digit characters in S,\n\
				3481	0 otherwise.";
				3482
				3483	static PyObject*
				3484	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3485	{
				3486	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3487	register const Py_UNICODE *e;
				3488
				3489	if (!PyArg_NoArgs(args))
				3490	return NULL;
				3491
				3492	/* Shortcut for single character strings */
				3493	if (PyUnicode_GET_SIZE(self) == 1 &&
				3494	Py_UNICODE_ISDIGIT(*p))
				3495	return PyInt_FromLong(1);
				3496
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame^]	3497	/* Special case for empty strings */
				3498	if (PyString_GET_SIZE(self) == 0)
				3499	return PyInt_FromLong(0);
				3500
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3501	e = p + PyUnicode_GET_SIZE(self);
				3502	for (; p < e; p++) {
				3503	if (!Py_UNICODE_ISDIGIT(*p))
				3504	return PyInt_FromLong(0);
				3505	}
				3506	return PyInt_FromLong(1);
				3507	}
				3508
				3509	static char isnumeric__doc__[] =
				3510	"S.isnumeric() -> int\n\
				3511	\n\
				3512	Return 1 if there are only numeric characters in S,\n\
				3513	0 otherwise.";
				3514
				3515	static PyObject*
				3516	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3517	{
				3518	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3519	register const Py_UNICODE *e;
				3520
				3521	if (!PyArg_NoArgs(args))
				3522	return NULL;
				3523
				3524	/* Shortcut for single character strings */
				3525	if (PyUnicode_GET_SIZE(self) == 1 &&
				3526	Py_UNICODE_ISNUMERIC(*p))
				3527	return PyInt_FromLong(1);
				3528
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame^]	3529	/* Special case for empty strings */
				3530	if (PyString_GET_SIZE(self) == 0)
				3531	return PyInt_FromLong(0);
				3532
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3533	e = p + PyUnicode_GET_SIZE(self);
				3534	for (; p < e; p++) {
				3535	if (!Py_UNICODE_ISNUMERIC(*p))
				3536	return PyInt_FromLong(0);
				3537	}
				3538	return PyInt_FromLong(1);
				3539	}
				3540
				3541	static char join__doc__[] =
				3542	"S.join(sequence) -> unicode\n\
				3543	\n\
				3544	Return a string which is the concatenation of the strings in the\n\
				3545	sequence. The separator between elements is S.";
				3546
				3547	static PyObject*
				3548	unicode_join(PyUnicodeObject self, PyObject args)
				3549	{
				3550	PyObject *data;
				3551	if (!PyArg_ParseTuple(args, "O:join", &data))
				3552	return NULL;
				3553
				3554	return PyUnicode_Join((PyObject *)self, data);
				3555	}
				3556
				3557	static int
				3558	unicode_length(PyUnicodeObject *self)
				3559	{
				3560	return self->length;
				3561	}
				3562
				3563	static char ljust__doc__[] =
				3564	"S.ljust(width) -> unicode\n\
				3565	\n\
				3566	Return S left justified in a Unicode string of length width. Padding is\n\
				3567	done using spaces.";
				3568
				3569	static PyObject *
				3570	unicode_ljust(PyUnicodeObject self, PyObject args)
				3571	{
				3572	int width;
				3573	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3574	return NULL;
				3575
				3576	if (self->length >= width) {
				3577	Py_INCREF(self);
				3578	return (PyObject*) self;
				3579	}
				3580
				3581	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3582	}
				3583
				3584	static char lower__doc__[] =
				3585	"S.lower() -> unicode\n\
				3586	\n\
				3587	Return a copy of the string S converted to lowercase.";
				3588
				3589	static PyObject*
				3590	unicode_lower(PyUnicodeObject self, PyObject args)
				3591	{
				3592	if (!PyArg_NoArgs(args))
				3593	return NULL;
				3594	return fixup(self, fixlower);
				3595	}
				3596
				3597	static char lstrip__doc__[] =
				3598	"S.lstrip() -> unicode\n\
				3599	\n\
				3600	Return a copy of the string S with leading whitespace removed.";
				3601
				3602	static PyObject *
				3603	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3604	{
				3605	if (!PyArg_NoArgs(args))
				3606	return NULL;
				3607	return strip(self, 1, 0);
				3608	}
				3609
				3610	static PyObject*
				3611	unicode_repeat(PyUnicodeObject *str, int len)
				3612	{
				3613	PyUnicodeObject *u;
				3614	Py_UNICODE *p;
				3615
				3616	if (len < 0)
				3617	len = 0;
				3618
				3619	if (len == 1) {
				3620	/* no repeat, return original string */
				3621	Py_INCREF(str);
				3622	return (PyObject*) str;
				3623	}
				3624
				3625	u = _PyUnicode_New(len * str->length);
				3626	if (!u)
				3627	return NULL;
				3628
				3629	p = u->str;
				3630
				3631	while (len-- > 0) {
				3632	Py_UNICODE_COPY(p, str->str, str->length);
				3633	p += str->length;
				3634	}
				3635
				3636	return (PyObject*) u;
				3637	}
				3638
				3639	PyObject PyUnicode_Replace(PyObject obj,
				3640	PyObject *subobj,
				3641	PyObject *replobj,
				3642	int maxcount)
				3643	{
				3644	PyObject *self;
				3645	PyObject *str1;
				3646	PyObject *str2;
				3647	PyObject *result;
				3648
				3649	self = PyUnicode_FromObject(obj);
				3650	if (self == NULL)
				3651	return NULL;
				3652	str1 = PyUnicode_FromObject(subobj);
				3653	if (str1 == NULL) {
				3654	Py_DECREF(self);
				3655	return NULL;
				3656	}
				3657	str2 = PyUnicode_FromObject(replobj);
				3658	if (str2 == NULL) {
				3659	Py_DECREF(self);
				3660	Py_DECREF(str1);
				3661	return NULL;
				3662	}
				3663	result = replace((PyUnicodeObject *)self,
				3664	(PyUnicodeObject *)str1,
				3665	(PyUnicodeObject *)str2,
				3666	maxcount);
				3667	Py_DECREF(self);
				3668	Py_DECREF(str1);
				3669	Py_DECREF(str2);
				3670	return result;
				3671	}
				3672
				3673	static char replace__doc__[] =
				3674	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3675	\n\
				3676	Return a copy of S with all occurrences of substring\n\
				3677	old replaced by new. If the optional argument maxsplit is\n\
				3678	given, only the first maxsplit occurrences are replaced.";
				3679
				3680	static PyObject*
				3681	unicode_replace(PyUnicodeObject self, PyObject args)
				3682	{
				3683	PyUnicodeObject *str1;
				3684	PyUnicodeObject *str2;
				3685	int maxcount = -1;
				3686	PyObject *result;
				3687
				3688	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3689	return NULL;
				3690	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3691	if (str1 == NULL)
				3692	return NULL;
				3693	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3694	if (str2 == NULL)
				3695	return NULL;
				3696
				3697	result = replace(self, str1, str2, maxcount);
				3698
				3699	Py_DECREF(str1);
				3700	Py_DECREF(str2);
				3701	return result;
				3702	}
				3703
				3704	static
				3705	PyObject unicode_repr(PyObject unicode)
				3706	{
				3707	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3708	PyUnicode_GET_SIZE(unicode),
				3709	1);
				3710	}
				3711
				3712	static char rfind__doc__[] =
				3713	"S.rfind(sub [,start [,end]]) -> int\n\
				3714	\n\
				3715	Return the highest index in S where substring sub is found,\n\
				3716	such that sub is contained within s[start,end]. Optional\n\
				3717	arguments start and end are interpreted as in slice notation.\n\
				3718	\n\
				3719	Return -1 on failure.";
				3720
				3721	static PyObject *
				3722	unicode_rfind(PyUnicodeObject self, PyObject args)
				3723	{
				3724	PyUnicodeObject *substring;
				3725	int start = 0;
				3726	int end = INT_MAX;
				3727	PyObject *result;
				3728
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3729	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				3730	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3731	return NULL;
				3732	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3733	(PyObject *)substring);
				3734	if (substring == NULL)
				3735	return NULL;
				3736
				3737	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3738
				3739	Py_DECREF(substring);
				3740	return result;
				3741	}
				3742
				3743	static char rindex__doc__[] =
				3744	"S.rindex(sub [,start [,end]]) -> int\n\
				3745	\n\
				3746	Like S.rfind() but raise ValueError when the substring is not found.";
				3747
				3748	static PyObject *
				3749	unicode_rindex(PyUnicodeObject self, PyObject args)
				3750	{
				3751	int result;
				3752	PyUnicodeObject *substring;
				3753	int start = 0;
				3754	int end = INT_MAX;
				3755
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3756	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				3757	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3758	return NULL;
				3759	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3760	(PyObject *)substring);
				3761	if (substring == NULL)
				3762	return NULL;
				3763
				3764	result = findstring(self, substring, start, end, -1);
				3765
				3766	Py_DECREF(substring);
				3767	if (result < 0) {
				3768	PyErr_SetString(PyExc_ValueError, "substring not found");
				3769	return NULL;
				3770	}
				3771	return PyInt_FromLong(result);
				3772	}
				3773
				3774	static char rjust__doc__[] =
				3775	"S.rjust(width) -> unicode\n\
				3776	\n\
				3777	Return S right justified in a Unicode string of length width. Padding is\n\
				3778	done using spaces.";
				3779
				3780	static PyObject *
				3781	unicode_rjust(PyUnicodeObject self, PyObject args)
				3782	{
				3783	int width;
				3784	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3785	return NULL;
				3786
				3787	if (self->length >= width) {
				3788	Py_INCREF(self);
				3789	return (PyObject*) self;
				3790	}
				3791
				3792	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3793	}
				3794
				3795	static char rstrip__doc__[] =
				3796	"S.rstrip() -> unicode\n\
				3797	\n\
				3798	Return a copy of the string S with trailing whitespace removed.";
				3799
				3800	static PyObject *
				3801	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3802	{
				3803	if (!PyArg_NoArgs(args))
				3804	return NULL;
				3805	return strip(self, 0, 1);
				3806	}
				3807
				3808	static PyObject*
				3809	unicode_slice(PyUnicodeObject *self, int start, int end)
				3810	{
				3811	/* standard clamping */
				3812	if (start < 0)
				3813	start = 0;
				3814	if (end < 0)
				3815	end = 0;
				3816	if (end > self->length)
				3817	end = self->length;
				3818	if (start == 0 && end == self->length) {
				3819	/* full slice, return original string */
				3820	Py_INCREF(self);
				3821	return (PyObject*) self;
				3822	}
				3823	if (start > end)
				3824	start = end;
				3825	/* copy slice */
				3826	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3827	end - start);
				3828	}
				3829
				3830	PyObject PyUnicode_Split(PyObject s,
				3831	PyObject *sep,
				3832	int maxsplit)
				3833	{
				3834	PyObject *result;
				3835
				3836	s = PyUnicode_FromObject(s);
				3837	if (s == NULL)
				3838	return NULL;
				3839	if (sep != NULL) {
				3840	sep = PyUnicode_FromObject(sep);
				3841	if (sep == NULL) {
				3842	Py_DECREF(s);
				3843	return NULL;
				3844	}
				3845	}
				3846
				3847	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3848
				3849	Py_DECREF(s);
				3850	Py_XDECREF(sep);
				3851	return result;
				3852	}
				3853
				3854	static char split__doc__[] =
				3855	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3856	\n\
				3857	Return a list of the words in S, using sep as the\n\
				3858	delimiter string. If maxsplit is given, at most maxsplit\n\
				3859	splits are done. If sep is not specified, any whitespace string\n\
				3860	is a separator.";
				3861
				3862	static PyObject*
				3863	unicode_split(PyUnicodeObject self, PyObject args)
				3864	{
				3865	PyObject *substring = Py_None;
				3866	int maxcount = -1;
				3867
				3868	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3869	return NULL;
				3870
				3871	if (substring == Py_None)
				3872	return split(self, NULL, maxcount);
				3873	else if (PyUnicode_Check(substring))
				3874	return split(self, (PyUnicodeObject *)substring, maxcount);
				3875	else
				3876	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3877	}
				3878
				3879	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3880	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3881	\n\
				3882	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3883	Line breaks are not included in the resulting list unless keepends\n\
				3884	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3885
				3886	static PyObject*
				3887	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3888	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3889	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3890
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3891	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3892	return NULL;
				3893
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3894	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3895	}
				3896
				3897	static
				3898	PyObject unicode_str(PyUnicodeObject self)
				3899	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3900	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3901	}
				3902
				3903	static char strip__doc__[] =
				3904	"S.strip() -> unicode\n\
				3905	\n\
				3906	Return a copy of S with leading and trailing whitespace removed.";
				3907
				3908	static PyObject *
				3909	unicode_strip(PyUnicodeObject self, PyObject args)
				3910	{
				3911	if (!PyArg_NoArgs(args))
				3912	return NULL;
				3913	return strip(self, 1, 1);
				3914	}
				3915
				3916	static char swapcase__doc__[] =
				3917	"S.swapcase() -> unicode\n\
				3918	\n\
				3919	Return a copy of S with uppercase characters converted to lowercase\n\
				3920	and vice versa.";
				3921
				3922	static PyObject*
				3923	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3924	{
				3925	if (!PyArg_NoArgs(args))
				3926	return NULL;
				3927	return fixup(self, fixswapcase);
				3928	}
				3929
				3930	static char translate__doc__[] =
				3931	"S.translate(table) -> unicode\n\
				3932	\n\
				3933	Return a copy of the string S, where all characters have been mapped\n\
				3934	through the given translation table, which must be a mapping of\n\
				3935	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3936	are left untouched. Characters mapped to None are deleted.";
				3937
				3938	static PyObject*
				3939	unicode_translate(PyUnicodeObject self, PyObject args)
				3940	{
				3941	PyObject *table;
				3942
				3943	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3944	return NULL;
				3945	return PyUnicode_TranslateCharmap(self->str,
				3946	self->length,
				3947	table,
				3948	"ignore");
				3949	}
				3950
				3951	static char upper__doc__[] =
				3952	"S.upper() -> unicode\n\
				3953	\n\
				3954	Return a copy of S converted to uppercase.";
				3955
				3956	static PyObject*
				3957	unicode_upper(PyUnicodeObject self, PyObject args)
				3958	{
				3959	if (!PyArg_NoArgs(args))
				3960	return NULL;
				3961	return fixup(self, fixupper);
				3962	}
				3963
				3964	#if 0
				3965	static char zfill__doc__[] =
				3966	"S.zfill(width) -> unicode\n\
				3967	\n\
				3968	Pad a numeric string x with zeros on the left, to fill a field\n\
				3969	of the specified width. The string x is never truncated.";
				3970
				3971	static PyObject *
				3972	unicode_zfill(PyUnicodeObject self, PyObject args)
				3973	{
				3974	int fill;
				3975	PyUnicodeObject *u;
				3976
				3977	int width;
				3978	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				3979	return NULL;
				3980
				3981	if (self->length >= width) {
				3982	Py_INCREF(self);
				3983	return (PyObject*) self;
				3984	}
				3985
				3986	fill = width - self->length;
				3987
				3988	u = pad(self, fill, 0, '0');
				3989
				3990	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				3991	/* move sign to beginning of string */
				3992	u->str[0] = u->str[fill];
				3993	u->str[fill] = '0';
				3994	}
				3995
				3996	return (PyObject*) u;
				3997	}
				3998	#endif
				3999
				4000	#if 0
				4001	static PyObject*
				4002	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4003	{
				4004	if (!PyArg_NoArgs(args))
				4005	return NULL;
				4006	return PyInt_FromLong(unicode_freelist_size);
				4007	}
				4008	#endif
				4009
				4010	static char startswith__doc__[] =
				4011	"S.startswith(prefix[, start[, end]]) -> int\n\
				4012	\n\
				4013	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4014	optional start, test S beginning at that position. With optional end, stop\n\
				4015	comparing S at that position.";
				4016
				4017	static PyObject *
				4018	unicode_startswith(PyUnicodeObject *self,
				4019	PyObject *args)
				4020	{
				4021	PyUnicodeObject *substring;
				4022	int start = 0;
				4023	int end = INT_MAX;
				4024	PyObject *result;
				4025
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4026	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4027	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4028	return NULL;
				4029	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4030	(PyObject *)substring);
				4031	if (substring == NULL)
				4032	return NULL;
				4033
				4034	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4035
				4036	Py_DECREF(substring);
				4037	return result;
				4038	}
				4039
				4040
				4041	static char endswith__doc__[] =
				4042	"S.endswith(suffix[, start[, end]]) -> int\n\
				4043	\n\
				4044	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4045	optional start, test S beginning at that position. With optional end, stop\n\
				4046	comparing S at that position.";
				4047
				4048	static PyObject *
				4049	unicode_endswith(PyUnicodeObject *self,
				4050	PyObject *args)
				4051	{
				4052	PyUnicodeObject *substring;
				4053	int start = 0;
				4054	int end = INT_MAX;
				4055	PyObject *result;
				4056
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4057	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4058	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4059	return NULL;
				4060	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4061	(PyObject *)substring);
				4062	if (substring == NULL)
				4063	return NULL;
				4064
				4065	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4066
				4067	Py_DECREF(substring);
				4068	return result;
				4069	}
				4070
				4071
				4072	static PyMethodDef unicode_methods[] = {
				4073
				4074	/* Order is according to common usage: often used methods should
				4075	appear first, since lookup is done sequentially. */
				4076
				4077	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4078	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4079	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4080	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4081	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4082	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4083	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4084	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4085	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4086	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4087	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4088	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4089	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4090	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4091	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4092	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4093	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4094	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4095	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4096	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4097	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4098	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4099	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4100	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4101	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4102	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4103	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4104	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4105	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4106	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4107	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4108	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4109	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				4110	#if 0
				4111	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4112	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4113	#endif
				4114
				4115	#if 0
				4116	/* This one is just used for debugging the implementation. */
				4117	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4118	#endif
				4119
				4120	{NULL, NULL}
				4121	};
				4122
				4123	static PyObject *
				4124	unicode_getattr(PyUnicodeObject self, char name)
				4125	{
				4126	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4127	}
				4128
				4129	static PySequenceMethods unicode_as_sequence = {
				4130	(inquiry) unicode_length, /* sq_length */
				4131	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4132	(intargfunc) unicode_repeat, /* sq_repeat */
				4133	(intargfunc) unicode_getitem, /* sq_item */
				4134	(intintargfunc) unicode_slice, /* sq_slice */
				4135	0, /* sq_ass_item */
				4136	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4137	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4138	};
				4139
				4140	static int
				4141	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4142	int index,
				4143	const void **ptr)
				4144	{
				4145	if (index != 0) {
				4146	PyErr_SetString(PyExc_SystemError,
				4147	"accessing non-existent unicode segment");
				4148	return -1;
				4149	}
				4150	ptr = (void ) self->str;
				4151	return PyUnicode_GET_DATA_SIZE(self);
				4152	}
				4153
				4154	static int
				4155	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4156	const void **ptr)
				4157	{
				4158	PyErr_SetString(PyExc_TypeError,
				4159	"cannot use unicode as modifyable buffer");
				4160	return -1;
				4161	}
				4162
				4163	static int
				4164	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4165	int *lenp)
				4166	{
				4167	if (lenp)
				4168	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4169	return 1;
				4170	}
				4171
				4172	static int
				4173	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4174	int index,
				4175	const void **ptr)
				4176	{
				4177	PyObject *str;
				4178
				4179	if (index != 0) {
				4180	PyErr_SetString(PyExc_SystemError,
				4181	"accessing non-existent unicode segment");
				4182	return -1;
				4183	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4184	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4185	if (str == NULL)
				4186	return -1;
				4187	ptr = (void ) PyString_AS_STRING(str);
				4188	return PyString_GET_SIZE(str);
				4189	}
				4190
				4191	/* Helpers for PyUnicode_Format() */
				4192
				4193	static PyObject *
				4194	getnextarg(args, arglen, p_argidx)
				4195	PyObject *args;
				4196	int arglen;
				4197	int *p_argidx;
				4198	{
				4199	int argidx = *p_argidx;
				4200	if (argidx < arglen) {
				4201	(*p_argidx)++;
				4202	if (arglen < 0)
				4203	return args;
				4204	else
				4205	return PyTuple_GetItem(args, argidx);
				4206	}
				4207	PyErr_SetString(PyExc_TypeError,
				4208	"not enough arguments for format string");
				4209	return NULL;
				4210	}
				4211
				4212	#define F_LJUST (1<<0)
				4213	#define F_SIGN (1<<1)
				4214	#define F_BLANK (1<<2)
				4215	#define F_ALT (1<<3)
				4216	#define F_ZERO (1<<4)
				4217
				4218	static
				4219	#ifdef HAVE_STDARG_PROTOTYPES
				4220	int usprintf(register Py_UNICODE buffer, char format, ...)
				4221	#else
				4222	int usprintf(va_alist) va_dcl
				4223	#endif
				4224	{
				4225	register int i;
				4226	int len;
				4227	va_list va;
				4228	char *charbuffer;
				4229	#ifdef HAVE_STDARG_PROTOTYPES
				4230	va_start(va, format);
				4231	#else
				4232	Py_UNICODE *args;
				4233	char *format;
				4234
				4235	va_start(va);
				4236	buffer = va_arg(va, Py_UNICODE *);
				4237	format = va_arg(va, char *);
				4238	#endif
				4239
				4240	/* First, format the string as char array, then expand to Py_UNICODE
				4241	array. */
				4242	charbuffer = (char *)buffer;
				4243	len = vsprintf(charbuffer, format, va);
				4244	for (i = len - 1; i >= 0; i--)
				4245	buffer[i] = (Py_UNICODE) charbuffer[i];
				4246
				4247	va_end(va);
				4248	return len;
				4249	}
				4250
				4251	static int
				4252	formatfloat(Py_UNICODE *buf,
				4253	int flags,
				4254	int prec,
				4255	int type,
				4256	PyObject *v)
				4257	{
				4258	char fmt[20];
				4259	double x;
				4260
				4261	x = PyFloat_AsDouble(v);
				4262	if (x == -1.0 && PyErr_Occurred())
				4263	return -1;
				4264	if (prec < 0)
				4265	prec = 6;
				4266	if (prec > 50)
				4267	prec = 50; /* Arbitrary limitation */
				4268	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4269	type = 'g';
				4270	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4271	return usprintf(buf, fmt, x);
				4272	}
				4273
				4274	static int
				4275	formatint(Py_UNICODE *buf,
				4276	int flags,
				4277	int prec,
				4278	int type,
				4279	PyObject *v)
				4280	{
				4281	char fmt[20];
				4282	long x;
				4283
				4284	x = PyInt_AsLong(v);
				4285	if (x == -1 && PyErr_Occurred())
				4286	return -1;
				4287	if (prec < 0)
				4288	prec = 1;
				4289	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4290	return usprintf(buf, fmt, x);
				4291	}
				4292
				4293	static int
				4294	formatchar(Py_UNICODE *buf,
				4295	PyObject *v)
				4296	{
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4297	if (PyUnicode_Check(v)) {
				4298	if (PyUnicode_GET_SIZE(v) != 1)
				4299	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4300	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4301	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4302
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4303	else if (PyString_Check(v)) {
				4304	if (PyString_GET_SIZE(v) != 1)
				4305	goto onError;
				4306	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4307	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4308
				4309	else {
				4310	/* Integer input truncated to a character */
				4311	long x;
				4312	x = PyInt_AsLong(v);
				4313	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4314	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4315	buf[0] = (char) x;
				4316	}
				4317	buf[1] = '\0';
				4318	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4319
				4320	onError:
				4321	PyErr_SetString(PyExc_TypeError,
				4322	"%c requires int or char");
				4323	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4324	}
				4325
				4326	PyObject PyUnicode_Format(PyObject format,
				4327	PyObject *args)
				4328	{
				4329	Py_UNICODE fmt, res;
				4330	int fmtcnt, rescnt, reslen, arglen, argidx;
				4331	int args_owned = 0;
				4332	PyUnicodeObject *result = NULL;
				4333	PyObject *dict = NULL;
				4334	PyObject *uformat;
				4335
				4336	if (format == NULL \|\| args == NULL) {
				4337	PyErr_BadInternalCall();
				4338	return NULL;
				4339	}
				4340	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4341	if (uformat == NULL)
				4342	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4343	fmt = PyUnicode_AS_UNICODE(uformat);
				4344	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4345
				4346	reslen = rescnt = fmtcnt + 100;
				4347	result = _PyUnicode_New(reslen);
				4348	if (result == NULL)
				4349	goto onError;
				4350	res = PyUnicode_AS_UNICODE(result);
				4351
				4352	if (PyTuple_Check(args)) {
				4353	arglen = PyTuple_Size(args);
				4354	argidx = 0;
				4355	}
				4356	else {
				4357	arglen = -1;
				4358	argidx = -2;
				4359	}
				4360	if (args->ob_type->tp_as_mapping)
				4361	dict = args;
				4362
				4363	while (--fmtcnt >= 0) {
				4364	if (*fmt != '%') {
				4365	if (--rescnt < 0) {
				4366	rescnt = fmtcnt + 100;
				4367	reslen += rescnt;
				4368	if (_PyUnicode_Resize(result, reslen) < 0)
				4369	return NULL;
				4370	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4371	--rescnt;
				4372	}
				4373	res++ = fmt++;
				4374	}
				4375	else {
				4376	/* Got a format specifier */
				4377	int flags = 0;
				4378	int width = -1;
				4379	int prec = -1;
				4380	int size = 0;
				4381	Py_UNICODE c = '\0';
				4382	Py_UNICODE fill;
				4383	PyObject *v = NULL;
				4384	PyObject *temp = NULL;
				4385	Py_UNICODE *buf;
				4386	Py_UNICODE sign;
				4387	int len;
				4388	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4389
				4390	fmt++;
				4391	if (*fmt == '(') {
				4392	Py_UNICODE *keystart;
				4393	int keylen;
				4394	PyObject *key;
				4395	int pcount = 1;
				4396
				4397	if (dict == NULL) {
				4398	PyErr_SetString(PyExc_TypeError,
				4399	"format requires a mapping");
				4400	goto onError;
				4401	}
				4402	++fmt;
				4403	--fmtcnt;
				4404	keystart = fmt;
				4405	/* Skip over balanced parentheses */
				4406	while (pcount > 0 && --fmtcnt >= 0) {
				4407	if (*fmt == ')')
				4408	--pcount;
				4409	else if (*fmt == '(')
				4410	++pcount;
				4411	fmt++;
				4412	}
				4413	keylen = fmt - keystart - 1;
				4414	if (fmtcnt < 0 \|\| pcount > 0) {
				4415	PyErr_SetString(PyExc_ValueError,
				4416	"incomplete format key");
				4417	goto onError;
				4418	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4419	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4420	then looked up since Python uses strings to hold
				4421	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4422	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4423	key = PyUnicode_EncodeUTF8(keystart,
				4424	keylen,
				4425	NULL);
				4426	if (key == NULL)
				4427	goto onError;
				4428	if (args_owned) {
				4429	Py_DECREF(args);
				4430	args_owned = 0;
				4431	}
				4432	args = PyObject_GetItem(dict, key);
				4433	Py_DECREF(key);
				4434	if (args == NULL) {
				4435	goto onError;
				4436	}
				4437	args_owned = 1;
				4438	arglen = -1;
				4439	argidx = -2;
				4440	}
				4441	while (--fmtcnt >= 0) {
				4442	switch (c = *fmt++) {
				4443	case '-': flags \|= F_LJUST; continue;
				4444	case '+': flags \|= F_SIGN; continue;
				4445	case ' ': flags \|= F_BLANK; continue;
				4446	case '#': flags \|= F_ALT; continue;
				4447	case '0': flags \|= F_ZERO; continue;
				4448	}
				4449	break;
				4450	}
				4451	if (c == '*') {
				4452	v = getnextarg(args, arglen, &argidx);
				4453	if (v == NULL)
				4454	goto onError;
				4455	if (!PyInt_Check(v)) {
				4456	PyErr_SetString(PyExc_TypeError,
				4457	"* wants int");
				4458	goto onError;
				4459	}
				4460	width = PyInt_AsLong(v);
				4461	if (width < 0) {
				4462	flags \|= F_LJUST;
				4463	width = -width;
				4464	}
				4465	if (--fmtcnt >= 0)
				4466	c = *fmt++;
				4467	}
				4468	else if (c >= '0' && c <= '9') {
				4469	width = c - '0';
				4470	while (--fmtcnt >= 0) {
				4471	c = *fmt++;
				4472	if (c < '0' \|\| c > '9')
				4473	break;
				4474	if ((width*10) / 10 != width) {
				4475	PyErr_SetString(PyExc_ValueError,
				4476	"width too big");
				4477	goto onError;
				4478	}
				4479	width = width*10 + (c - '0');
				4480	}
				4481	}
				4482	if (c == '.') {
				4483	prec = 0;
				4484	if (--fmtcnt >= 0)
				4485	c = *fmt++;
				4486	if (c == '*') {
				4487	v = getnextarg(args, arglen, &argidx);
				4488	if (v == NULL)
				4489	goto onError;
				4490	if (!PyInt_Check(v)) {
				4491	PyErr_SetString(PyExc_TypeError,
				4492	"* wants int");
				4493	goto onError;
				4494	}
				4495	prec = PyInt_AsLong(v);
				4496	if (prec < 0)
				4497	prec = 0;
				4498	if (--fmtcnt >= 0)
				4499	c = *fmt++;
				4500	}
				4501	else if (c >= '0' && c <= '9') {
				4502	prec = c - '0';
				4503	while (--fmtcnt >= 0) {
				4504	c = Py_CHARMASK(*fmt++);
				4505	if (c < '0' \|\| c > '9')
				4506	break;
				4507	if ((prec*10) / 10 != prec) {
				4508	PyErr_SetString(PyExc_ValueError,
				4509	"prec too big");
				4510	goto onError;
				4511	}
				4512	prec = prec*10 + (c - '0');
				4513	}
				4514	}
				4515	} /* prec */
				4516	if (fmtcnt >= 0) {
				4517	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4518	size = c;
				4519	if (--fmtcnt >= 0)
				4520	c = *fmt++;
				4521	}
				4522	}
				4523	if (fmtcnt < 0) {
				4524	PyErr_SetString(PyExc_ValueError,
				4525	"incomplete format");
				4526	goto onError;
				4527	}
				4528	if (c != '%') {
				4529	v = getnextarg(args, arglen, &argidx);
				4530	if (v == NULL)
				4531	goto onError;
				4532	}
				4533	sign = 0;
				4534	fill = ' ';
				4535	switch (c) {
				4536
				4537	case '%':
				4538	buf = tmpbuf;
				4539	buf[0] = '%';
				4540	len = 1;
				4541	break;
				4542
				4543	case 's':
				4544	case 'r':
				4545	if (PyUnicode_Check(v) && c == 's') {
				4546	temp = v;
				4547	Py_INCREF(temp);
				4548	}
				4549	else {
				4550	PyObject *unicode;
				4551	if (c == 's')
				4552	temp = PyObject_Str(v);
				4553	else
				4554	temp = PyObject_Repr(v);
				4555	if (temp == NULL)
				4556	goto onError;
				4557	if (!PyString_Check(temp)) {
				4558	/* XXX Note: this should never happen, since
				4559	PyObject_Repr() and PyObject_Str() assure
				4560	this */
				4561	Py_DECREF(temp);
				4562	PyErr_SetString(PyExc_TypeError,
				4563	"%s argument has non-string str()");
				4564	goto onError;
				4565	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4566	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4567	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4568	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4569	"strict");
				4570	Py_DECREF(temp);
				4571	temp = unicode;
				4572	if (temp == NULL)
				4573	goto onError;
				4574	}
				4575	buf = PyUnicode_AS_UNICODE(temp);
				4576	len = PyUnicode_GET_SIZE(temp);
				4577	if (prec >= 0 && len > prec)
				4578	len = prec;
				4579	break;
				4580
				4581	case 'i':
				4582	case 'd':
				4583	case 'u':
				4584	case 'o':
				4585	case 'x':
				4586	case 'X':
				4587	if (c == 'i')
				4588	c = 'd';
				4589	buf = tmpbuf;
				4590	len = formatint(buf, flags, prec, c, v);
				4591	if (len < 0)
				4592	goto onError;
				4593	sign = (c == 'd');
				4594	if (flags & F_ZERO) {
				4595	fill = '0';
				4596	if ((flags&F_ALT) &&
				4597	(c == 'x' \|\| c == 'X') &&
				4598	buf[0] == '0' && buf[1] == c) {
				4599	res++ = buf++;
				4600	res++ = buf++;
				4601	rescnt -= 2;
				4602	len -= 2;
				4603	width -= 2;
				4604	if (width < 0)
				4605	width = 0;
				4606	}
				4607	}
				4608	break;
				4609
				4610	case 'e':
				4611	case 'E':
				4612	case 'f':
				4613	case 'g':
				4614	case 'G':
				4615	buf = tmpbuf;
				4616	len = formatfloat(buf, flags, prec, c, v);
				4617	if (len < 0)
				4618	goto onError;
				4619	sign = 1;
				4620	if (flags&F_ZERO)
				4621	fill = '0';
				4622	break;
				4623
				4624	case 'c':
				4625	buf = tmpbuf;
				4626	len = formatchar(buf, v);
				4627	if (len < 0)
				4628	goto onError;
				4629	break;
				4630
				4631	default:
				4632	PyErr_Format(PyExc_ValueError,
				4633	"unsupported format character '%c' (0x%x)",
				4634	c, c);
				4635	goto onError;
				4636	}
				4637	if (sign) {
				4638	if (buf == '-' \|\| buf == '+') {
				4639	sign = *buf++;
				4640	len--;
				4641	}
				4642	else if (flags & F_SIGN)
				4643	sign = '+';
				4644	else if (flags & F_BLANK)
				4645	sign = ' ';
				4646	else
				4647	sign = 0;
				4648	}
				4649	if (width < len)
				4650	width = len;
				4651	if (rescnt < width + (sign != 0)) {
				4652	reslen -= rescnt;
				4653	rescnt = width + fmtcnt + 100;
				4654	reslen += rescnt;
				4655	if (_PyUnicode_Resize(result, reslen) < 0)
				4656	return NULL;
				4657	res = PyUnicode_AS_UNICODE(result)
				4658	+ reslen - rescnt;
				4659	}
				4660	if (sign) {
				4661	if (fill != ' ')
				4662	*res++ = sign;
				4663	rescnt--;
				4664	if (width > len)
				4665	width--;
				4666	}
				4667	if (width > len && !(flags & F_LJUST)) {
				4668	do {
				4669	--rescnt;
				4670	*res++ = fill;
				4671	} while (--width > len);
				4672	}
				4673	if (sign && fill == ' ')
				4674	*res++ = sign;
				4675	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4676	res += len;
				4677	rescnt -= len;
				4678	while (--width >= len) {
				4679	--rescnt;
				4680	*res++ = ' ';
				4681	}
				4682	if (dict && (argidx < arglen) && c != '%') {
				4683	PyErr_SetString(PyExc_TypeError,
				4684	"not all arguments converted");
				4685	goto onError;
				4686	}
				4687	Py_XDECREF(temp);
				4688	} /* '%' */
				4689	} /* until end */
				4690	if (argidx < arglen && !dict) {
				4691	PyErr_SetString(PyExc_TypeError,
				4692	"not all arguments converted");
				4693	goto onError;
				4694	}
				4695
				4696	if (args_owned) {
				4697	Py_DECREF(args);
				4698	}
				4699	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4700	if (_PyUnicode_Resize(result, reslen - rescnt))
				4701	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4702	return (PyObject *)result;
				4703
				4704	onError:
				4705	Py_XDECREF(result);
				4706	Py_DECREF(uformat);
				4707	if (args_owned) {
				4708	Py_DECREF(args);
				4709	}
				4710	return NULL;
				4711	}
				4712
				4713	static PyBufferProcs unicode_as_buffer = {
				4714	(getreadbufferproc) unicode_buffer_getreadbuf,
				4715	(getwritebufferproc) unicode_buffer_getwritebuf,
				4716	(getsegcountproc) unicode_buffer_getsegcount,
				4717	(getcharbufferproc) unicode_buffer_getcharbuf,
				4718	};
				4719
				4720	PyTypeObject PyUnicode_Type = {
				4721	PyObject_HEAD_INIT(&PyType_Type)
				4722	0, /* ob_size */
				4723	"unicode", /* tp_name */
				4724	sizeof(PyUnicodeObject), /* tp_size */
				4725	0, /* tp_itemsize */
				4726	/* Slots */
				4727	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4728	0, /* tp_print */
				4729	(getattrfunc)unicode_getattr, /* tp_getattr */
				4730	0, /* tp_setattr */
				4731	(cmpfunc) unicode_compare, /* tp_compare */
				4732	(reprfunc) unicode_repr, /* tp_repr */
				4733	0, /* tp_as_number */
				4734	&unicode_as_sequence, /* tp_as_sequence */
				4735	0, /* tp_as_mapping */
				4736	(hashfunc) unicode_hash, /* tp_hash*/
				4737	0, /* tp_call*/
				4738	(reprfunc) unicode_str, /* tp_str */
				4739	(getattrofunc) NULL, /* tp_getattro */
				4740	(setattrofunc) NULL, /* tp_setattro */
				4741	&unicode_as_buffer, /* tp_as_buffer */
				4742	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4743	};
				4744
				4745	/* Initialize the Unicode implementation */
				4746
				4747	void _PyUnicode_Init()
				4748	{
				4749	/* Doublecheck the configuration... */
				4750	if (sizeof(Py_UNICODE) != 2)
				4751	Py_FatalError("Unicode configuration error: "
				4752	"sizeof(Py_UNICODE) != 2 bytes");
				4753
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4754	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4755	unicode_freelist = NULL;
				4756	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4757	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	4758	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4759	}
				4760
				4761	/* Finalize the Unicode implementation */
				4762
				4763	void
				4764	_PyUnicode_Fini()
				4765	{
				4766	PyUnicodeObject *u = unicode_freelist;
				4767
				4768	while (u != NULL) {
				4769	PyUnicodeObject *v = u;
				4770	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4771	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4772	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4773	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4774	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4775	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4776	unicode_freelist = NULL;
				4777	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4778	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4779	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4780	}