Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 3157cd89c516aa0339efba516ba62cfd97744eae [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
				7	(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
				8
				9
				10	Original header:
				11	--------------------------------------------------------------------
				12
				13	* Yet another Unicode string type for Python. This type supports the
				14	* 16-bit Basic Multilingual Plane (BMP) only.
				15	*
				16	* Note that this string class supports embedded NULL characters. End
				17	* of string is given by the length attribute. However, the internal
				18	* representation always stores a trailing NULL to make it easier to
				19	* use unicode strings with standard APIs.
				20	*
				21	* History:
				22	* 1999-01-23 fl Created
				23	* 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
				24	* 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
				25	* 1999-03-06 fl Moved declarations to separate file, etc.
				26	* 1999-06-13 fl Changed join method semantics according to Tim's proposal
				27	* 1999-08-10 fl Some minor tweaks
				28	*
				29	* Written by Fredrik Lundh, January 1999.
				30	*
				31	* Copyright (c) 1999 by Secret Labs AB.
				32	* Copyright (c) 1999 by Fredrik Lundh.
				33	*
				34	* fredrik@pythonware.com
				35	* http://www.pythonware.com
				36	*
				37	* --------------------------------------------------------------------
				38	* This Unicode String Type is
				39	*
				40	* Copyright (c) 1999 by Secret Labs AB
				41	* Copyright (c) 1999 by Fredrik Lundh
				42	*
				43	* By obtaining, using, and/or copying this software and/or its
				44	* associated documentation, you agree that you have read, understood,
				45	* and will comply with the following terms and conditions:
				46	*
				47	* Permission to use, copy, modify, and distribute this software and its
				48	* associated documentation for any purpose and without fee is hereby
				49	* granted, provided that the above copyright notice appears in all
				50	* copies, and that both that copyright notice and this permission notice
				51	* appear in supporting documentation, and that the name of Secret Labs
				52	* AB or the author not be used in advertising or publicity pertaining to
				53	* distribution of the software without specific, written prior
				54	* permission.
				55	*
				56	* SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				57	* THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				58	* FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				59	* ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				60	* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				61	* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				62	* OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				63	* -------------------------------------------------------------------- */
				64
				65	#include "Python.h"
				66
				67	#include "mymath.h"
				68	#include "unicodeobject.h"
				69
				70	#if defined(HAVE_LIMITS_H)
				71	#include <limits.h>
				72	#else
				73	#define INT_MAX 2147483647
				74	#endif
				75
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	76	#ifdef MS_WIN32
				77	#include <windows.h>
				78	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	79
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	80	/* Limit for the Unicode object free list */
				81
				82	#define MAX_UNICODE_FREELIST_SIZE 1024
				83
				84	/* Limit for the Unicode object free list stay alive optimization.
				85
				86	The implementation will keep allocated Unicode memory intact for
				87	all objects on the free list having a size less than this
				88	limit. This reduces malloc() overhead for small Unicode objects.
				89
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	90	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	91	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	92	malloc()-overhead) bytes of unused garbage.
				93
				94	Setting the limit to 0 effectively turns the feature off.
				95
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	96	Note: This is an experimental feature ! If you get core dumps when
				97	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	98
				99	*/
				100
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	101	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	102
				103	/* Endianness switches; defaults to little endian */
				104
				105	#ifdef WORDS_BIGENDIAN
				106	# define BYTEORDER_IS_BIG_ENDIAN
				107	#else
				108	# define BYTEORDER_IS_LITTLE_ENDIAN
				109	#endif
				110
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	111	/* --- Globals ------------------------------------------------------------
				112
				113	The globals are initialized by the _PyUnicode_Init() API and should
				114	not be used before calling that API.
				115
				116	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
				118	/* The empty Unicode object */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	119	static PyUnicodeObject *unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	120
				121	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	122	static PyUnicodeObject *unicode_freelist;
				123	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	124
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	125	/* Default encoding to use and assume when NULL is passed as encoding
				126	parameter; it is initialized by _PyUnicode_Init().
				127
				128	Always use the PyUnicode_SetDefaultEncoding() and
				129	PyUnicode_GetDefaultEncoding() APIs to access this global.
				130
				131	*/
				132
				133	static char unicode_default_encoding[100];
				134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	135	/* --- Unicode Object ----------------------------------------------------- */
				136
				137	static
				138	int _PyUnicode_Resize(register PyUnicodeObject *unicode,
				139	int length)
				140	{
				141	void *oldstr;
				142
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	143	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	144	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	145	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	146
				147	/* Resizing unicode_empty is not allowed. */
				148	if (unicode == unicode_empty) {
				149	PyErr_SetString(PyExc_SystemError,
				150	"can't resize empty unicode object");
				151	return -1;
				152	}
				153
				154	/* We allocate one more byte to make sure the string is
				155	Ux0000 terminated -- XXX is this needed ? */
				156	oldstr = unicode->str;
				157	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				158	if (!unicode->str) {
				159	unicode->str = oldstr;
				160	PyErr_NoMemory();
				161	return -1;
				162	}
				163	unicode->str[length] = 0;
				164	unicode->length = length;
				165
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	166	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	167	/* Reset the object caches */
				168	if (unicode->utf8str) {
				169	Py_DECREF(unicode->utf8str);
				170	unicode->utf8str = NULL;
				171	}
				172	unicode->hash = -1;
				173
				174	return 0;
				175	}
				176
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	177	int PyUnicode_Resize(PyObject **unicode,
				178	int length)
				179	{
				180	PyUnicodeObject *v;
				181
				182	if (unicode == NULL) {
				183	PyErr_BadInternalCall();
				184	return -1;
				185	}
				186	v = (PyUnicodeObject )unicode;
				187	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				188	PyErr_BadInternalCall();
				189	return -1;
				190	}
				191	return _PyUnicode_Resize(v, length);
				192	}
				193
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	194	/* We allocate one more byte to make sure the string is
				195	Ux0000 terminated -- XXX is this needed ?
				196
				197	XXX This allocator could further be enhanced by assuring that the
				198	free list never reduces its size below 1.
				199
				200	*/
				201
				202	static
				203	PyUnicodeObject *_PyUnicode_New(int length)
				204	{
				205	register PyUnicodeObject *unicode;
				206
				207	/* Optimization for empty strings */
				208	if (length == 0 && unicode_empty != NULL) {
				209	Py_INCREF(unicode_empty);
				210	return unicode_empty;
				211	}
				212
				213	/* Unicode freelist & memory allocation */
				214	if (unicode_freelist) {
				215	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	216	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	217	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	219	/* Keep-Alive optimization: we only upsize the buffer,
				220	never downsize it. */
				221	if ((unicode->length < length) &&
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	222	_PyUnicode_Resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	223	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	224	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	225	}
				226	}
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	227	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	229	}
				230	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	231	}
				232	else {
				233	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				234	if (unicode == NULL)
				235	return NULL;
				236	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				237	}
				238
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	239	if (!unicode->str) {
				240	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	241	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	243	unicode->str[length] = 0;
				244	unicode->length = length;
				245	unicode->hash = -1;
				246	unicode->utf8str = NULL;
				247	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	248
				249	onError:
				250	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	251	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	252	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254
				255	static
				256	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				257	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	258	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	259	/* Keep-Alive optimization */
				260	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	261	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	262	unicode->str = NULL;
				263	unicode->length = 0;
				264	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	265	if (unicode->utf8str) {
				266	Py_DECREF(unicode->utf8str);
				267	unicode->utf8str = NULL;
				268	}
				269	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	270	(PyUnicodeObject *)unicode = unicode_freelist;
				271	unicode_freelist = unicode;
				272	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	273	}
				274	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	275	PyMem_DEL(unicode->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	276	Py_XDECREF(unicode->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	277	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	278	}
				279	}
				280
				281	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				282	int size)
				283	{
				284	PyUnicodeObject *unicode;
				285
				286	unicode = _PyUnicode_New(size);
				287	if (!unicode)
				288	return NULL;
				289
				290	/* Copy the Unicode data into the new object */
				291	if (u != NULL)
				292	memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
				293
				294	return (PyObject *)unicode;
				295	}
				296
				297	#ifdef HAVE_WCHAR_H
				298
				299	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				300	int size)
				301	{
				302	PyUnicodeObject *unicode;
				303
				304	if (w == NULL) {
				305	PyErr_BadInternalCall();
				306	return NULL;
				307	}
				308
				309	unicode = _PyUnicode_New(size);
				310	if (!unicode)
				311	return NULL;
				312
				313	/* Copy the wchar_t data into the new object */
				314	#ifdef HAVE_USABLE_WCHAR_T
				315	memcpy(unicode->str, w, size * sizeof(wchar_t));
				316	#else
				317	{
				318	register Py_UNICODE *u;
				319	register int i;
				320	u = PyUnicode_AS_UNICODE(unicode);
				321	for (i = size; i >= 0; i--)
				322	u++ = w++;
				323	}
				324	#endif
				325
				326	return (PyObject *)unicode;
				327	}
				328
				329	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				330	register wchar_t *w,
				331	int size)
				332	{
				333	if (unicode == NULL) {
				334	PyErr_BadInternalCall();
				335	return -1;
				336	}
				337	if (size > PyUnicode_GET_SIZE(unicode))
				338	size = PyUnicode_GET_SIZE(unicode);
				339	#ifdef HAVE_USABLE_WCHAR_T
				340	memcpy(w, unicode->str, size * sizeof(wchar_t));
				341	#else
				342	{
				343	register Py_UNICODE *u;
				344	register int i;
				345	u = PyUnicode_AS_UNICODE(unicode);
				346	for (i = size; i >= 0; i--)
				347	w++ = u++;
				348	}
				349	#endif
				350
				351	return size;
				352	}
				353
				354	#endif
				355
				356	PyObject PyUnicode_FromObject(register PyObject obj)
				357	{
				358	const char *s;
				359	int len;
				360
				361	if (obj == NULL) {
				362	PyErr_BadInternalCall();
				363	return NULL;
				364	}
				365	else if (PyUnicode_Check(obj)) {
				366	Py_INCREF(obj);
				367	return obj;
				368	}
				369	else if (PyString_Check(obj)) {
				370	s = PyString_AS_STRING(obj);
				371	len = PyString_GET_SIZE(obj);
				372	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	373	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				374	/* Overwrite the error message with something more useful in
				375	case of a TypeError. */
				376	if (PyErr_ExceptionMatches(PyExc_TypeError))
				377	PyErr_SetString(PyExc_TypeError,
				378	"coercing to Unicode: need string or charbuffer");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	379	return NULL;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	380	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	381	if (len == 0) {
				382	Py_INCREF(unicode_empty);
				383	return (PyObject *)unicode_empty;
				384	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	385	return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	386	}
				387
				388	PyObject PyUnicode_Decode(const char s,
				389	int size,
				390	const char *encoding,
				391	const char *errors)
				392	{
				393	PyObject buffer = NULL, unicode;
				394
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	395	if (encoding == NULL)
				396	encoding = PyUnicode_GetDefaultEncoding();
				397
				398	/* Shortcuts for common default encodings */
				399	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	400	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	401	else if (strcmp(encoding, "latin-1") == 0)
				402	return PyUnicode_DecodeLatin1(s, size, errors);
				403	else if (strcmp(encoding, "ascii") == 0)
				404	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	405
				406	/* Decode via the codec registry */
				407	buffer = PyBuffer_FromMemory((void *)s, size);
				408	if (buffer == NULL)
				409	goto onError;
				410	unicode = PyCodec_Decode(buffer, encoding, errors);
				411	if (unicode == NULL)
				412	goto onError;
				413	if (!PyUnicode_Check(unicode)) {
				414	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	415	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	416	unicode->ob_type->tp_name);
				417	Py_DECREF(unicode);
				418	goto onError;
				419	}
				420	Py_DECREF(buffer);
				421	return unicode;
				422
				423	onError:
				424	Py_XDECREF(buffer);
				425	return NULL;
				426	}
				427
				428	PyObject PyUnicode_Encode(const Py_UNICODE s,
				429	int size,
				430	const char *encoding,
				431	const char *errors)
				432	{
				433	PyObject v, unicode;
				434
				435	unicode = PyUnicode_FromUnicode(s, size);
				436	if (unicode == NULL)
				437	return NULL;
				438	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				439	Py_DECREF(unicode);
				440	return v;
				441	}
				442
				443	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				444	const char *encoding,
				445	const char *errors)
				446	{
				447	PyObject *v;
				448
				449	if (!PyUnicode_Check(unicode)) {
				450	PyErr_BadArgument();
				451	goto onError;
				452	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	453
				454	if (encoding == NULL)
				455	encoding = PyUnicode_GetDefaultEncoding();
				456
				457	/* Shortcuts for common default encodings */
				458	if (errors == NULL) {
				459	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	460	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	461	else if (strcmp(encoding, "latin-1") == 0)
				462	return PyUnicode_AsLatin1String(unicode);
				463	else if (strcmp(encoding, "ascii") == 0)
				464	return PyUnicode_AsASCIIString(unicode);
				465	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	466
				467	/* Encode via the codec registry */
				468	v = PyCodec_Encode(unicode, encoding, errors);
				469	if (v == NULL)
				470	goto onError;
				471	/* XXX Should we really enforce this ? */
				472	if (!PyString_Check(v)) {
				473	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	474	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	475	v->ob_type->tp_name);
				476	Py_DECREF(v);
				477	goto onError;
				478	}
				479	return v;
				480
				481	onError:
				482	return NULL;
				483	}
				484
				485	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				486	{
				487	if (!PyUnicode_Check(unicode)) {
				488	PyErr_BadArgument();
				489	goto onError;
				490	}
				491	return PyUnicode_AS_UNICODE(unicode);
				492
				493	onError:
				494	return NULL;
				495	}
				496
				497	int PyUnicode_GetSize(PyObject *unicode)
				498	{
				499	if (!PyUnicode_Check(unicode)) {
				500	PyErr_BadArgument();
				501	goto onError;
				502	}
				503	return PyUnicode_GET_SIZE(unicode);
				504
				505	onError:
				506	return -1;
				507	}
				508
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	509	const char *PyUnicode_GetDefaultEncoding()
				510	{
				511	return unicode_default_encoding;
				512	}
				513
				514	int PyUnicode_SetDefaultEncoding(const char *encoding)
				515	{
				516	PyObject *v;
				517
				518	/* Make sure the encoding is valid. As side effect, this also
				519	loads the encoding into the codec registry cache. */
				520	v = _PyCodec_Lookup(encoding);
				521	if (v == NULL)
				522	goto onError;
				523	Py_DECREF(v);
				524	strncpy(unicode_default_encoding,
				525	encoding,
				526	sizeof(unicode_default_encoding));
				527	return 0;
				528
				529	onError:
				530	return -1;
				531	}
				532
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	533	/* --- UTF-8 Codec -------------------------------------------------------- */
				534
				535	static
				536	char utf8_code_length[256] = {
				537	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				538	illegal prefix. see RFC 2279 for details */
				539	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				540	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				541	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				542	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				543	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				544	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				545	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				546	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				547	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				548	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				549	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				550	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				551	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				552	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				553	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				554	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				555	};
				556
				557	static
				558	int utf8_decoding_error(const char **source,
				559	Py_UNICODE **dest,
				560	const char *errors,
				561	const char *details)
				562	{
				563	if ((errors == NULL) \|\|
				564	(strcmp(errors,"strict") == 0)) {
				565	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	566	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	567	details);
				568	return -1;
				569	}
				570	else if (strcmp(errors,"ignore") == 0) {
				571	(*source)++;
				572	return 0;
				573	}
				574	else if (strcmp(errors,"replace") == 0) {
				575	(*source)++;
				576	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				577	(*dest)++;
				578	return 0;
				579	}
				580	else {
				581	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	582	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	583	errors);
				584	return -1;
				585	}
				586	}
				587
				588	#define UTF8_ERROR(details) do { \
				589	if (utf8_decoding_error(&s, &p, errors, details)) \
				590	goto onError; \
				591	continue; \
				592	} while (0)
				593
				594	PyObject PyUnicode_DecodeUTF8(const char s,
				595	int size,
				596	const char *errors)
				597	{
				598	int n;
				599	const char *e;
				600	PyUnicodeObject *unicode;
				601	Py_UNICODE *p;
				602
				603	/* Note: size will always be longer than the resulting Unicode
				604	character count */
				605	unicode = _PyUnicode_New(size);
				606	if (!unicode)
				607	return NULL;
				608	if (size == 0)
				609	return (PyObject *)unicode;
				610
				611	/* Unpack UTF-8 encoded data */
				612	p = unicode->str;
				613	e = s + size;
				614
				615	while (s < e) {
				616	register Py_UNICODE ch = (unsigned char)*s;
				617
				618	if (ch < 0x80) {
				619	*p++ = ch;
				620	s++;
				621	continue;
				622	}
				623
				624	n = utf8_code_length[ch];
				625
				626	if (s + n > e)
				627	UTF8_ERROR("unexpected end of data");
				628
				629	switch (n) {
				630
				631	case 0:
				632	UTF8_ERROR("unexpected code byte");
				633	break;
				634
				635	case 1:
				636	UTF8_ERROR("internal error");
				637	break;
				638
				639	case 2:
				640	if ((s[1] & 0xc0) != 0x80)
				641	UTF8_ERROR("invalid data");
				642	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
				643	if (ch < 0x80)
				644	UTF8_ERROR("illegal encoding");
				645	else
				646	*p++ = ch;
				647	break;
				648
				649	case 3:
				650	if ((s[1] & 0xc0) != 0x80 \|\|
				651	(s[2] & 0xc0) != 0x80)
				652	UTF8_ERROR("invalid data");
				653	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
				654	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000))
				655	UTF8_ERROR("illegal encoding");
				656	else
				657	*p++ = ch;
				658	break;
				659
				660	default:
				661	/* Other sizes are only needed for UCS-4 */
				662	UTF8_ERROR("unsupported Unicode code range");
				663	}
				664	s += n;
				665	}
				666
				667	/* Adjust length */
				668	if (_PyUnicode_Resize(unicode, p - unicode->str))
				669	goto onError;
				670
				671	return (PyObject *)unicode;
				672
				673	onError:
				674	Py_DECREF(unicode);
				675	return NULL;
				676	}
				677
				678	#undef UTF8_ERROR
				679
				680	static
				681	int utf8_encoding_error(const Py_UNICODE **source,
				682	char **dest,
				683	const char *errors,
				684	const char *details)
				685	{
				686	if ((errors == NULL) \|\|
				687	(strcmp(errors,"strict") == 0)) {
				688	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	689	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	690	details);
				691	return -1;
				692	}
				693	else if (strcmp(errors,"ignore") == 0) {
				694	return 0;
				695	}
				696	else if (strcmp(errors,"replace") == 0) {
				697	**dest = '?';
				698	(*dest)++;
				699	return 0;
				700	}
				701	else {
				702	PyErr_Format(PyExc_ValueError,
				703	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	704	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	705	errors);
				706	return -1;
				707	}
				708	}
				709
				710	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				711	int size,
				712	const char *errors)
				713	{
				714	PyObject *v;
				715	char *p;
				716	char *q;
				717
				718	v = PyString_FromStringAndSize(NULL, 3 * size);
				719	if (v == NULL)
				720	return NULL;
				721	if (size == 0)
				722	goto done;
				723
				724	p = q = PyString_AS_STRING(v);
				725	while (size-- > 0) {
				726	Py_UNICODE ch = *s++;
				727	if (ch < 0x80)
				728	*p++ = (char) ch;
				729	else if (ch < 0x0800) {
				730	*p++ = 0xc0 \| (ch >> 6);
				731	*p++ = 0x80 \| (ch & 0x3f);
				732	} else if (0xD800 <= ch && ch <= 0xDFFF) {
				733	/* These byte ranges are reserved for UTF-16 surrogate
				734	bytes which the Python implementation currently does
				735	not support. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	736	if (utf8_encoding_error(&s, &p, errors,
				737	"unsupported code range"))
				738	goto onError;
				739	} else {
				740	*p++ = 0xe0 \| (ch >> 12);
				741	*p++ = 0x80 \| ((ch >> 6) & 0x3f);
				742	*p++ = 0x80 \| (ch & 0x3f);
				743	}
				744	}
				745	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	746	if (_PyString_Resize(&v, p - q))
				747	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	748
				749	done:
				750	return v;
				751
				752	onError:
				753	Py_DECREF(v);
				754	return NULL;
				755	}
				756
				757	/* Return a Python string holding the UTF-8 encoded value of the
				758	Unicode object.
				759
				760	The resulting string is cached in the Unicode object for subsequent
				761	usage by this function. The cached version is needed to implement
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	762	the character buffer interface and will live (at least) as long as
				763	the Unicode object itself.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	764
				765	The refcount of the string is not incremented.
				766
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	767	* Exported for internal use by the interpreter only !!! *
				768
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	769	*/
				770
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	771	PyObject _PyUnicode_AsUTF8String(PyObject unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	772	const char *errors)
				773	{
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	774	PyObject v = ((PyUnicodeObject )unicode)->utf8str;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	775
				776	if (v)
				777	return v;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	778	v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				779	PyUnicode_GET_SIZE(unicode),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	780	errors);
				781	if (v && errors == NULL)
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	782	((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	783	return v;
				784	}
				785
				786	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				787	{
				788	PyObject *str;
				789
				790	if (!PyUnicode_Check(unicode)) {
				791	PyErr_BadArgument();
				792	return NULL;
				793	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	794	str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	795	if (str == NULL)
				796	return NULL;
				797	Py_INCREF(str);
				798	return str;
				799	}
				800
				801	/* --- UTF-16 Codec ------------------------------------------------------- */
				802
				803	static
				804	int utf16_decoding_error(const Py_UNICODE **source,
				805	Py_UNICODE **dest,
				806	const char *errors,
				807	const char *details)
				808	{
				809	if ((errors == NULL) \|\|
				810	(strcmp(errors,"strict") == 0)) {
				811	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	812	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	813	details);
				814	return -1;
				815	}
				816	else if (strcmp(errors,"ignore") == 0) {
				817	return 0;
				818	}
				819	else if (strcmp(errors,"replace") == 0) {
				820	if (dest) {
				821	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				822	(*dest)++;
				823	}
				824	return 0;
				825	}
				826	else {
				827	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	828	"UTF-16 decoding error; "
				829	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	830	errors);
				831	return -1;
				832	}
				833	}
				834
				835	#define UTF16_ERROR(details) do { \
				836	if (utf16_decoding_error(&q, &p, errors, details)) \
				837	goto onError; \
				838	continue; \
				839	} while(0)
				840
				841	PyObject PyUnicode_DecodeUTF16(const char s,
				842	int size,
				843	const char *errors,
				844	int *byteorder)
				845	{
				846	PyUnicodeObject *unicode;
				847	Py_UNICODE *p;
				848	const Py_UNICODE q, e;
				849	int bo = 0;
				850
				851	/* size should be an even number */
				852	if (size % sizeof(Py_UNICODE) != 0) {
				853	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				854	return NULL;
				855	/* The remaining input chars are ignored if we fall through
				856	here... */
				857	}
				858
				859	/* Note: size will always be longer than the resulting Unicode
				860	character count */
				861	unicode = _PyUnicode_New(size);
				862	if (!unicode)
				863	return NULL;
				864	if (size == 0)
				865	return (PyObject *)unicode;
				866
				867	/* Unpack UTF-16 encoded data */
				868	p = unicode->str;
				869	q = (Py_UNICODE *)s;
				870	e = q + (size / sizeof(Py_UNICODE));
				871
				872	if (byteorder)
				873	bo = *byteorder;
				874
				875	while (q < e) {
				876	register Py_UNICODE ch = *q++;
				877
				878	/* Check for BOM marks (U+FEFF) in the input and adjust
				879	current byte order setting accordingly. Swap input
				880	bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
				881	!) */
				882	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				883	if (ch == 0xFEFF) {
				884	bo = -1;
				885	continue;
				886	} else if (ch == 0xFFFE) {
				887	bo = 1;
				888	continue;
				889	}
				890	if (bo == 1)
				891	ch = (ch >> 8) \| (ch << 8);
				892	#else
				893	if (ch == 0xFEFF) {
				894	bo = 1;
				895	continue;
				896	} else if (ch == 0xFFFE) {
				897	bo = -1;
				898	continue;
				899	}
				900	if (bo == -1)
				901	ch = (ch >> 8) \| (ch << 8);
				902	#endif
				903	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				904	*p++ = ch;
				905	continue;
				906	}
				907
				908	/* UTF-16 code pair: */
				909	if (q >= e)
				910	UTF16_ERROR("unexpected end of data");
				911	if (0xDC00 <= q && q <= 0xDFFF) {
				912	q++;
				913	if (0xD800 <= q && q <= 0xDBFF)
				914	/* This is valid data (a UTF-16 surrogate pair), but
				915	we are not able to store this information since our
				916	Py_UNICODE type only has 16 bits... this might
				917	change someday, even though it's unlikely. */
				918	UTF16_ERROR("code pairs are not supported");
				919	else
				920	continue;
				921	}
				922	UTF16_ERROR("illegal encoding");
				923	}
				924
				925	if (byteorder)
				926	*byteorder = bo;
				927
				928	/* Adjust length */
				929	if (_PyUnicode_Resize(unicode, p - unicode->str))
				930	goto onError;
				931
				932	return (PyObject *)unicode;
				933
				934	onError:
				935	Py_DECREF(unicode);
				936	return NULL;
				937	}
				938
				939	#undef UTF16_ERROR
				940
				941	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				942	int size,
				943	const char *errors,
				944	int byteorder)
				945	{
				946	PyObject *v;
				947	Py_UNICODE *p;
				948	char *q;
				949
				950	/* We don't create UTF-16 pairs... */
				951	v = PyString_FromStringAndSize(NULL,
				952	sizeof(Py_UNICODE) * (size + (byteorder == 0)));
				953	if (v == NULL)
				954	return NULL;
				955	if (size == 0)
				956	goto done;
				957
				958	q = PyString_AS_STRING(v);
				959	p = (Py_UNICODE *)q;
				960
				961	if (byteorder == 0)
				962	*p++ = 0xFEFF;
				963	if (byteorder == 0 \|\|
				964	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				965	byteorder == -1
				966	#else
				967	byteorder == 1
				968	#endif
				969	)
				970	memcpy(p, s, size * sizeof(Py_UNICODE));
				971	else
				972	while (size-- > 0) {
				973	Py_UNICODE ch = *s++;
				974	*p++ = (ch >> 8) \| (ch << 8);
				975	}
				976	done:
				977	return v;
				978	}
				979
				980	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				981	{
				982	if (!PyUnicode_Check(unicode)) {
				983	PyErr_BadArgument();
				984	return NULL;
				985	}
				986	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				987	PyUnicode_GET_SIZE(unicode),
				988	NULL,
				989	0);
				990	}
				991
				992	/* --- Unicode Escape Codec ----------------------------------------------- */
				993
				994	static
				995	int unicodeescape_decoding_error(const char **source,
				996	unsigned int *x,
				997	const char *errors,
				998	const char *details)
				999	{
				1000	if ((errors == NULL) \|\|
				1001	(strcmp(errors,"strict") == 0)) {
				1002	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1003	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1004	details);
				1005	return -1;
				1006	}
				1007	else if (strcmp(errors,"ignore") == 0) {
				1008	return 0;
				1009	}
				1010	else if (strcmp(errors,"replace") == 0) {
				1011	*x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
				1012	return 0;
				1013	}
				1014	else {
				1015	PyErr_Format(PyExc_ValueError,
				1016	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1017	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1018	errors);
				1019	return -1;
				1020	}
				1021	}
				1022
				1023	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1024	int size,
				1025	const char *errors)
				1026	{
				1027	PyUnicodeObject *v;
				1028	Py_UNICODE p = NULL, buf = NULL;
				1029	const char *end;
				1030
				1031	/* Escaped strings will always be longer than the resulting
				1032	Unicode string, so we start with size here and then reduce the
				1033	length after conversion to the true value. */
				1034	v = _PyUnicode_New(size);
				1035	if (v == NULL)
				1036	goto onError;
				1037	if (size == 0)
				1038	return (PyObject *)v;
				1039	p = buf = PyUnicode_AS_UNICODE(v);
				1040	end = s + size;
				1041	while (s < end) {
				1042	unsigned char c;
				1043	unsigned int x;
				1044	int i;
				1045
				1046	/* Non-escape characters are interpreted as Unicode ordinals */
				1047	if (*s != '\\') {
				1048	p++ = (unsigned char)s++;
				1049	continue;
				1050	}
				1051
				1052	/* \ - Escapes */
				1053	s++;
				1054	switch (*s++) {
				1055
				1056	/* \x escapes */
				1057	case '\n': break;
				1058	case '\\': *p++ = '\\'; break;
				1059	case '\'': *p++ = '\''; break;
				1060	case '\"': *p++ = '\"'; break;
				1061	case 'b': *p++ = '\b'; break;
				1062	case 'f': p++ = '\014'; break; / FF */
				1063	case 't': *p++ = '\t'; break;
				1064	case 'n': *p++ = '\n'; break;
				1065	case 'r': *p++ = '\r'; break;
				1066	case 'v': p++ = '\013'; break; / VT */
				1067	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1068
				1069	/* \OOO (octal) escapes */
				1070	case '0': case '1': case '2': case '3':
				1071	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1072	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1073	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1074	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1075	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1076	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1077	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1078	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	break;
				1080
				1081	/* \xXXXX escape with 0-4 hex digits */
				1082	case 'x':
				1083	x = 0;
				1084	c = (unsigned char)*s;
				1085	if (isxdigit(c)) {
				1086	do {
				1087	x = (x<<4) & ~0xF;
				1088	if ('0' <= c && c <= '9')
				1089	x += c - '0';
				1090	else if ('a' <= c && c <= 'f')
				1091	x += 10 + c - 'a';
				1092	else
				1093	x += 10 + c - 'A';
				1094	c = (unsigned char)*++s;
				1095	} while (isxdigit(c));
				1096	*p++ = x;
				1097	} else {
				1098	*p++ = '\\';
				1099	*p++ = (unsigned char)s[-1];
				1100	}
				1101	break;
				1102
				1103	/* \uXXXX with 4 hex digits */
				1104	case 'u':
				1105	for (x = 0, i = 0; i < 4; i++) {
				1106	c = (unsigned char)s[i];
				1107	if (!isxdigit(c)) {
				1108	if (unicodeescape_decoding_error(&s, &x, errors,
				1109	"truncated \\uXXXX"))
				1110	goto onError;
				1111	i++;
				1112	break;
				1113	}
				1114	x = (x<<4) & ~0xF;
				1115	if (c >= '0' && c <= '9')
				1116	x += c - '0';
				1117	else if (c >= 'a' && c <= 'f')
				1118	x += 10 + c - 'a';
				1119	else
				1120	x += 10 + c - 'A';
				1121	}
				1122	s += i;
				1123	*p++ = x;
				1124	break;
				1125
				1126	default:
				1127	*p++ = '\\';
				1128	*p++ = (unsigned char)s[-1];
				1129	break;
				1130	}
				1131	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1132	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1133	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1134	return (PyObject *)v;
				1135
				1136	onError:
				1137	Py_XDECREF(v);
				1138	return NULL;
				1139	}
				1140
				1141	/* Return a Unicode-Escape string version of the Unicode object.
				1142
				1143	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1144	appropriate.
				1145
				1146	*/
				1147
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1148	static const Py_UNICODE findchar(const Py_UNICODE s,
				1149	int size,
				1150	Py_UNICODE ch);
				1151
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1152	static
				1153	PyObject unicodeescape_string(const Py_UNICODE s,
				1154	int size,
				1155	int quotes)
				1156	{
				1157	PyObject *repr;
				1158	char *p;
				1159	char *q;
				1160
				1161	static const char *hexdigit = "0123456789ABCDEF";
				1162
				1163	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1164	if (repr == NULL)
				1165	return NULL;
				1166
				1167	p = q = PyString_AS_STRING(repr);
				1168
				1169	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1170	*p++ = 'u';
				1171	*p++ = (findchar(s, size, '\'') &&
				1172	!findchar(s, size, '"')) ? '"' : '\'';
				1173	}
				1174	while (size-- > 0) {
				1175	Py_UNICODE ch = *s++;
				1176	/* Escape quotes */
				1177	if (quotes && (ch == q[1] \|\| ch == '\\')) {
				1178	*p++ = '\\';
				1179	*p++ = (char) ch;
				1180	}
				1181	/* Map 16-bit characters to '\uxxxx' */
				1182	else if (ch >= 256) {
				1183	*p++ = '\\';
				1184	*p++ = 'u';
				1185	*p++ = hexdigit[(ch >> 12) & 0xf];
				1186	*p++ = hexdigit[(ch >> 8) & 0xf];
				1187	*p++ = hexdigit[(ch >> 4) & 0xf];
				1188	*p++ = hexdigit[ch & 15];
				1189	}
				1190	/* Map non-printable US ASCII to '\ooo' */
				1191	else if (ch < ' ' \|\| ch >= 128) {
				1192	*p++ = '\\';
				1193	*p++ = hexdigit[(ch >> 6) & 7];
				1194	*p++ = hexdigit[(ch >> 3) & 7];
				1195	*p++ = hexdigit[ch & 7];
				1196	}
				1197	/* Copy everything else as-is */
				1198	else
				1199	*p++ = (char) ch;
				1200	}
				1201	if (quotes)
				1202	*p++ = q[1];
				1203
				1204	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1205	if (_PyString_Resize(&repr, p - q))
				1206	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1207
				1208	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1209
				1210	onError:
				1211	Py_DECREF(repr);
				1212	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1213	}
				1214
				1215	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1216	int size)
				1217	{
				1218	return unicodeescape_string(s, size, 0);
				1219	}
				1220
				1221	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1222	{
				1223	if (!PyUnicode_Check(unicode)) {
				1224	PyErr_BadArgument();
				1225	return NULL;
				1226	}
				1227	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1228	PyUnicode_GET_SIZE(unicode));
				1229	}
				1230
				1231	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1232
				1233	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1234	int size,
				1235	const char *errors)
				1236	{
				1237	PyUnicodeObject *v;
				1238	Py_UNICODE p, buf;
				1239	const char *end;
				1240	const char *bs;
				1241
				1242	/* Escaped strings will always be longer than the resulting
				1243	Unicode string, so we start with size here and then reduce the
				1244	length after conversion to the true value. */
				1245	v = _PyUnicode_New(size);
				1246	if (v == NULL)
				1247	goto onError;
				1248	if (size == 0)
				1249	return (PyObject *)v;
				1250	p = buf = PyUnicode_AS_UNICODE(v);
				1251	end = s + size;
				1252	while (s < end) {
				1253	unsigned char c;
				1254	unsigned int x;
				1255	int i;
				1256
				1257	/* Non-escape characters are interpreted as Unicode ordinals */
				1258	if (*s != '\\') {
				1259	p++ = (unsigned char)s++;
				1260	continue;
				1261	}
				1262
				1263	/* \u-escapes are only interpreted iff the number of leading
				1264	backslashes if odd */
				1265	bs = s;
				1266	for (;s < end;) {
				1267	if (*s != '\\')
				1268	break;
				1269	p++ = (unsigned char)s++;
				1270	}
				1271	if (((s - bs) & 1) == 0 \|\|
				1272	s >= end \|\|
				1273	*s != 'u') {
				1274	continue;
				1275	}
				1276	p--;
				1277	s++;
				1278
				1279	/* \uXXXX with 4 hex digits */
				1280	for (x = 0, i = 0; i < 4; i++) {
				1281	c = (unsigned char)s[i];
				1282	if (!isxdigit(c)) {
				1283	if (unicodeescape_decoding_error(&s, &x, errors,
				1284	"truncated \\uXXXX"))
				1285	goto onError;
				1286	i++;
				1287	break;
				1288	}
				1289	x = (x<<4) & ~0xF;
				1290	if (c >= '0' && c <= '9')
				1291	x += c - '0';
				1292	else if (c >= 'a' && c <= 'f')
				1293	x += 10 + c - 'a';
				1294	else
				1295	x += 10 + c - 'A';
				1296	}
				1297	s += i;
				1298	*p++ = x;
				1299	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1300	if (_PyUnicode_Resize(v, (int)(p - buf)))
				1301	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1302	return (PyObject *)v;
				1303
				1304	onError:
				1305	Py_XDECREF(v);
				1306	return NULL;
				1307	}
				1308
				1309	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1310	int size)
				1311	{
				1312	PyObject *repr;
				1313	char *p;
				1314	char *q;
				1315
				1316	static const char *hexdigit = "0123456789ABCDEF";
				1317
				1318	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1319	if (repr == NULL)
				1320	return NULL;
				1321
				1322	p = q = PyString_AS_STRING(repr);
				1323	while (size-- > 0) {
				1324	Py_UNICODE ch = *s++;
				1325	/* Map 16-bit characters to '\uxxxx' */
				1326	if (ch >= 256) {
				1327	*p++ = '\\';
				1328	*p++ = 'u';
				1329	*p++ = hexdigit[(ch >> 12) & 0xf];
				1330	*p++ = hexdigit[(ch >> 8) & 0xf];
				1331	*p++ = hexdigit[(ch >> 4) & 0xf];
				1332	*p++ = hexdigit[ch & 15];
				1333	}
				1334	/* Copy everything else as-is */
				1335	else
				1336	*p++ = (char) ch;
				1337	}
				1338	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1339	if (_PyString_Resize(&repr, p - q))
				1340	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1341
				1342	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1343
				1344	onError:
				1345	Py_DECREF(repr);
				1346	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1347	}
				1348
				1349	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1350	{
				1351	if (!PyUnicode_Check(unicode)) {
				1352	PyErr_BadArgument();
				1353	return NULL;
				1354	}
				1355	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1356	PyUnicode_GET_SIZE(unicode));
				1357	}
				1358
				1359	/* --- Latin-1 Codec ------------------------------------------------------ */
				1360
				1361	PyObject PyUnicode_DecodeLatin1(const char s,
				1362	int size,
				1363	const char *errors)
				1364	{
				1365	PyUnicodeObject *v;
				1366	Py_UNICODE *p;
				1367
				1368	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
				1369	v = _PyUnicode_New(size);
				1370	if (v == NULL)
				1371	goto onError;
				1372	if (size == 0)
				1373	return (PyObject *)v;
				1374	p = PyUnicode_AS_UNICODE(v);
				1375	while (size-- > 0)
				1376	p++ = (unsigned char)s++;
				1377	return (PyObject *)v;
				1378
				1379	onError:
				1380	Py_XDECREF(v);
				1381	return NULL;
				1382	}
				1383
				1384	static
				1385	int latin1_encoding_error(const Py_UNICODE **source,
				1386	char **dest,
				1387	const char *errors,
				1388	const char *details)
				1389	{
				1390	if ((errors == NULL) \|\|
				1391	(strcmp(errors,"strict") == 0)) {
				1392	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1393	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1394	details);
				1395	return -1;
				1396	}
				1397	else if (strcmp(errors,"ignore") == 0) {
				1398	return 0;
				1399	}
				1400	else if (strcmp(errors,"replace") == 0) {
				1401	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1402	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1403	return 0;
				1404	}
				1405	else {
				1406	PyErr_Format(PyExc_ValueError,
				1407	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1408	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1409	errors);
				1410	return -1;
				1411	}
				1412	}
				1413
				1414	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1415	int size,
				1416	const char *errors)
				1417	{
				1418	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1419	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1420	repr = PyString_FromStringAndSize(NULL, size);
				1421	if (repr == NULL)
				1422	return NULL;
				1423
				1424	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1425	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1426	while (size-- > 0) {
				1427	Py_UNICODE ch = *p++;
				1428	if (ch >= 256) {
				1429	if (latin1_encoding_error(&p, &s, errors,
				1430	"ordinal not in range(256)"))
				1431	goto onError;
				1432	}
				1433	else
				1434	*s++ = (char)ch;
				1435	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1436	/* Resize if error handling skipped some characters */
				1437	if (s - start < PyString_GET_SIZE(repr))
				1438	if (_PyString_Resize(&repr, s - start))
				1439	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1440	return repr;
				1441
				1442	onError:
				1443	Py_DECREF(repr);
				1444	return NULL;
				1445	}
				1446
				1447	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1448	{
				1449	if (!PyUnicode_Check(unicode)) {
				1450	PyErr_BadArgument();
				1451	return NULL;
				1452	}
				1453	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1454	PyUnicode_GET_SIZE(unicode),
				1455	NULL);
				1456	}
				1457
				1458	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1459
				1460	static
				1461	int ascii_decoding_error(const char **source,
				1462	Py_UNICODE **dest,
				1463	const char *errors,
				1464	const char *details)
				1465	{
				1466	if ((errors == NULL) \|\|
				1467	(strcmp(errors,"strict") == 0)) {
				1468	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1469	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1470	details);
				1471	return -1;
				1472	}
				1473	else if (strcmp(errors,"ignore") == 0) {
				1474	return 0;
				1475	}
				1476	else if (strcmp(errors,"replace") == 0) {
				1477	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1478	(*dest)++;
				1479	return 0;
				1480	}
				1481	else {
				1482	PyErr_Format(PyExc_ValueError,
				1483	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1484	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	errors);
				1486	return -1;
				1487	}
				1488	}
				1489
				1490	PyObject PyUnicode_DecodeASCII(const char s,
				1491	int size,
				1492	const char *errors)
				1493	{
				1494	PyUnicodeObject *v;
				1495	Py_UNICODE *p;
				1496
				1497	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
				1498	v = _PyUnicode_New(size);
				1499	if (v == NULL)
				1500	goto onError;
				1501	if (size == 0)
				1502	return (PyObject *)v;
				1503	p = PyUnicode_AS_UNICODE(v);
				1504	while (size-- > 0) {
				1505	register unsigned char c;
				1506
				1507	c = (unsigned char)*s++;
				1508	if (c < 128)
				1509	*p++ = c;
				1510	else if (ascii_decoding_error(&s, &p, errors,
				1511	"ordinal not in range(128)"))
				1512	goto onError;
				1513	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1514	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
				1515	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1516	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1517	return (PyObject *)v;
				1518
				1519	onError:
				1520	Py_XDECREF(v);
				1521	return NULL;
				1522	}
				1523
				1524	static
				1525	int ascii_encoding_error(const Py_UNICODE **source,
				1526	char **dest,
				1527	const char *errors,
				1528	const char *details)
				1529	{
				1530	if ((errors == NULL) \|\|
				1531	(strcmp(errors,"strict") == 0)) {
				1532	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1533	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1534	details);
				1535	return -1;
				1536	}
				1537	else if (strcmp(errors,"ignore") == 0) {
				1538	return 0;
				1539	}
				1540	else if (strcmp(errors,"replace") == 0) {
				1541	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1542	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1543	return 0;
				1544	}
				1545	else {
				1546	PyErr_Format(PyExc_ValueError,
				1547	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1548	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1549	errors);
				1550	return -1;
				1551	}
				1552	}
				1553
				1554	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1555	int size,
				1556	const char *errors)
				1557	{
				1558	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1559	char s, start;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1560	repr = PyString_FromStringAndSize(NULL, size);
				1561	if (repr == NULL)
				1562	return NULL;
				1563
				1564	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1565	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1566	while (size-- > 0) {
				1567	Py_UNICODE ch = *p++;
				1568	if (ch >= 128) {
				1569	if (ascii_encoding_error(&p, &s, errors,
				1570	"ordinal not in range(128)"))
				1571	goto onError;
				1572	}
				1573	else
				1574	*s++ = (char)ch;
				1575	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1576	/* Resize if error handling skipped some characters */
				1577	if (s - start < PyString_GET_SIZE(repr))
				1578	if (_PyString_Resize(&repr, s - start))
				1579	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1580	return repr;
				1581
				1582	onError:
				1583	Py_DECREF(repr);
				1584	return NULL;
				1585	}
				1586
				1587	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1588	{
				1589	if (!PyUnicode_Check(unicode)) {
				1590	PyErr_BadArgument();
				1591	return NULL;
				1592	}
				1593	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1594	PyUnicode_GET_SIZE(unicode),
				1595	NULL);
				1596	}
				1597
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1598	#ifdef MS_WIN32
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1599
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1600	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1601
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1602	PyObject PyUnicode_DecodeMBCS(const char s,
				1603	int size,
				1604	const char *errors)
				1605	{
				1606	PyUnicodeObject *v;
				1607	Py_UNICODE *p;
				1608
				1609	/* First get the size of the result */
				1610	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1611	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1612	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1613
				1614	v = _PyUnicode_New(usize);
				1615	if (v == NULL)
				1616	return NULL;
				1617	if (usize == 0)
				1618	return (PyObject *)v;
				1619	p = PyUnicode_AS_UNICODE(v);
				1620	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1621	Py_DECREF(v);
				1622	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1623	}
				1624
				1625	return (PyObject *)v;
				1626	}
				1627
				1628	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1629	int size,
				1630	const char *errors)
				1631	{
				1632	PyObject *repr;
				1633	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1634	DWORD mbcssize;
				1635
				1636	/* If there are no characters, bail now! */
				1637	if (size==0)
				1638	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1639
				1640	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1641	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1642	if (mbcssize==0)
				1643	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1644
				1645	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1646	if (repr == NULL)
				1647	return NULL;
				1648	if (mbcssize==0)
				1649	return repr;
				1650
				1651	/* Do the conversion */
				1652	s = PyString_AS_STRING(repr);
				1653	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1654	Py_DECREF(repr);
				1655	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1656	}
				1657	return repr;
				1658	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1659
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1660	#endif /* MS_WIN32 */
				1661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1662	/* --- Character Mapping Codec -------------------------------------------- */
				1663
				1664	static
				1665	int charmap_decoding_error(const char **source,
				1666	Py_UNICODE **dest,
				1667	const char *errors,
				1668	const char *details)
				1669	{
				1670	if ((errors == NULL) \|\|
				1671	(strcmp(errors,"strict") == 0)) {
				1672	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1673	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1674	details);
				1675	return -1;
				1676	}
				1677	else if (strcmp(errors,"ignore") == 0) {
				1678	return 0;
				1679	}
				1680	else if (strcmp(errors,"replace") == 0) {
				1681	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1682	(*dest)++;
				1683	return 0;
				1684	}
				1685	else {
				1686	PyErr_Format(PyExc_ValueError,
				1687	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1688	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1689	errors);
				1690	return -1;
				1691	}
				1692	}
				1693
				1694	PyObject PyUnicode_DecodeCharmap(const char s,
				1695	int size,
				1696	PyObject *mapping,
				1697	const char *errors)
				1698	{
				1699	PyUnicodeObject *v;
				1700	Py_UNICODE *p;
				1701
				1702	/* Default to Latin-1 */
				1703	if (mapping == NULL)
				1704	return PyUnicode_DecodeLatin1(s, size, errors);
				1705
				1706	v = _PyUnicode_New(size);
				1707	if (v == NULL)
				1708	goto onError;
				1709	if (size == 0)
				1710	return (PyObject *)v;
				1711	p = PyUnicode_AS_UNICODE(v);
				1712	while (size-- > 0) {
				1713	unsigned char ch = *s++;
				1714	PyObject w, x;
				1715
				1716	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				1717	w = PyInt_FromLong((long)ch);
				1718	if (w == NULL)
				1719	goto onError;
				1720	x = PyObject_GetItem(mapping, w);
				1721	Py_DECREF(w);
				1722	if (x == NULL) {
				1723	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1724	/* No mapping found: default to Latin-1 mapping */
				1725	PyErr_Clear();
				1726	*p++ = (Py_UNICODE)ch;
				1727	continue;
				1728	}
				1729	goto onError;
				1730	}
				1731
				1732	/* Apply mapping */
				1733	if (PyInt_Check(x)) {
				1734	int value = PyInt_AS_LONG(x);
				1735	if (value < 0 \|\| value > 65535) {
				1736	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	1737	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1738	Py_DECREF(x);
				1739	goto onError;
				1740	}
				1741	*p++ = (Py_UNICODE)value;
				1742	}
				1743	else if (x == Py_None) {
				1744	/* undefined mapping */
				1745	if (charmap_decoding_error(&s, &p, errors,
				1746	"character maps to <undefined>")) {
				1747	Py_DECREF(x);
				1748	goto onError;
				1749	}
				1750	}
				1751	else if (PyUnicode_Check(x)) {
				1752	if (PyUnicode_GET_SIZE(x) != 1) {
				1753	/* 1-n mapping */
				1754	PyErr_SetString(PyExc_NotImplementedError,
				1755	"1-n mappings are currently not implemented");
				1756	Py_DECREF(x);
				1757	goto onError;
				1758	}
				1759	p++ = PyUnicode_AS_UNICODE(x);
				1760	}
				1761	else {
				1762	/* wrong return value */
				1763	PyErr_SetString(PyExc_TypeError,
				1764	"character mapping must return integer, None or unicode");
				1765	Py_DECREF(x);
				1766	goto onError;
				1767	}
				1768	Py_DECREF(x);
				1769	}
				1770	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
				1771	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				1772	goto onError;
				1773	return (PyObject *)v;
				1774
				1775	onError:
				1776	Py_XDECREF(v);
				1777	return NULL;
				1778	}
				1779
				1780	static
				1781	int charmap_encoding_error(const Py_UNICODE **source,
				1782	char **dest,
				1783	const char *errors,
				1784	const char *details)
				1785	{
				1786	if ((errors == NULL) \|\|
				1787	(strcmp(errors,"strict") == 0)) {
				1788	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1789	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1790	details);
				1791	return -1;
				1792	}
				1793	else if (strcmp(errors,"ignore") == 0) {
				1794	return 0;
				1795	}
				1796	else if (strcmp(errors,"replace") == 0) {
				1797	**dest = '?';
				1798	(*dest)++;
				1799	return 0;
				1800	}
				1801	else {
				1802	PyErr_Format(PyExc_ValueError,
				1803	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1804	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1805	errors);
				1806	return -1;
				1807	}
				1808	}
				1809
				1810	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				1811	int size,
				1812	PyObject *mapping,
				1813	const char *errors)
				1814	{
				1815	PyObject *v;
				1816	char *s;
				1817
				1818	/* Default to Latin-1 */
				1819	if (mapping == NULL)
				1820	return PyUnicode_EncodeLatin1(p, size, errors);
				1821
				1822	v = PyString_FromStringAndSize(NULL, size);
				1823	if (v == NULL)
				1824	return NULL;
				1825	s = PyString_AS_STRING(v);
				1826	while (size-- > 0) {
				1827	Py_UNICODE ch = *p++;
				1828	PyObject w, x;
				1829
				1830	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				1831	w = PyInt_FromLong((long)ch);
				1832	if (w == NULL)
				1833	goto onError;
				1834	x = PyObject_GetItem(mapping, w);
				1835	Py_DECREF(w);
				1836	if (x == NULL) {
				1837	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1838	/* No mapping found: default to Latin-1 mapping if possible */
				1839	PyErr_Clear();
				1840	if (ch < 256) {
				1841	*s++ = (char)ch;
				1842	continue;
				1843	}
				1844	else if (!charmap_encoding_error(&p, &s, errors,
				1845	"missing character mapping"))
				1846	continue;
				1847	}
				1848	goto onError;
				1849	}
				1850
				1851	/* Apply mapping */
				1852	if (PyInt_Check(x)) {
				1853	int value = PyInt_AS_LONG(x);
				1854	if (value < 0 \|\| value > 255) {
				1855	PyErr_SetString(PyExc_TypeError,
				1856	"character mapping must be in range(256)");
				1857	Py_DECREF(x);
				1858	goto onError;
				1859	}
				1860	*s++ = (char)value;
				1861	}
				1862	else if (x == Py_None) {
				1863	/* undefined mapping */
				1864	if (charmap_encoding_error(&p, &s, errors,
				1865	"character maps to <undefined>")) {
				1866	Py_DECREF(x);
				1867	goto onError;
				1868	}
				1869	}
				1870	else if (PyString_Check(x)) {
				1871	if (PyString_GET_SIZE(x) != 1) {
				1872	/* 1-n mapping */
				1873	PyErr_SetString(PyExc_NotImplementedError,
				1874	"1-n mappings are currently not implemented");
				1875	Py_DECREF(x);
				1876	goto onError;
				1877	}
				1878	s++ = PyString_AS_STRING(x);
				1879	}
				1880	else {
				1881	/* wrong return value */
				1882	PyErr_SetString(PyExc_TypeError,
				1883	"character mapping must return integer, None or unicode");
				1884	Py_DECREF(x);
				1885	goto onError;
				1886	}
				1887	Py_DECREF(x);
				1888	}
				1889	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				1890	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				1891	goto onError;
				1892	return v;
				1893
				1894	onError:
				1895	Py_DECREF(v);
				1896	return NULL;
				1897	}
				1898
				1899	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				1900	PyObject *mapping)
				1901	{
				1902	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				1903	PyErr_BadArgument();
				1904	return NULL;
				1905	}
				1906	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				1907	PyUnicode_GET_SIZE(unicode),
				1908	mapping,
				1909	NULL);
				1910	}
				1911
				1912	static
				1913	int translate_error(const Py_UNICODE **source,
				1914	Py_UNICODE **dest,
				1915	const char *errors,
				1916	const char *details)
				1917	{
				1918	if ((errors == NULL) \|\|
				1919	(strcmp(errors,"strict") == 0)) {
				1920	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1921	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1922	details);
				1923	return -1;
				1924	}
				1925	else if (strcmp(errors,"ignore") == 0) {
				1926	return 0;
				1927	}
				1928	else if (strcmp(errors,"replace") == 0) {
				1929	**dest = '?';
				1930	(*dest)++;
				1931	return 0;
				1932	}
				1933	else {
				1934	PyErr_Format(PyExc_ValueError,
				1935	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1936	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1937	errors);
				1938	return -1;
				1939	}
				1940	}
				1941
				1942	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				1943	int size,
				1944	PyObject *mapping,
				1945	const char *errors)
				1946	{
				1947	PyUnicodeObject *v;
				1948	Py_UNICODE *p;
				1949
				1950	if (mapping == NULL) {
				1951	PyErr_BadArgument();
				1952	return NULL;
				1953	}
				1954
				1955	/* Output will never be longer than input */
				1956	v = _PyUnicode_New(size);
				1957	if (v == NULL)
				1958	goto onError;
				1959	if (size == 0)
				1960	goto done;
				1961	p = PyUnicode_AS_UNICODE(v);
				1962	while (size-- > 0) {
				1963	Py_UNICODE ch = *s++;
				1964	PyObject w, x;
				1965
				1966	/* Get mapping */
				1967	w = PyInt_FromLong(ch);
				1968	if (w == NULL)
				1969	goto onError;
				1970	x = PyObject_GetItem(mapping, w);
				1971	Py_DECREF(w);
				1972	if (x == NULL) {
				1973	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				1974	/* No mapping found: default to 1-1 mapping */
				1975	PyErr_Clear();
				1976	*p++ = ch;
				1977	continue;
				1978	}
				1979	goto onError;
				1980	}
				1981
				1982	/* Apply mapping */
				1983	if (PyInt_Check(x))
				1984	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				1985	else if (x == Py_None) {
				1986	/* undefined mapping */
				1987	if (translate_error(&s, &p, errors,
				1988	"character maps to <undefined>")) {
				1989	Py_DECREF(x);
				1990	goto onError;
				1991	}
				1992	}
				1993	else if (PyUnicode_Check(x)) {
				1994	if (PyUnicode_GET_SIZE(x) != 1) {
				1995	/* 1-n mapping */
				1996	PyErr_SetString(PyExc_NotImplementedError,
				1997	"1-n mappings are currently not implemented");
				1998	Py_DECREF(x);
				1999	goto onError;
				2000	}
				2001	p++ = PyUnicode_AS_UNICODE(x);
				2002	}
				2003	else {
				2004	/* wrong return value */
				2005	PyErr_SetString(PyExc_TypeError,
				2006	"translate mapping must return integer, None or unicode");
				2007	Py_DECREF(x);
				2008	goto onError;
				2009	}
				2010	Py_DECREF(x);
				2011	}
				2012	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2013	if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
				2014	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2015
				2016	done:
				2017	return (PyObject *)v;
				2018
				2019	onError:
				2020	Py_XDECREF(v);
				2021	return NULL;
				2022	}
				2023
				2024	PyObject PyUnicode_Translate(PyObject str,
				2025	PyObject *mapping,
				2026	const char *errors)
				2027	{
				2028	PyObject *result;
				2029
				2030	str = PyUnicode_FromObject(str);
				2031	if (str == NULL)
				2032	goto onError;
				2033	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2034	PyUnicode_GET_SIZE(str),
				2035	mapping,
				2036	errors);
				2037	Py_DECREF(str);
				2038	return result;
				2039
				2040	onError:
				2041	Py_XDECREF(str);
				2042	return NULL;
				2043	}
				2044
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2045	/* --- Decimal Encoder ---------------------------------------------------- */
				2046
				2047	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2048	int length,
				2049	char *output,
				2050	const char *errors)
				2051	{
				2052	Py_UNICODE p, end;
				2053
				2054	if (output == NULL) {
				2055	PyErr_BadArgument();
				2056	return -1;
				2057	}
				2058
				2059	p = s;
				2060	end = s + length;
				2061	while (p < end) {
				2062	register Py_UNICODE ch = *p++;
				2063	int decimal;
				2064
				2065	if (Py_UNICODE_ISSPACE(ch)) {
				2066	*output++ = ' ';
				2067	continue;
				2068	}
				2069	decimal = Py_UNICODE_TODECIMAL(ch);
				2070	if (decimal >= 0) {
				2071	*output++ = '0' + decimal;
				2072	continue;
				2073	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2074	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2075	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2076	continue;
				2077	}
				2078	/* All other characters are considered invalid */
				2079	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2080	PyErr_SetString(PyExc_ValueError,
				2081	"invalid decimal Unicode string");
				2082	goto onError;
				2083	}
				2084	else if (strcmp(errors, "ignore") == 0)
				2085	continue;
				2086	else if (strcmp(errors, "replace") == 0) {
				2087	*output++ = '?';
				2088	continue;
				2089	}
				2090	}
				2091	/* 0-terminate the output string */
				2092	*output++ = '\0';
				2093	return 0;
				2094
				2095	onError:
				2096	return -1;
				2097	}
				2098
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2099	/* --- Helpers ------------------------------------------------------------ */
				2100
				2101	static
				2102	int count(PyUnicodeObject *self,
				2103	int start,
				2104	int end,
				2105	PyUnicodeObject *substring)
				2106	{
				2107	int count = 0;
				2108
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2109	if (substring->length == 0)
				2110	return (end - start + 1);
				2111
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2112	end -= substring->length;
				2113
				2114	while (start <= end)
				2115	if (Py_UNICODE_MATCH(self, start, substring)) {
				2116	count++;
				2117	start += substring->length;
				2118	} else
				2119	start++;
				2120
				2121	return count;
				2122	}
				2123
				2124	int PyUnicode_Count(PyObject *str,
				2125	PyObject *substr,
				2126	int start,
				2127	int end)
				2128	{
				2129	int result;
				2130
				2131	str = PyUnicode_FromObject(str);
				2132	if (str == NULL)
				2133	return -1;
				2134	substr = PyUnicode_FromObject(substr);
				2135	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2136	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2137	return -1;
				2138	}
				2139
				2140	result = count((PyUnicodeObject *)str,
				2141	start, end,
				2142	(PyUnicodeObject *)substr);
				2143
				2144	Py_DECREF(str);
				2145	Py_DECREF(substr);
				2146	return result;
				2147	}
				2148
				2149	static
				2150	int findstring(PyUnicodeObject *self,
				2151	PyUnicodeObject *substring,
				2152	int start,
				2153	int end,
				2154	int direction)
				2155	{
				2156	if (start < 0)
				2157	start += self->length;
				2158	if (start < 0)
				2159	start = 0;
				2160
				2161	if (substring->length == 0)
				2162	return start;
				2163
				2164	if (end > self->length)
				2165	end = self->length;
				2166	if (end < 0)
				2167	end += self->length;
				2168	if (end < 0)
				2169	end = 0;
				2170
				2171	end -= substring->length;
				2172
				2173	if (direction < 0) {
				2174	for (; end >= start; end--)
				2175	if (Py_UNICODE_MATCH(self, end, substring))
				2176	return end;
				2177	} else {
				2178	for (; start <= end; start++)
				2179	if (Py_UNICODE_MATCH(self, start, substring))
				2180	return start;
				2181	}
				2182
				2183	return -1;
				2184	}
				2185
				2186	int PyUnicode_Find(PyObject *str,
				2187	PyObject *substr,
				2188	int start,
				2189	int end,
				2190	int direction)
				2191	{
				2192	int result;
				2193
				2194	str = PyUnicode_FromObject(str);
				2195	if (str == NULL)
				2196	return -1;
				2197	substr = PyUnicode_FromObject(substr);
				2198	if (substr == NULL) {
				2199	Py_DECREF(substr);
				2200	return -1;
				2201	}
				2202
				2203	result = findstring((PyUnicodeObject *)str,
				2204	(PyUnicodeObject *)substr,
				2205	start, end, direction);
				2206	Py_DECREF(str);
				2207	Py_DECREF(substr);
				2208	return result;
				2209	}
				2210
				2211	static
				2212	int tailmatch(PyUnicodeObject *self,
				2213	PyUnicodeObject *substring,
				2214	int start,
				2215	int end,
				2216	int direction)
				2217	{
				2218	if (start < 0)
				2219	start += self->length;
				2220	if (start < 0)
				2221	start = 0;
				2222
				2223	if (substring->length == 0)
				2224	return 1;
				2225
				2226	if (end > self->length)
				2227	end = self->length;
				2228	if (end < 0)
				2229	end += self->length;
				2230	if (end < 0)
				2231	end = 0;
				2232
				2233	end -= substring->length;
				2234	if (end < start)
				2235	return 0;
				2236
				2237	if (direction > 0) {
				2238	if (Py_UNICODE_MATCH(self, end, substring))
				2239	return 1;
				2240	} else {
				2241	if (Py_UNICODE_MATCH(self, start, substring))
				2242	return 1;
				2243	}
				2244
				2245	return 0;
				2246	}
				2247
				2248	int PyUnicode_Tailmatch(PyObject *str,
				2249	PyObject *substr,
				2250	int start,
				2251	int end,
				2252	int direction)
				2253	{
				2254	int result;
				2255
				2256	str = PyUnicode_FromObject(str);
				2257	if (str == NULL)
				2258	return -1;
				2259	substr = PyUnicode_FromObject(substr);
				2260	if (substr == NULL) {
				2261	Py_DECREF(substr);
				2262	return -1;
				2263	}
				2264
				2265	result = tailmatch((PyUnicodeObject *)str,
				2266	(PyUnicodeObject *)substr,
				2267	start, end, direction);
				2268	Py_DECREF(str);
				2269	Py_DECREF(substr);
				2270	return result;
				2271	}
				2272
				2273	static
				2274	const Py_UNICODE findchar(const Py_UNICODE s,
				2275	int size,
				2276	Py_UNICODE ch)
				2277	{
				2278	/* like wcschr, but doesn't stop at NULL characters */
				2279
				2280	while (size-- > 0) {
				2281	if (*s == ch)
				2282	return s;
				2283	s++;
				2284	}
				2285
				2286	return NULL;
				2287	}
				2288
				2289	/* Apply fixfct filter to the Unicode object self and return a
				2290	reference to the modified object */
				2291
				2292	static
				2293	PyObject fixup(PyUnicodeObject self,
				2294	int (fixfct)(PyUnicodeObject s))
				2295	{
				2296
				2297	PyUnicodeObject *u;
				2298
				2299	u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
				2300	self->length);
				2301	if (u == NULL)
				2302	return NULL;
				2303	if (!fixfct(u)) {
				2304	/* fixfct should return TRUE if it modified the buffer. If
				2305	FALSE, return a reference to the original buffer instead
				2306	(to save space, not time) */
				2307	Py_INCREF(self);
				2308	Py_DECREF(u);
				2309	return (PyObject*) self;
				2310	}
				2311	return (PyObject*) u;
				2312	}
				2313
				2314	static
				2315	int fixupper(PyUnicodeObject *self)
				2316	{
				2317	int len = self->length;
				2318	Py_UNICODE *s = self->str;
				2319	int status = 0;
				2320
				2321	while (len-- > 0) {
				2322	register Py_UNICODE ch;
				2323
				2324	ch = Py_UNICODE_TOUPPER(*s);
				2325	if (ch != *s) {
				2326	status = 1;
				2327	*s = ch;
				2328	}
				2329	s++;
				2330	}
				2331
				2332	return status;
				2333	}
				2334
				2335	static
				2336	int fixlower(PyUnicodeObject *self)
				2337	{
				2338	int len = self->length;
				2339	Py_UNICODE *s = self->str;
				2340	int status = 0;
				2341
				2342	while (len-- > 0) {
				2343	register Py_UNICODE ch;
				2344
				2345	ch = Py_UNICODE_TOLOWER(*s);
				2346	if (ch != *s) {
				2347	status = 1;
				2348	*s = ch;
				2349	}
				2350	s++;
				2351	}
				2352
				2353	return status;
				2354	}
				2355
				2356	static
				2357	int fixswapcase(PyUnicodeObject *self)
				2358	{
				2359	int len = self->length;
				2360	Py_UNICODE *s = self->str;
				2361	int status = 0;
				2362
				2363	while (len-- > 0) {
				2364	if (Py_UNICODE_ISUPPER(*s)) {
				2365	s = Py_UNICODE_TOLOWER(s);
				2366	status = 1;
				2367	} else if (Py_UNICODE_ISLOWER(*s)) {
				2368	s = Py_UNICODE_TOUPPER(s);
				2369	status = 1;
				2370	}
				2371	s++;
				2372	}
				2373
				2374	return status;
				2375	}
				2376
				2377	static
				2378	int fixcapitalize(PyUnicodeObject *self)
				2379	{
				2380	if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
				2381	self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
				2382	return 1;
				2383	}
				2384	return 0;
				2385	}
				2386
				2387	static
				2388	int fixtitle(PyUnicodeObject *self)
				2389	{
				2390	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2391	register Py_UNICODE *e;
				2392	int previous_is_cased;
				2393
				2394	/* Shortcut for single character strings */
				2395	if (PyUnicode_GET_SIZE(self) == 1) {
				2396	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2397	if (*p != ch) {
				2398	*p = ch;
				2399	return 1;
				2400	}
				2401	else
				2402	return 0;
				2403	}
				2404
				2405	e = p + PyUnicode_GET_SIZE(self);
				2406	previous_is_cased = 0;
				2407	for (; p < e; p++) {
				2408	register const Py_UNICODE ch = *p;
				2409
				2410	if (previous_is_cased)
				2411	*p = Py_UNICODE_TOLOWER(ch);
				2412	else
				2413	*p = Py_UNICODE_TOTITLE(ch);
				2414
				2415	if (Py_UNICODE_ISLOWER(ch) \|\|
				2416	Py_UNICODE_ISUPPER(ch) \|\|
				2417	Py_UNICODE_ISTITLE(ch))
				2418	previous_is_cased = 1;
				2419	else
				2420	previous_is_cased = 0;
				2421	}
				2422	return 1;
				2423	}
				2424
				2425	PyObject PyUnicode_Join(PyObject separator,
				2426	PyObject *seq)
				2427	{
				2428	Py_UNICODE *sep;
				2429	int seplen;
				2430	PyUnicodeObject *res = NULL;
				2431	int reslen = 0;
				2432	Py_UNICODE *p;
				2433	int seqlen = 0;
				2434	int sz = 100;
				2435	int i;
				2436
				2437	seqlen = PySequence_Length(seq);
				2438	if (seqlen < 0 && PyErr_Occurred())
				2439	return NULL;
				2440
				2441	if (separator == NULL) {
				2442	Py_UNICODE blank = ' ';
				2443	sep = &blank;
				2444	seplen = 1;
				2445	}
				2446	else {
				2447	separator = PyUnicode_FromObject(separator);
				2448	if (separator == NULL)
				2449	return NULL;
				2450	sep = PyUnicode_AS_UNICODE(separator);
				2451	seplen = PyUnicode_GET_SIZE(separator);
				2452	}
				2453
				2454	res = _PyUnicode_New(sz);
				2455	if (res == NULL)
				2456	goto onError;
				2457	p = PyUnicode_AS_UNICODE(res);
				2458	reslen = 0;
				2459
				2460	for (i = 0; i < seqlen; i++) {
				2461	int itemlen;
				2462	PyObject *item;
				2463
				2464	item = PySequence_GetItem(seq, i);
				2465	if (item == NULL)
				2466	goto onError;
				2467	if (!PyUnicode_Check(item)) {
				2468	PyObject *v;
				2469	v = PyUnicode_FromObject(item);
				2470	Py_DECREF(item);
				2471	item = v;
				2472	if (item == NULL)
				2473	goto onError;
				2474	}
				2475	itemlen = PyUnicode_GET_SIZE(item);
				2476	while (reslen + itemlen + seplen >= sz) {
				2477	if (_PyUnicode_Resize(res, sz*2))
				2478	goto onError;
				2479	sz *= 2;
				2480	p = PyUnicode_AS_UNICODE(res) + reslen;
				2481	}
				2482	if (i > 0) {
				2483	memcpy(p, sep, seplen * sizeof(Py_UNICODE));
				2484	p += seplen;
				2485	reslen += seplen;
				2486	}
				2487	memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
				2488	p += itemlen;
				2489	reslen += itemlen;
				2490	Py_DECREF(item);
				2491	}
				2492	if (_PyUnicode_Resize(res, reslen))
				2493	goto onError;
				2494
				2495	Py_XDECREF(separator);
				2496	return (PyObject *)res;
				2497
				2498	onError:
				2499	Py_XDECREF(separator);
				2500	Py_DECREF(res);
				2501	return NULL;
				2502	}
				2503
				2504	static
				2505	PyUnicodeObject pad(PyUnicodeObject self,
				2506	int left,
				2507	int right,
				2508	Py_UNICODE fill)
				2509	{
				2510	PyUnicodeObject *u;
				2511
				2512	if (left < 0)
				2513	left = 0;
				2514	if (right < 0)
				2515	right = 0;
				2516
				2517	if (left == 0 && right == 0) {
				2518	Py_INCREF(self);
				2519	return self;
				2520	}
				2521
				2522	u = _PyUnicode_New(left + self->length + right);
				2523	if (u) {
				2524	if (left)
				2525	Py_UNICODE_FILL(u->str, fill, left);
				2526	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2527	if (right)
				2528	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2529	}
				2530
				2531	return u;
				2532	}
				2533
				2534	#define SPLIT_APPEND(data, left, right) \
				2535	str = PyUnicode_FromUnicode(data + left, right - left); \
				2536	if (!str) \
				2537	goto onError; \
				2538	if (PyList_Append(list, str)) { \
				2539	Py_DECREF(str); \
				2540	goto onError; \
				2541	} \
				2542	else \
				2543	Py_DECREF(str);
				2544
				2545	static
				2546	PyObject split_whitespace(PyUnicodeObject self,
				2547	PyObject *list,
				2548	int maxcount)
				2549	{
				2550	register int i;
				2551	register int j;
				2552	int len = self->length;
				2553	PyObject *str;
				2554
				2555	for (i = j = 0; i < len; ) {
				2556	/* find a token */
				2557	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2558	i++;
				2559	j = i;
				2560	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2561	i++;
				2562	if (j < i) {
				2563	if (maxcount-- <= 0)
				2564	break;
				2565	SPLIT_APPEND(self->str, j, i);
				2566	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2567	i++;
				2568	j = i;
				2569	}
				2570	}
				2571	if (j < len) {
				2572	SPLIT_APPEND(self->str, j, len);
				2573	}
				2574	return list;
				2575
				2576	onError:
				2577	Py_DECREF(list);
				2578	return NULL;
				2579	}
				2580
				2581	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2582	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2583	{
				2584	register int i;
				2585	register int j;
				2586	int len;
				2587	PyObject *list;
				2588	PyObject *str;
				2589	Py_UNICODE *data;
				2590
				2591	string = PyUnicode_FromObject(string);
				2592	if (string == NULL)
				2593	return NULL;
				2594	data = PyUnicode_AS_UNICODE(string);
				2595	len = PyUnicode_GET_SIZE(string);
				2596
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2597	list = PyList_New(0);
				2598	if (!list)
				2599	goto onError;
				2600
				2601	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2602	int eol;
				2603
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2604	/* Find a line and append it */
				2605	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2606	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2607
				2608	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2609	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2610	if (i < len) {
				2611	if (data[i] == '\r' && i + 1 < len &&
				2612	data[i+1] == '\n')
				2613	i += 2;
				2614	else
				2615	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2616	if (keepends)
				2617	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2618	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2619	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2620	j = i;
				2621	}
				2622	if (j < len) {
				2623	SPLIT_APPEND(data, j, len);
				2624	}
				2625
				2626	Py_DECREF(string);
				2627	return list;
				2628
				2629	onError:
				2630	Py_DECREF(list);
				2631	Py_DECREF(string);
				2632	return NULL;
				2633	}
				2634
				2635	static
				2636	PyObject split_char(PyUnicodeObject self,
				2637	PyObject *list,
				2638	Py_UNICODE ch,
				2639	int maxcount)
				2640	{
				2641	register int i;
				2642	register int j;
				2643	int len = self->length;
				2644	PyObject *str;
				2645
				2646	for (i = j = 0; i < len; ) {
				2647	if (self->str[i] == ch) {
				2648	if (maxcount-- <= 0)
				2649	break;
				2650	SPLIT_APPEND(self->str, j, i);
				2651	i = j = i + 1;
				2652	} else
				2653	i++;
				2654	}
				2655	if (j <= len) {
				2656	SPLIT_APPEND(self->str, j, len);
				2657	}
				2658	return list;
				2659
				2660	onError:
				2661	Py_DECREF(list);
				2662	return NULL;
				2663	}
				2664
				2665	static
				2666	PyObject split_substring(PyUnicodeObject self,
				2667	PyObject *list,
				2668	PyUnicodeObject *substring,
				2669	int maxcount)
				2670	{
				2671	register int i;
				2672	register int j;
				2673	int len = self->length;
				2674	int sublen = substring->length;
				2675	PyObject *str;
				2676
				2677	for (i = j = 0; i < len - sublen; ) {
				2678	if (Py_UNICODE_MATCH(self, i, substring)) {
				2679	if (maxcount-- <= 0)
				2680	break;
				2681	SPLIT_APPEND(self->str, j, i);
				2682	i = j = i + sublen;
				2683	} else
				2684	i++;
				2685	}
				2686	if (j <= len) {
				2687	SPLIT_APPEND(self->str, j, len);
				2688	}
				2689	return list;
				2690
				2691	onError:
				2692	Py_DECREF(list);
				2693	return NULL;
				2694	}
				2695
				2696	#undef SPLIT_APPEND
				2697
				2698	static
				2699	PyObject split(PyUnicodeObject self,
				2700	PyUnicodeObject *substring,
				2701	int maxcount)
				2702	{
				2703	PyObject *list;
				2704
				2705	if (maxcount < 0)
				2706	maxcount = INT_MAX;
				2707
				2708	list = PyList_New(0);
				2709	if (!list)
				2710	return NULL;
				2711
				2712	if (substring == NULL)
				2713	return split_whitespace(self,list,maxcount);
				2714
				2715	else if (substring->length == 1)
				2716	return split_char(self,list,substring->str[0],maxcount);
				2717
				2718	else if (substring->length == 0) {
				2719	Py_DECREF(list);
				2720	PyErr_SetString(PyExc_ValueError, "empty separator");
				2721	return NULL;
				2722	}
				2723	else
				2724	return split_substring(self,list,substring,maxcount);
				2725	}
				2726
				2727	static
				2728	PyObject strip(PyUnicodeObject self,
				2729	int left,
				2730	int right)
				2731	{
				2732	Py_UNICODE *p = self->str;
				2733	int start = 0;
				2734	int end = self->length;
				2735
				2736	if (left)
				2737	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				2738	start++;
				2739
				2740	if (right)
				2741	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				2742	end--;
				2743
				2744	if (start == 0 && end == self->length) {
				2745	/* couldn't strip anything off, return original string */
				2746	Py_INCREF(self);
				2747	return (PyObject*) self;
				2748	}
				2749
				2750	return (PyObject*) PyUnicode_FromUnicode(
				2751	self->str + start,
				2752	end - start
				2753	);
				2754	}
				2755
				2756	static
				2757	PyObject replace(PyUnicodeObject self,
				2758	PyUnicodeObject *str1,
				2759	PyUnicodeObject *str2,
				2760	int maxcount)
				2761	{
				2762	PyUnicodeObject *u;
				2763
				2764	if (maxcount < 0)
				2765	maxcount = INT_MAX;
				2766
				2767	if (str1->length == 1 && str2->length == 1) {
				2768	int i;
				2769
				2770	/* replace characters */
				2771	if (!findchar(self->str, self->length, str1->str[0])) {
				2772	/* nothing to replace, return original string */
				2773	Py_INCREF(self);
				2774	u = self;
				2775	} else {
				2776	Py_UNICODE u1 = str1->str[0];
				2777	Py_UNICODE u2 = str2->str[0];
				2778
				2779	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
				2780	self->str,
				2781	self->length
				2782	);
				2783	if (u)
				2784	for (i = 0; i < u->length; i++)
				2785	if (u->str[i] == u1) {
				2786	if (--maxcount < 0)
				2787	break;
				2788	u->str[i] = u2;
				2789	}
				2790	}
				2791
				2792	} else {
				2793	int n, i;
				2794	Py_UNICODE *p;
				2795
				2796	/* replace strings */
				2797	n = count(self, 0, self->length, str1);
				2798	if (n > maxcount)
				2799	n = maxcount;
				2800	if (n == 0) {
				2801	/* nothing to replace, return original string */
				2802	Py_INCREF(self);
				2803	u = self;
				2804	} else {
				2805	u = _PyUnicode_New(
				2806	self->length + n * (str2->length - str1->length));
				2807	if (u) {
				2808	i = 0;
				2809	p = u->str;
				2810	while (i <= self->length - str1->length)
				2811	if (Py_UNICODE_MATCH(self, i, str1)) {
				2812	/* replace string segment */
				2813	Py_UNICODE_COPY(p, str2->str, str2->length);
				2814	p += str2->length;
				2815	i += str1->length;
				2816	if (--n <= 0) {
				2817	/* copy remaining part */
				2818	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				2819	break;
				2820	}
				2821	} else
				2822	*p++ = self->str[i++];
				2823	}
				2824	}
				2825	}
				2826
				2827	return (PyObject *) u;
				2828	}
				2829
				2830	/* --- Unicode Object Methods --------------------------------------------- */
				2831
				2832	static char title__doc__[] =
				2833	"S.title() -> unicode\n\
				2834	\n\
				2835	Return a titlecased version of S, i.e. words start with title case\n\
				2836	characters, all remaining cased characters have lower case.";
				2837
				2838	static PyObject*
				2839	unicode_title(PyUnicodeObject self, PyObject args)
				2840	{
				2841	if (!PyArg_NoArgs(args))
				2842	return NULL;
				2843	return fixup(self, fixtitle);
				2844	}
				2845
				2846	static char capitalize__doc__[] =
				2847	"S.capitalize() -> unicode\n\
				2848	\n\
				2849	Return a capitalized version of S, i.e. make the first character\n\
				2850	have upper case.";
				2851
				2852	static PyObject*
				2853	unicode_capitalize(PyUnicodeObject self, PyObject args)
				2854	{
				2855	if (!PyArg_NoArgs(args))
				2856	return NULL;
				2857	return fixup(self, fixcapitalize);
				2858	}
				2859
				2860	#if 0
				2861	static char capwords__doc__[] =
				2862	"S.capwords() -> unicode\n\
				2863	\n\
				2864	Apply .capitalize() to all words in S and return the result with\n\
				2865	normalized whitespace (all whitespace strings are replaced by ' ').";
				2866
				2867	static PyObject*
				2868	unicode_capwords(PyUnicodeObject self, PyObject args)
				2869	{
				2870	PyObject *list;
				2871	PyObject *item;
				2872	int i;
				2873
				2874	if (!PyArg_NoArgs(args))
				2875	return NULL;
				2876
				2877	/* Split into words */
				2878	list = split(self, NULL, -1);
				2879	if (!list)
				2880	return NULL;
				2881
				2882	/* Capitalize each word */
				2883	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				2884	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				2885	fixcapitalize);
				2886	if (item == NULL)
				2887	goto onError;
				2888	Py_DECREF(PyList_GET_ITEM(list, i));
				2889	PyList_SET_ITEM(list, i, item);
				2890	}
				2891
				2892	/* Join the words to form a new string */
				2893	item = PyUnicode_Join(NULL, list);
				2894
				2895	onError:
				2896	Py_DECREF(list);
				2897	return (PyObject *)item;
				2898	}
				2899	#endif
				2900
				2901	static char center__doc__[] =
				2902	"S.center(width) -> unicode\n\
				2903	\n\
				2904	Return S centered in a Unicode string of length width. Padding is done\n\
				2905	using spaces.";
				2906
				2907	static PyObject *
				2908	unicode_center(PyUnicodeObject self, PyObject args)
				2909	{
				2910	int marg, left;
				2911	int width;
				2912
				2913	if (!PyArg_ParseTuple(args, "i:center", &width))
				2914	return NULL;
				2915
				2916	if (self->length >= width) {
				2917	Py_INCREF(self);
				2918	return (PyObject*) self;
				2919	}
				2920
				2921	marg = width - self->length;
				2922	left = marg / 2 + (marg & width & 1);
				2923
				2924	return (PyObject*) pad(self, left, marg - left, ' ');
				2925	}
				2926
				2927	static int
				2928	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				2929	{
				2930	int len1, len2;
				2931	Py_UNICODE *s1 = str1->str;
				2932	Py_UNICODE *s2 = str2->str;
				2933
				2934	len1 = str1->length;
				2935	len2 = str2->length;
				2936
				2937	while (len1 > 0 && len2 > 0) {
				2938	int cmp = (s1++) - (s2++);
				2939	if (cmp)
				2940	/* This should make Christian happy! */
				2941	return (cmp < 0) ? -1 : (cmp != 0);
				2942	len1--, len2--;
				2943	}
				2944
				2945	return (len1 < len2) ? -1 : (len1 != len2);
				2946	}
				2947
				2948	int PyUnicode_Compare(PyObject *left,
				2949	PyObject *right)
				2950	{
				2951	PyUnicodeObject u = NULL, v = NULL;
				2952	int result;
				2953
				2954	/* Coerce the two arguments */
				2955	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				2956	if (u == NULL)
				2957	goto onError;
				2958	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				2959	if (v == NULL)
				2960	goto onError;
				2961
				2962	/* Shortcut for emtpy or interned objects */
				2963	if (v == u) {
				2964	Py_DECREF(u);
				2965	Py_DECREF(v);
				2966	return 0;
				2967	}
				2968
				2969	result = unicode_compare(u, v);
				2970
				2971	Py_DECREF(u);
				2972	Py_DECREF(v);
				2973	return result;
				2974
				2975	onError:
				2976	Py_XDECREF(u);
				2977	Py_XDECREF(v);
				2978	return -1;
				2979	}
				2980
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2981	int PyUnicode_Contains(PyObject *container,
				2982	PyObject *element)
				2983	{
				2984	PyUnicodeObject u = NULL, v = NULL;
				2985	int result;
				2986	register const Py_UNICODE p, e;
				2987	register Py_UNICODE ch;
				2988
				2989	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2990	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
				2991	if (v == NULL)
				2992	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2993	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				2994	if (u == NULL) {
				2995	Py_DECREF(v);
				2996	goto onError;
				2997	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	2998
				2999	/* Check v in u */
				3000	if (PyUnicode_GET_SIZE(v) != 1) {
				3001	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3002	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3003	goto onError;
				3004	}
				3005	ch = *PyUnicode_AS_UNICODE(v);
				3006	p = PyUnicode_AS_UNICODE(u);
				3007	e = p + PyUnicode_GET_SIZE(u);
				3008	result = 0;
				3009	while (p < e) {
				3010	if (*p++ == ch) {
				3011	result = 1;
				3012	break;
				3013	}
				3014	}
				3015
				3016	Py_DECREF(u);
				3017	Py_DECREF(v);
				3018	return result;
				3019
				3020	onError:
				3021	Py_XDECREF(u);
				3022	Py_XDECREF(v);
				3023	return -1;
				3024	}
				3025
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3026	/* Concat to string or Unicode object giving a new Unicode object. */
				3027
				3028	PyObject PyUnicode_Concat(PyObject left,
				3029	PyObject *right)
				3030	{
				3031	PyUnicodeObject u = NULL, v = NULL, *w;
				3032
				3033	/* Coerce the two arguments */
				3034	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3035	if (u == NULL)
				3036	goto onError;
				3037	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3038	if (v == NULL)
				3039	goto onError;
				3040
				3041	/* Shortcuts */
				3042	if (v == unicode_empty) {
				3043	Py_DECREF(v);
				3044	return (PyObject *)u;
				3045	}
				3046	if (u == unicode_empty) {
				3047	Py_DECREF(u);
				3048	return (PyObject *)v;
				3049	}
				3050
				3051	/* Concat the two Unicode strings */
				3052	w = _PyUnicode_New(u->length + v->length);
				3053	if (w == NULL)
				3054	goto onError;
				3055	Py_UNICODE_COPY(w->str, u->str, u->length);
				3056	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3057
				3058	Py_DECREF(u);
				3059	Py_DECREF(v);
				3060	return (PyObject *)w;
				3061
				3062	onError:
				3063	Py_XDECREF(u);
				3064	Py_XDECREF(v);
				3065	return NULL;
				3066	}
				3067
				3068	static char count__doc__[] =
				3069	"S.count(sub[, start[, end]]) -> int\n\
				3070	\n\
				3071	Return the number of occurrences of substring sub in Unicode string\n\
				3072	S[start:end]. Optional arguments start and end are\n\
				3073	interpreted as in slice notation.";
				3074
				3075	static PyObject *
				3076	unicode_count(PyUnicodeObject self, PyObject args)
				3077	{
				3078	PyUnicodeObject *substring;
				3079	int start = 0;
				3080	int end = INT_MAX;
				3081	PyObject *result;
				3082
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3083	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3084	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3085	return NULL;
				3086
				3087	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3088	(PyObject *)substring);
				3089	if (substring == NULL)
				3090	return NULL;
				3091
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3092	if (start < 0)
				3093	start += self->length;
				3094	if (start < 0)
				3095	start = 0;
				3096	if (end > self->length)
				3097	end = self->length;
				3098	if (end < 0)
				3099	end += self->length;
				3100	if (end < 0)
				3101	end = 0;
				3102
				3103	result = PyInt_FromLong((long) count(self, start, end, substring));
				3104
				3105	Py_DECREF(substring);
				3106	return result;
				3107	}
				3108
				3109	static char encode__doc__[] =
				3110	"S.encode([encoding[,errors]]) -> string\n\
				3111	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3112	Return an encoded string version of S. Default encoding is the current\n\
				3113	default string encoding. errors may be given to set a different error\n\
				3114	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3115	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3116
				3117	static PyObject *
				3118	unicode_encode(PyUnicodeObject self, PyObject args)
				3119	{
				3120	char *encoding = NULL;
				3121	char *errors = NULL;
				3122	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3123	return NULL;
				3124	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3125	}
				3126
				3127	static char expandtabs__doc__[] =
				3128	"S.expandtabs([tabsize]) -> unicode\n\
				3129	\n\
				3130	Return a copy of S where all tab characters are expanded using spaces.\n\
				3131	If tabsize is not given, a tab size of 8 characters is assumed.";
				3132
				3133	static PyObject*
				3134	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3135	{
				3136	Py_UNICODE *e;
				3137	Py_UNICODE *p;
				3138	Py_UNICODE *q;
				3139	int i, j;
				3140	PyUnicodeObject *u;
				3141	int tabsize = 8;
				3142
				3143	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3144	return NULL;
				3145
				3146	/* First pass: determine size of ouput string */
				3147	i = j = 0;
				3148	e = self->str + self->length;
				3149	for (p = self->str; p < e; p++)
				3150	if (*p == '\t') {
				3151	if (tabsize > 0)
				3152	j += tabsize - (j % tabsize);
				3153	}
				3154	else {
				3155	j++;
				3156	if (p == '\n' \|\| p == '\r') {
				3157	i += j;
				3158	j = 0;
				3159	}
				3160	}
				3161
				3162	/* Second pass: create output string and fill it */
				3163	u = _PyUnicode_New(i + j);
				3164	if (!u)
				3165	return NULL;
				3166
				3167	j = 0;
				3168	q = u->str;
				3169
				3170	for (p = self->str; p < e; p++)
				3171	if (*p == '\t') {
				3172	if (tabsize > 0) {
				3173	i = tabsize - (j % tabsize);
				3174	j += i;
				3175	while (i--)
				3176	*q++ = ' ';
				3177	}
				3178	}
				3179	else {
				3180	j++;
				3181	q++ = p;
				3182	if (p == '\n' \|\| p == '\r')
				3183	j = 0;
				3184	}
				3185
				3186	return (PyObject*) u;
				3187	}
				3188
				3189	static char find__doc__[] =
				3190	"S.find(sub [,start [,end]]) -> int\n\
				3191	\n\
				3192	Return the lowest index in S where substring sub is found,\n\
				3193	such that sub is contained within s[start,end]. Optional\n\
				3194	arguments start and end are interpreted as in slice notation.\n\
				3195	\n\
				3196	Return -1 on failure.";
				3197
				3198	static PyObject *
				3199	unicode_find(PyUnicodeObject self, PyObject args)
				3200	{
				3201	PyUnicodeObject *substring;
				3202	int start = 0;
				3203	int end = INT_MAX;
				3204	PyObject *result;
				3205
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3206	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3207	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3208	return NULL;
				3209	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3210	(PyObject *)substring);
				3211	if (substring == NULL)
				3212	return NULL;
				3213
				3214	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3215
				3216	Py_DECREF(substring);
				3217	return result;
				3218	}
				3219
				3220	static PyObject *
				3221	unicode_getitem(PyUnicodeObject *self, int index)
				3222	{
				3223	if (index < 0 \|\| index >= self->length) {
				3224	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3225	return NULL;
				3226	}
				3227
				3228	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3229	}
				3230
				3231	static long
				3232	unicode_hash(PyUnicodeObject *self)
				3233	{
				3234	long hash;
				3235	PyObject *utf8;
				3236
				3237	/* Since Unicode objects compare equal to their UTF-8 string
				3238	counterparts, they should also use the UTF-8 strings as basis
				3239	for their hash value. This is needed to assure that strings and
				3240	Unicode objects behave in the same way as dictionary
				3241	keys. Unfortunately, this costs some performance and also some
				3242	memory if the cached UTF-8 representation is not used later
				3243	on. */
				3244	if (self->hash != -1)
				3245	return self->hash;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	3246	utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3247	if (utf8 == NULL)
				3248	return -1;
				3249	hash = PyObject_Hash(utf8);
				3250	if (hash == -1)
				3251	return -1;
				3252	self->hash = hash;
				3253	return hash;
				3254	}
				3255
				3256	static char index__doc__[] =
				3257	"S.index(sub [,start [,end]]) -> int\n\
				3258	\n\
				3259	Like S.find() but raise ValueError when the substring is not found.";
				3260
				3261	static PyObject *
				3262	unicode_index(PyUnicodeObject self, PyObject args)
				3263	{
				3264	int result;
				3265	PyUnicodeObject *substring;
				3266	int start = 0;
				3267	int end = INT_MAX;
				3268
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3269	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3270	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3271	return NULL;
				3272
				3273	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3274	(PyObject *)substring);
				3275	if (substring == NULL)
				3276	return NULL;
				3277
				3278	result = findstring(self, substring, start, end, 1);
				3279
				3280	Py_DECREF(substring);
				3281	if (result < 0) {
				3282	PyErr_SetString(PyExc_ValueError, "substring not found");
				3283	return NULL;
				3284	}
				3285	return PyInt_FromLong(result);
				3286	}
				3287
				3288	static char islower__doc__[] =
				3289	"S.islower() -> int\n\
				3290	\n\
				3291	Return 1 if all cased characters in S are lowercase and there is\n\
				3292	at least one cased character in S, 0 otherwise.";
				3293
				3294	static PyObject*
				3295	unicode_islower(PyUnicodeObject self, PyObject args)
				3296	{
				3297	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3298	register const Py_UNICODE *e;
				3299	int cased;
				3300
				3301	if (!PyArg_NoArgs(args))
				3302	return NULL;
				3303
				3304	/* Shortcut for single character strings */
				3305	if (PyUnicode_GET_SIZE(self) == 1)
				3306	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3307
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3308	/* Special case for empty strings */
				3309	if (PyString_GET_SIZE(self) == 0)
				3310	return PyInt_FromLong(0);
				3311
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3312	e = p + PyUnicode_GET_SIZE(self);
				3313	cased = 0;
				3314	for (; p < e; p++) {
				3315	register const Py_UNICODE ch = *p;
				3316
				3317	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3318	return PyInt_FromLong(0);
				3319	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3320	cased = 1;
				3321	}
				3322	return PyInt_FromLong(cased);
				3323	}
				3324
				3325	static char isupper__doc__[] =
				3326	"S.isupper() -> int\n\
				3327	\n\
				3328	Return 1 if all cased characters in S are uppercase and there is\n\
				3329	at least one cased character in S, 0 otherwise.";
				3330
				3331	static PyObject*
				3332	unicode_isupper(PyUnicodeObject self, PyObject args)
				3333	{
				3334	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3335	register const Py_UNICODE *e;
				3336	int cased;
				3337
				3338	if (!PyArg_NoArgs(args))
				3339	return NULL;
				3340
				3341	/* Shortcut for single character strings */
				3342	if (PyUnicode_GET_SIZE(self) == 1)
				3343	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3344
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3345	/* Special case for empty strings */
				3346	if (PyString_GET_SIZE(self) == 0)
				3347	return PyInt_FromLong(0);
				3348
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3349	e = p + PyUnicode_GET_SIZE(self);
				3350	cased = 0;
				3351	for (; p < e; p++) {
				3352	register const Py_UNICODE ch = *p;
				3353
				3354	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3355	return PyInt_FromLong(0);
				3356	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3357	cased = 1;
				3358	}
				3359	return PyInt_FromLong(cased);
				3360	}
				3361
				3362	static char istitle__doc__[] =
				3363	"S.istitle() -> int\n\
				3364	\n\
				3365	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3366	may only follow uncased characters and lowercase characters only cased\n\
				3367	ones. Return 0 otherwise.";
				3368
				3369	static PyObject*
				3370	unicode_istitle(PyUnicodeObject self, PyObject args)
				3371	{
				3372	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3373	register const Py_UNICODE *e;
				3374	int cased, previous_is_cased;
				3375
				3376	if (!PyArg_NoArgs(args))
				3377	return NULL;
				3378
				3379	/* Shortcut for single character strings */
				3380	if (PyUnicode_GET_SIZE(self) == 1)
				3381	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3382	(Py_UNICODE_ISUPPER(*p) != 0));
				3383
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3384	/* Special case for empty strings */
				3385	if (PyString_GET_SIZE(self) == 0)
				3386	return PyInt_FromLong(0);
				3387
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3388	e = p + PyUnicode_GET_SIZE(self);
				3389	cased = 0;
				3390	previous_is_cased = 0;
				3391	for (; p < e; p++) {
				3392	register const Py_UNICODE ch = *p;
				3393
				3394	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3395	if (previous_is_cased)
				3396	return PyInt_FromLong(0);
				3397	previous_is_cased = 1;
				3398	cased = 1;
				3399	}
				3400	else if (Py_UNICODE_ISLOWER(ch)) {
				3401	if (!previous_is_cased)
				3402	return PyInt_FromLong(0);
				3403	previous_is_cased = 1;
				3404	cased = 1;
				3405	}
				3406	else
				3407	previous_is_cased = 0;
				3408	}
				3409	return PyInt_FromLong(cased);
				3410	}
				3411
				3412	static char isspace__doc__[] =
				3413	"S.isspace() -> int\n\
				3414	\n\
				3415	Return 1 if there are only whitespace characters in S,\n\
				3416	0 otherwise.";
				3417
				3418	static PyObject*
				3419	unicode_isspace(PyUnicodeObject self, PyObject args)
				3420	{
				3421	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3422	register const Py_UNICODE *e;
				3423
				3424	if (!PyArg_NoArgs(args))
				3425	return NULL;
				3426
				3427	/* Shortcut for single character strings */
				3428	if (PyUnicode_GET_SIZE(self) == 1 &&
				3429	Py_UNICODE_ISSPACE(*p))
				3430	return PyInt_FromLong(1);
				3431
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3432	/* Special case for empty strings */
				3433	if (PyString_GET_SIZE(self) == 0)
				3434	return PyInt_FromLong(0);
				3435
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3436	e = p + PyUnicode_GET_SIZE(self);
				3437	for (; p < e; p++) {
				3438	if (!Py_UNICODE_ISSPACE(*p))
				3439	return PyInt_FromLong(0);
				3440	}
				3441	return PyInt_FromLong(1);
				3442	}
				3443
				3444	static char isdecimal__doc__[] =
				3445	"S.isdecimal() -> int\n\
				3446	\n\
				3447	Return 1 if there are only decimal characters in S,\n\
				3448	0 otherwise.";
				3449
				3450	static PyObject*
				3451	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3452	{
				3453	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3454	register const Py_UNICODE *e;
				3455
				3456	if (!PyArg_NoArgs(args))
				3457	return NULL;
				3458
				3459	/* Shortcut for single character strings */
				3460	if (PyUnicode_GET_SIZE(self) == 1 &&
				3461	Py_UNICODE_ISDECIMAL(*p))
				3462	return PyInt_FromLong(1);
				3463
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3464	/* Special case for empty strings */
				3465	if (PyString_GET_SIZE(self) == 0)
				3466	return PyInt_FromLong(0);
				3467
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3468	e = p + PyUnicode_GET_SIZE(self);
				3469	for (; p < e; p++) {
				3470	if (!Py_UNICODE_ISDECIMAL(*p))
				3471	return PyInt_FromLong(0);
				3472	}
				3473	return PyInt_FromLong(1);
				3474	}
				3475
				3476	static char isdigit__doc__[] =
				3477	"S.isdigit() -> int\n\
				3478	\n\
				3479	Return 1 if there are only digit characters in S,\n\
				3480	0 otherwise.";
				3481
				3482	static PyObject*
				3483	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3484	{
				3485	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3486	register const Py_UNICODE *e;
				3487
				3488	if (!PyArg_NoArgs(args))
				3489	return NULL;
				3490
				3491	/* Shortcut for single character strings */
				3492	if (PyUnicode_GET_SIZE(self) == 1 &&
				3493	Py_UNICODE_ISDIGIT(*p))
				3494	return PyInt_FromLong(1);
				3495
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3496	/* Special case for empty strings */
				3497	if (PyString_GET_SIZE(self) == 0)
				3498	return PyInt_FromLong(0);
				3499
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3500	e = p + PyUnicode_GET_SIZE(self);
				3501	for (; p < e; p++) {
				3502	if (!Py_UNICODE_ISDIGIT(*p))
				3503	return PyInt_FromLong(0);
				3504	}
				3505	return PyInt_FromLong(1);
				3506	}
				3507
				3508	static char isnumeric__doc__[] =
				3509	"S.isnumeric() -> int\n\
				3510	\n\
				3511	Return 1 if there are only numeric characters in S,\n\
				3512	0 otherwise.";
				3513
				3514	static PyObject*
				3515	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				3516	{
				3517	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3518	register const Py_UNICODE *e;
				3519
				3520	if (!PyArg_NoArgs(args))
				3521	return NULL;
				3522
				3523	/* Shortcut for single character strings */
				3524	if (PyUnicode_GET_SIZE(self) == 1 &&
				3525	Py_UNICODE_ISNUMERIC(*p))
				3526	return PyInt_FromLong(1);
				3527
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3528	/* Special case for empty strings */
				3529	if (PyString_GET_SIZE(self) == 0)
				3530	return PyInt_FromLong(0);
				3531
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3532	e = p + PyUnicode_GET_SIZE(self);
				3533	for (; p < e; p++) {
				3534	if (!Py_UNICODE_ISNUMERIC(*p))
				3535	return PyInt_FromLong(0);
				3536	}
				3537	return PyInt_FromLong(1);
				3538	}
				3539
				3540	static char join__doc__[] =
				3541	"S.join(sequence) -> unicode\n\
				3542	\n\
				3543	Return a string which is the concatenation of the strings in the\n\
				3544	sequence. The separator between elements is S.";
				3545
				3546	static PyObject*
				3547	unicode_join(PyUnicodeObject self, PyObject args)
				3548	{
				3549	PyObject *data;
				3550	if (!PyArg_ParseTuple(args, "O:join", &data))
				3551	return NULL;
				3552
				3553	return PyUnicode_Join((PyObject *)self, data);
				3554	}
				3555
				3556	static int
				3557	unicode_length(PyUnicodeObject *self)
				3558	{
				3559	return self->length;
				3560	}
				3561
				3562	static char ljust__doc__[] =
				3563	"S.ljust(width) -> unicode\n\
				3564	\n\
				3565	Return S left justified in a Unicode string of length width. Padding is\n\
				3566	done using spaces.";
				3567
				3568	static PyObject *
				3569	unicode_ljust(PyUnicodeObject self, PyObject args)
				3570	{
				3571	int width;
				3572	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				3573	return NULL;
				3574
				3575	if (self->length >= width) {
				3576	Py_INCREF(self);
				3577	return (PyObject*) self;
				3578	}
				3579
				3580	return (PyObject*) pad(self, 0, width - self->length, ' ');
				3581	}
				3582
				3583	static char lower__doc__[] =
				3584	"S.lower() -> unicode\n\
				3585	\n\
				3586	Return a copy of the string S converted to lowercase.";
				3587
				3588	static PyObject*
				3589	unicode_lower(PyUnicodeObject self, PyObject args)
				3590	{
				3591	if (!PyArg_NoArgs(args))
				3592	return NULL;
				3593	return fixup(self, fixlower);
				3594	}
				3595
				3596	static char lstrip__doc__[] =
				3597	"S.lstrip() -> unicode\n\
				3598	\n\
				3599	Return a copy of the string S with leading whitespace removed.";
				3600
				3601	static PyObject *
				3602	unicode_lstrip(PyUnicodeObject self, PyObject args)
				3603	{
				3604	if (!PyArg_NoArgs(args))
				3605	return NULL;
				3606	return strip(self, 1, 0);
				3607	}
				3608
				3609	static PyObject*
				3610	unicode_repeat(PyUnicodeObject *str, int len)
				3611	{
				3612	PyUnicodeObject *u;
				3613	Py_UNICODE *p;
				3614
				3615	if (len < 0)
				3616	len = 0;
				3617
				3618	if (len == 1) {
				3619	/* no repeat, return original string */
				3620	Py_INCREF(str);
				3621	return (PyObject*) str;
				3622	}
				3623
				3624	u = _PyUnicode_New(len * str->length);
				3625	if (!u)
				3626	return NULL;
				3627
				3628	p = u->str;
				3629
				3630	while (len-- > 0) {
				3631	Py_UNICODE_COPY(p, str->str, str->length);
				3632	p += str->length;
				3633	}
				3634
				3635	return (PyObject*) u;
				3636	}
				3637
				3638	PyObject PyUnicode_Replace(PyObject obj,
				3639	PyObject *subobj,
				3640	PyObject *replobj,
				3641	int maxcount)
				3642	{
				3643	PyObject *self;
				3644	PyObject *str1;
				3645	PyObject *str2;
				3646	PyObject *result;
				3647
				3648	self = PyUnicode_FromObject(obj);
				3649	if (self == NULL)
				3650	return NULL;
				3651	str1 = PyUnicode_FromObject(subobj);
				3652	if (str1 == NULL) {
				3653	Py_DECREF(self);
				3654	return NULL;
				3655	}
				3656	str2 = PyUnicode_FromObject(replobj);
				3657	if (str2 == NULL) {
				3658	Py_DECREF(self);
				3659	Py_DECREF(str1);
				3660	return NULL;
				3661	}
				3662	result = replace((PyUnicodeObject *)self,
				3663	(PyUnicodeObject *)str1,
				3664	(PyUnicodeObject *)str2,
				3665	maxcount);
				3666	Py_DECREF(self);
				3667	Py_DECREF(str1);
				3668	Py_DECREF(str2);
				3669	return result;
				3670	}
				3671
				3672	static char replace__doc__[] =
				3673	"S.replace (old, new[, maxsplit]) -> unicode\n\
				3674	\n\
				3675	Return a copy of S with all occurrences of substring\n\
				3676	old replaced by new. If the optional argument maxsplit is\n\
				3677	given, only the first maxsplit occurrences are replaced.";
				3678
				3679	static PyObject*
				3680	unicode_replace(PyUnicodeObject self, PyObject args)
				3681	{
				3682	PyUnicodeObject *str1;
				3683	PyUnicodeObject *str2;
				3684	int maxcount = -1;
				3685	PyObject *result;
				3686
				3687	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				3688	return NULL;
				3689	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				3690	if (str1 == NULL)
				3691	return NULL;
				3692	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				3693	if (str2 == NULL)
				3694	return NULL;
				3695
				3696	result = replace(self, str1, str2, maxcount);
				3697
				3698	Py_DECREF(str1);
				3699	Py_DECREF(str2);
				3700	return result;
				3701	}
				3702
				3703	static
				3704	PyObject unicode_repr(PyObject unicode)
				3705	{
				3706	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				3707	PyUnicode_GET_SIZE(unicode),
				3708	1);
				3709	}
				3710
				3711	static char rfind__doc__[] =
				3712	"S.rfind(sub [,start [,end]]) -> int\n\
				3713	\n\
				3714	Return the highest index in S where substring sub is found,\n\
				3715	such that sub is contained within s[start,end]. Optional\n\
				3716	arguments start and end are interpreted as in slice notation.\n\
				3717	\n\
				3718	Return -1 on failure.";
				3719
				3720	static PyObject *
				3721	unicode_rfind(PyUnicodeObject self, PyObject args)
				3722	{
				3723	PyUnicodeObject *substring;
				3724	int start = 0;
				3725	int end = INT_MAX;
				3726	PyObject *result;
				3727
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3728	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				3729	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3730	return NULL;
				3731	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3732	(PyObject *)substring);
				3733	if (substring == NULL)
				3734	return NULL;
				3735
				3736	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				3737
				3738	Py_DECREF(substring);
				3739	return result;
				3740	}
				3741
				3742	static char rindex__doc__[] =
				3743	"S.rindex(sub [,start [,end]]) -> int\n\
				3744	\n\
				3745	Like S.rfind() but raise ValueError when the substring is not found.";
				3746
				3747	static PyObject *
				3748	unicode_rindex(PyUnicodeObject self, PyObject args)
				3749	{
				3750	int result;
				3751	PyUnicodeObject *substring;
				3752	int start = 0;
				3753	int end = INT_MAX;
				3754
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3755	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				3756	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3757	return NULL;
				3758	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3759	(PyObject *)substring);
				3760	if (substring == NULL)
				3761	return NULL;
				3762
				3763	result = findstring(self, substring, start, end, -1);
				3764
				3765	Py_DECREF(substring);
				3766	if (result < 0) {
				3767	PyErr_SetString(PyExc_ValueError, "substring not found");
				3768	return NULL;
				3769	}
				3770	return PyInt_FromLong(result);
				3771	}
				3772
				3773	static char rjust__doc__[] =
				3774	"S.rjust(width) -> unicode\n\
				3775	\n\
				3776	Return S right justified in a Unicode string of length width. Padding is\n\
				3777	done using spaces.";
				3778
				3779	static PyObject *
				3780	unicode_rjust(PyUnicodeObject self, PyObject args)
				3781	{
				3782	int width;
				3783	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				3784	return NULL;
				3785
				3786	if (self->length >= width) {
				3787	Py_INCREF(self);
				3788	return (PyObject*) self;
				3789	}
				3790
				3791	return (PyObject*) pad(self, width - self->length, 0, ' ');
				3792	}
				3793
				3794	static char rstrip__doc__[] =
				3795	"S.rstrip() -> unicode\n\
				3796	\n\
				3797	Return a copy of the string S with trailing whitespace removed.";
				3798
				3799	static PyObject *
				3800	unicode_rstrip(PyUnicodeObject self, PyObject args)
				3801	{
				3802	if (!PyArg_NoArgs(args))
				3803	return NULL;
				3804	return strip(self, 0, 1);
				3805	}
				3806
				3807	static PyObject*
				3808	unicode_slice(PyUnicodeObject *self, int start, int end)
				3809	{
				3810	/* standard clamping */
				3811	if (start < 0)
				3812	start = 0;
				3813	if (end < 0)
				3814	end = 0;
				3815	if (end > self->length)
				3816	end = self->length;
				3817	if (start == 0 && end == self->length) {
				3818	/* full slice, return original string */
				3819	Py_INCREF(self);
				3820	return (PyObject*) self;
				3821	}
				3822	if (start > end)
				3823	start = end;
				3824	/* copy slice */
				3825	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				3826	end - start);
				3827	}
				3828
				3829	PyObject PyUnicode_Split(PyObject s,
				3830	PyObject *sep,
				3831	int maxsplit)
				3832	{
				3833	PyObject *result;
				3834
				3835	s = PyUnicode_FromObject(s);
				3836	if (s == NULL)
				3837	return NULL;
				3838	if (sep != NULL) {
				3839	sep = PyUnicode_FromObject(sep);
				3840	if (sep == NULL) {
				3841	Py_DECREF(s);
				3842	return NULL;
				3843	}
				3844	}
				3845
				3846	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				3847
				3848	Py_DECREF(s);
				3849	Py_XDECREF(sep);
				3850	return result;
				3851	}
				3852
				3853	static char split__doc__[] =
				3854	"S.split([sep [,maxsplit]]) -> list of strings\n\
				3855	\n\
				3856	Return a list of the words in S, using sep as the\n\
				3857	delimiter string. If maxsplit is given, at most maxsplit\n\
				3858	splits are done. If sep is not specified, any whitespace string\n\
				3859	is a separator.";
				3860
				3861	static PyObject*
				3862	unicode_split(PyUnicodeObject self, PyObject args)
				3863	{
				3864	PyObject *substring = Py_None;
				3865	int maxcount = -1;
				3866
				3867	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				3868	return NULL;
				3869
				3870	if (substring == Py_None)
				3871	return split(self, NULL, maxcount);
				3872	else if (PyUnicode_Check(substring))
				3873	return split(self, (PyUnicodeObject *)substring, maxcount);
				3874	else
				3875	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				3876	}
				3877
				3878	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3879	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3880	\n\
				3881	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3882	Line breaks are not included in the resulting list unless keepends\n\
				3883	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3884
				3885	static PyObject*
				3886	unicode_splitlines(PyUnicodeObject self, PyObject args)
				3887	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3888	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3889
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3890	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3891	return NULL;
				3892
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3893	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3894	}
				3895
				3896	static
				3897	PyObject unicode_str(PyUnicodeObject self)
				3898	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3899	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3900	}
				3901
				3902	static char strip__doc__[] =
				3903	"S.strip() -> unicode\n\
				3904	\n\
				3905	Return a copy of S with leading and trailing whitespace removed.";
				3906
				3907	static PyObject *
				3908	unicode_strip(PyUnicodeObject self, PyObject args)
				3909	{
				3910	if (!PyArg_NoArgs(args))
				3911	return NULL;
				3912	return strip(self, 1, 1);
				3913	}
				3914
				3915	static char swapcase__doc__[] =
				3916	"S.swapcase() -> unicode\n\
				3917	\n\
				3918	Return a copy of S with uppercase characters converted to lowercase\n\
				3919	and vice versa.";
				3920
				3921	static PyObject*
				3922	unicode_swapcase(PyUnicodeObject self, PyObject args)
				3923	{
				3924	if (!PyArg_NoArgs(args))
				3925	return NULL;
				3926	return fixup(self, fixswapcase);
				3927	}
				3928
				3929	static char translate__doc__[] =
				3930	"S.translate(table) -> unicode\n\
				3931	\n\
				3932	Return a copy of the string S, where all characters have been mapped\n\
				3933	through the given translation table, which must be a mapping of\n\
				3934	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				3935	are left untouched. Characters mapped to None are deleted.";
				3936
				3937	static PyObject*
				3938	unicode_translate(PyUnicodeObject self, PyObject args)
				3939	{
				3940	PyObject *table;
				3941
				3942	if (!PyArg_ParseTuple(args, "O:translate", &table))
				3943	return NULL;
				3944	return PyUnicode_TranslateCharmap(self->str,
				3945	self->length,
				3946	table,
				3947	"ignore");
				3948	}
				3949
				3950	static char upper__doc__[] =
				3951	"S.upper() -> unicode\n\
				3952	\n\
				3953	Return a copy of S converted to uppercase.";
				3954
				3955	static PyObject*
				3956	unicode_upper(PyUnicodeObject self, PyObject args)
				3957	{
				3958	if (!PyArg_NoArgs(args))
				3959	return NULL;
				3960	return fixup(self, fixupper);
				3961	}
				3962
				3963	#if 0
				3964	static char zfill__doc__[] =
				3965	"S.zfill(width) -> unicode\n\
				3966	\n\
				3967	Pad a numeric string x with zeros on the left, to fill a field\n\
				3968	of the specified width. The string x is never truncated.";
				3969
				3970	static PyObject *
				3971	unicode_zfill(PyUnicodeObject self, PyObject args)
				3972	{
				3973	int fill;
				3974	PyUnicodeObject *u;
				3975
				3976	int width;
				3977	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				3978	return NULL;
				3979
				3980	if (self->length >= width) {
				3981	Py_INCREF(self);
				3982	return (PyObject*) self;
				3983	}
				3984
				3985	fill = width - self->length;
				3986
				3987	u = pad(self, fill, 0, '0');
				3988
				3989	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				3990	/* move sign to beginning of string */
				3991	u->str[0] = u->str[fill];
				3992	u->str[fill] = '0';
				3993	}
				3994
				3995	return (PyObject*) u;
				3996	}
				3997	#endif
				3998
				3999	#if 0
				4000	static PyObject*
				4001	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4002	{
				4003	if (!PyArg_NoArgs(args))
				4004	return NULL;
				4005	return PyInt_FromLong(unicode_freelist_size);
				4006	}
				4007	#endif
				4008
				4009	static char startswith__doc__[] =
				4010	"S.startswith(prefix[, start[, end]]) -> int\n\
				4011	\n\
				4012	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4013	optional start, test S beginning at that position. With optional end, stop\n\
				4014	comparing S at that position.";
				4015
				4016	static PyObject *
				4017	unicode_startswith(PyUnicodeObject *self,
				4018	PyObject *args)
				4019	{
				4020	PyUnicodeObject *substring;
				4021	int start = 0;
				4022	int end = INT_MAX;
				4023	PyObject *result;
				4024
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4025	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4026	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4027	return NULL;
				4028	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4029	(PyObject *)substring);
				4030	if (substring == NULL)
				4031	return NULL;
				4032
				4033	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4034
				4035	Py_DECREF(substring);
				4036	return result;
				4037	}
				4038
				4039
				4040	static char endswith__doc__[] =
				4041	"S.endswith(suffix[, start[, end]]) -> int\n\
				4042	\n\
				4043	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4044	optional start, test S beginning at that position. With optional end, stop\n\
				4045	comparing S at that position.";
				4046
				4047	static PyObject *
				4048	unicode_endswith(PyUnicodeObject *self,
				4049	PyObject *args)
				4050	{
				4051	PyUnicodeObject *substring;
				4052	int start = 0;
				4053	int end = INT_MAX;
				4054	PyObject *result;
				4055
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4056	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4057	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4058	return NULL;
				4059	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4060	(PyObject *)substring);
				4061	if (substring == NULL)
				4062	return NULL;
				4063
				4064	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4065
				4066	Py_DECREF(substring);
				4067	return result;
				4068	}
				4069
				4070
				4071	static PyMethodDef unicode_methods[] = {
				4072
				4073	/* Order is according to common usage: often used methods should
				4074	appear first, since lookup is done sequentially. */
				4075
				4076	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4077	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4078	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4079	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4080	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4081	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4082	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4083	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4084	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4085	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4086	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4087	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4088	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4089	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4090	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4091	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4092	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4093	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4094	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4095	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4096	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4097	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4098	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4099	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4100	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4101	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4102	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4103	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4104	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4105	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4106	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4107	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4108	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
				4109	#if 0
				4110	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4111	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4112	#endif
				4113
				4114	#if 0
				4115	/* This one is just used for debugging the implementation. */
				4116	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4117	#endif
				4118
				4119	{NULL, NULL}
				4120	};
				4121
				4122	static PyObject *
				4123	unicode_getattr(PyUnicodeObject self, char name)
				4124	{
				4125	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4126	}
				4127
				4128	static PySequenceMethods unicode_as_sequence = {
				4129	(inquiry) unicode_length, /* sq_length */
				4130	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4131	(intargfunc) unicode_repeat, /* sq_repeat */
				4132	(intargfunc) unicode_getitem, /* sq_item */
				4133	(intintargfunc) unicode_slice, /* sq_slice */
				4134	0, /* sq_ass_item */
				4135	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4136	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4137	};
				4138
				4139	static int
				4140	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4141	int index,
				4142	const void **ptr)
				4143	{
				4144	if (index != 0) {
				4145	PyErr_SetString(PyExc_SystemError,
				4146	"accessing non-existent unicode segment");
				4147	return -1;
				4148	}
				4149	ptr = (void ) self->str;
				4150	return PyUnicode_GET_DATA_SIZE(self);
				4151	}
				4152
				4153	static int
				4154	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4155	const void **ptr)
				4156	{
				4157	PyErr_SetString(PyExc_TypeError,
				4158	"cannot use unicode as modifyable buffer");
				4159	return -1;
				4160	}
				4161
				4162	static int
				4163	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4164	int *lenp)
				4165	{
				4166	if (lenp)
				4167	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4168	return 1;
				4169	}
				4170
				4171	static int
				4172	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4173	int index,
				4174	const void **ptr)
				4175	{
				4176	PyObject *str;
				4177
				4178	if (index != 0) {
				4179	PyErr_SetString(PyExc_SystemError,
				4180	"accessing non-existent unicode segment");
				4181	return -1;
				4182	}
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	4183	str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4184	if (str == NULL)
				4185	return -1;
				4186	ptr = (void ) PyString_AS_STRING(str);
				4187	return PyString_GET_SIZE(str);
				4188	}
				4189
				4190	/* Helpers for PyUnicode_Format() */
				4191
				4192	static PyObject *
				4193	getnextarg(args, arglen, p_argidx)
				4194	PyObject *args;
				4195	int arglen;
				4196	int *p_argidx;
				4197	{
				4198	int argidx = *p_argidx;
				4199	if (argidx < arglen) {
				4200	(*p_argidx)++;
				4201	if (arglen < 0)
				4202	return args;
				4203	else
				4204	return PyTuple_GetItem(args, argidx);
				4205	}
				4206	PyErr_SetString(PyExc_TypeError,
				4207	"not enough arguments for format string");
				4208	return NULL;
				4209	}
				4210
				4211	#define F_LJUST (1<<0)
				4212	#define F_SIGN (1<<1)
				4213	#define F_BLANK (1<<2)
				4214	#define F_ALT (1<<3)
				4215	#define F_ZERO (1<<4)
				4216
				4217	static
				4218	#ifdef HAVE_STDARG_PROTOTYPES
				4219	int usprintf(register Py_UNICODE buffer, char format, ...)
				4220	#else
				4221	int usprintf(va_alist) va_dcl
				4222	#endif
				4223	{
				4224	register int i;
				4225	int len;
				4226	va_list va;
				4227	char *charbuffer;
				4228	#ifdef HAVE_STDARG_PROTOTYPES
				4229	va_start(va, format);
				4230	#else
				4231	Py_UNICODE *args;
				4232	char *format;
				4233
				4234	va_start(va);
				4235	buffer = va_arg(va, Py_UNICODE *);
				4236	format = va_arg(va, char *);
				4237	#endif
				4238
				4239	/* First, format the string as char array, then expand to Py_UNICODE
				4240	array. */
				4241	charbuffer = (char *)buffer;
				4242	len = vsprintf(charbuffer, format, va);
				4243	for (i = len - 1; i >= 0; i--)
				4244	buffer[i] = (Py_UNICODE) charbuffer[i];
				4245
				4246	va_end(va);
				4247	return len;
				4248	}
				4249
				4250	static int
				4251	formatfloat(Py_UNICODE *buf,
				4252	int flags,
				4253	int prec,
				4254	int type,
				4255	PyObject *v)
				4256	{
				4257	char fmt[20];
				4258	double x;
				4259
				4260	x = PyFloat_AsDouble(v);
				4261	if (x == -1.0 && PyErr_Occurred())
				4262	return -1;
				4263	if (prec < 0)
				4264	prec = 6;
				4265	if (prec > 50)
				4266	prec = 50; /* Arbitrary limitation */
				4267	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4268	type = 'g';
				4269	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
				4270	return usprintf(buf, fmt, x);
				4271	}
				4272
				4273	static int
				4274	formatint(Py_UNICODE *buf,
				4275	int flags,
				4276	int prec,
				4277	int type,
				4278	PyObject *v)
				4279	{
				4280	char fmt[20];
				4281	long x;
				4282
				4283	x = PyInt_AsLong(v);
				4284	if (x == -1 && PyErr_Occurred())
				4285	return -1;
				4286	if (prec < 0)
				4287	prec = 1;
				4288	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
				4289	return usprintf(buf, fmt, x);
				4290	}
				4291
				4292	static int
				4293	formatchar(Py_UNICODE *buf,
				4294	PyObject *v)
				4295	{
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4296	if (PyUnicode_Check(v)) {
				4297	if (PyUnicode_GET_SIZE(v) != 1)
				4298	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4299	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4300	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4301
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4302	else if (PyString_Check(v)) {
				4303	if (PyString_GET_SIZE(v) != 1)
				4304	goto onError;
				4305	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4306	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4307
				4308	else {
				4309	/* Integer input truncated to a character */
				4310	long x;
				4311	x = PyInt_AsLong(v);
				4312	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4313	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4314	buf[0] = (char) x;
				4315	}
				4316	buf[1] = '\0';
				4317	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4318
				4319	onError:
				4320	PyErr_SetString(PyExc_TypeError,
				4321	"%c requires int or char");
				4322	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4323	}
				4324
				4325	PyObject PyUnicode_Format(PyObject format,
				4326	PyObject *args)
				4327	{
				4328	Py_UNICODE fmt, res;
				4329	int fmtcnt, rescnt, reslen, arglen, argidx;
				4330	int args_owned = 0;
				4331	PyUnicodeObject *result = NULL;
				4332	PyObject *dict = NULL;
				4333	PyObject *uformat;
				4334
				4335	if (format == NULL \|\| args == NULL) {
				4336	PyErr_BadInternalCall();
				4337	return NULL;
				4338	}
				4339	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4340	if (uformat == NULL)
				4341	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4342	fmt = PyUnicode_AS_UNICODE(uformat);
				4343	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4344
				4345	reslen = rescnt = fmtcnt + 100;
				4346	result = _PyUnicode_New(reslen);
				4347	if (result == NULL)
				4348	goto onError;
				4349	res = PyUnicode_AS_UNICODE(result);
				4350
				4351	if (PyTuple_Check(args)) {
				4352	arglen = PyTuple_Size(args);
				4353	argidx = 0;
				4354	}
				4355	else {
				4356	arglen = -1;
				4357	argidx = -2;
				4358	}
				4359	if (args->ob_type->tp_as_mapping)
				4360	dict = args;
				4361
				4362	while (--fmtcnt >= 0) {
				4363	if (*fmt != '%') {
				4364	if (--rescnt < 0) {
				4365	rescnt = fmtcnt + 100;
				4366	reslen += rescnt;
				4367	if (_PyUnicode_Resize(result, reslen) < 0)
				4368	return NULL;
				4369	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4370	--rescnt;
				4371	}
				4372	res++ = fmt++;
				4373	}
				4374	else {
				4375	/* Got a format specifier */
				4376	int flags = 0;
				4377	int width = -1;
				4378	int prec = -1;
				4379	int size = 0;
				4380	Py_UNICODE c = '\0';
				4381	Py_UNICODE fill;
				4382	PyObject *v = NULL;
				4383	PyObject *temp = NULL;
				4384	Py_UNICODE *buf;
				4385	Py_UNICODE sign;
				4386	int len;
				4387	Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
				4388
				4389	fmt++;
				4390	if (*fmt == '(') {
				4391	Py_UNICODE *keystart;
				4392	int keylen;
				4393	PyObject *key;
				4394	int pcount = 1;
				4395
				4396	if (dict == NULL) {
				4397	PyErr_SetString(PyExc_TypeError,
				4398	"format requires a mapping");
				4399	goto onError;
				4400	}
				4401	++fmt;
				4402	--fmtcnt;
				4403	keystart = fmt;
				4404	/* Skip over balanced parentheses */
				4405	while (pcount > 0 && --fmtcnt >= 0) {
				4406	if (*fmt == ')')
				4407	--pcount;
				4408	else if (*fmt == '(')
				4409	++pcount;
				4410	fmt++;
				4411	}
				4412	keylen = fmt - keystart - 1;
				4413	if (fmtcnt < 0 \|\| pcount > 0) {
				4414	PyErr_SetString(PyExc_ValueError,
				4415	"incomplete format key");
				4416	goto onError;
				4417	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4418	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4419	then looked up since Python uses strings to hold
				4420	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4421	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4422	key = PyUnicode_EncodeUTF8(keystart,
				4423	keylen,
				4424	NULL);
				4425	if (key == NULL)
				4426	goto onError;
				4427	if (args_owned) {
				4428	Py_DECREF(args);
				4429	args_owned = 0;
				4430	}
				4431	args = PyObject_GetItem(dict, key);
				4432	Py_DECREF(key);
				4433	if (args == NULL) {
				4434	goto onError;
				4435	}
				4436	args_owned = 1;
				4437	arglen = -1;
				4438	argidx = -2;
				4439	}
				4440	while (--fmtcnt >= 0) {
				4441	switch (c = *fmt++) {
				4442	case '-': flags \|= F_LJUST; continue;
				4443	case '+': flags \|= F_SIGN; continue;
				4444	case ' ': flags \|= F_BLANK; continue;
				4445	case '#': flags \|= F_ALT; continue;
				4446	case '0': flags \|= F_ZERO; continue;
				4447	}
				4448	break;
				4449	}
				4450	if (c == '*') {
				4451	v = getnextarg(args, arglen, &argidx);
				4452	if (v == NULL)
				4453	goto onError;
				4454	if (!PyInt_Check(v)) {
				4455	PyErr_SetString(PyExc_TypeError,
				4456	"* wants int");
				4457	goto onError;
				4458	}
				4459	width = PyInt_AsLong(v);
				4460	if (width < 0) {
				4461	flags \|= F_LJUST;
				4462	width = -width;
				4463	}
				4464	if (--fmtcnt >= 0)
				4465	c = *fmt++;
				4466	}
				4467	else if (c >= '0' && c <= '9') {
				4468	width = c - '0';
				4469	while (--fmtcnt >= 0) {
				4470	c = *fmt++;
				4471	if (c < '0' \|\| c > '9')
				4472	break;
				4473	if ((width*10) / 10 != width) {
				4474	PyErr_SetString(PyExc_ValueError,
				4475	"width too big");
				4476	goto onError;
				4477	}
				4478	width = width*10 + (c - '0');
				4479	}
				4480	}
				4481	if (c == '.') {
				4482	prec = 0;
				4483	if (--fmtcnt >= 0)
				4484	c = *fmt++;
				4485	if (c == '*') {
				4486	v = getnextarg(args, arglen, &argidx);
				4487	if (v == NULL)
				4488	goto onError;
				4489	if (!PyInt_Check(v)) {
				4490	PyErr_SetString(PyExc_TypeError,
				4491	"* wants int");
				4492	goto onError;
				4493	}
				4494	prec = PyInt_AsLong(v);
				4495	if (prec < 0)
				4496	prec = 0;
				4497	if (--fmtcnt >= 0)
				4498	c = *fmt++;
				4499	}
				4500	else if (c >= '0' && c <= '9') {
				4501	prec = c - '0';
				4502	while (--fmtcnt >= 0) {
				4503	c = Py_CHARMASK(*fmt++);
				4504	if (c < '0' \|\| c > '9')
				4505	break;
				4506	if ((prec*10) / 10 != prec) {
				4507	PyErr_SetString(PyExc_ValueError,
				4508	"prec too big");
				4509	goto onError;
				4510	}
				4511	prec = prec*10 + (c - '0');
				4512	}
				4513	}
				4514	} /* prec */
				4515	if (fmtcnt >= 0) {
				4516	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
				4517	size = c;
				4518	if (--fmtcnt >= 0)
				4519	c = *fmt++;
				4520	}
				4521	}
				4522	if (fmtcnt < 0) {
				4523	PyErr_SetString(PyExc_ValueError,
				4524	"incomplete format");
				4525	goto onError;
				4526	}
				4527	if (c != '%') {
				4528	v = getnextarg(args, arglen, &argidx);
				4529	if (v == NULL)
				4530	goto onError;
				4531	}
				4532	sign = 0;
				4533	fill = ' ';
				4534	switch (c) {
				4535
				4536	case '%':
				4537	buf = tmpbuf;
				4538	buf[0] = '%';
				4539	len = 1;
				4540	break;
				4541
				4542	case 's':
				4543	case 'r':
				4544	if (PyUnicode_Check(v) && c == 's') {
				4545	temp = v;
				4546	Py_INCREF(temp);
				4547	}
				4548	else {
				4549	PyObject *unicode;
				4550	if (c == 's')
				4551	temp = PyObject_Str(v);
				4552	else
				4553	temp = PyObject_Repr(v);
				4554	if (temp == NULL)
				4555	goto onError;
				4556	if (!PyString_Check(temp)) {
				4557	/* XXX Note: this should never happen, since
				4558	PyObject_Repr() and PyObject_Str() assure
				4559	this */
				4560	Py_DECREF(temp);
				4561	PyErr_SetString(PyExc_TypeError,
				4562	"%s argument has non-string str()");
				4563	goto onError;
				4564	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4565	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4566	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4567	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4568	"strict");
				4569	Py_DECREF(temp);
				4570	temp = unicode;
				4571	if (temp == NULL)
				4572	goto onError;
				4573	}
				4574	buf = PyUnicode_AS_UNICODE(temp);
				4575	len = PyUnicode_GET_SIZE(temp);
				4576	if (prec >= 0 && len > prec)
				4577	len = prec;
				4578	break;
				4579
				4580	case 'i':
				4581	case 'd':
				4582	case 'u':
				4583	case 'o':
				4584	case 'x':
				4585	case 'X':
				4586	if (c == 'i')
				4587	c = 'd';
				4588	buf = tmpbuf;
				4589	len = formatint(buf, flags, prec, c, v);
				4590	if (len < 0)
				4591	goto onError;
				4592	sign = (c == 'd');
				4593	if (flags & F_ZERO) {
				4594	fill = '0';
				4595	if ((flags&F_ALT) &&
				4596	(c == 'x' \|\| c == 'X') &&
				4597	buf[0] == '0' && buf[1] == c) {
				4598	res++ = buf++;
				4599	res++ = buf++;
				4600	rescnt -= 2;
				4601	len -= 2;
				4602	width -= 2;
				4603	if (width < 0)
				4604	width = 0;
				4605	}
				4606	}
				4607	break;
				4608
				4609	case 'e':
				4610	case 'E':
				4611	case 'f':
				4612	case 'g':
				4613	case 'G':
				4614	buf = tmpbuf;
				4615	len = formatfloat(buf, flags, prec, c, v);
				4616	if (len < 0)
				4617	goto onError;
				4618	sign = 1;
				4619	if (flags&F_ZERO)
				4620	fill = '0';
				4621	break;
				4622
				4623	case 'c':
				4624	buf = tmpbuf;
				4625	len = formatchar(buf, v);
				4626	if (len < 0)
				4627	goto onError;
				4628	break;
				4629
				4630	default:
				4631	PyErr_Format(PyExc_ValueError,
				4632	"unsupported format character '%c' (0x%x)",
				4633	c, c);
				4634	goto onError;
				4635	}
				4636	if (sign) {
				4637	if (buf == '-' \|\| buf == '+') {
				4638	sign = *buf++;
				4639	len--;
				4640	}
				4641	else if (flags & F_SIGN)
				4642	sign = '+';
				4643	else if (flags & F_BLANK)
				4644	sign = ' ';
				4645	else
				4646	sign = 0;
				4647	}
				4648	if (width < len)
				4649	width = len;
				4650	if (rescnt < width + (sign != 0)) {
				4651	reslen -= rescnt;
				4652	rescnt = width + fmtcnt + 100;
				4653	reslen += rescnt;
				4654	if (_PyUnicode_Resize(result, reslen) < 0)
				4655	return NULL;
				4656	res = PyUnicode_AS_UNICODE(result)
				4657	+ reslen - rescnt;
				4658	}
				4659	if (sign) {
				4660	if (fill != ' ')
				4661	*res++ = sign;
				4662	rescnt--;
				4663	if (width > len)
				4664	width--;
				4665	}
				4666	if (width > len && !(flags & F_LJUST)) {
				4667	do {
				4668	--rescnt;
				4669	*res++ = fill;
				4670	} while (--width > len);
				4671	}
				4672	if (sign && fill == ' ')
				4673	*res++ = sign;
				4674	memcpy(res, buf, len * sizeof(Py_UNICODE));
				4675	res += len;
				4676	rescnt -= len;
				4677	while (--width >= len) {
				4678	--rescnt;
				4679	*res++ = ' ';
				4680	}
				4681	if (dict && (argidx < arglen) && c != '%') {
				4682	PyErr_SetString(PyExc_TypeError,
				4683	"not all arguments converted");
				4684	goto onError;
				4685	}
				4686	Py_XDECREF(temp);
				4687	} /* '%' */
				4688	} /* until end */
				4689	if (argidx < arglen && !dict) {
				4690	PyErr_SetString(PyExc_TypeError,
				4691	"not all arguments converted");
				4692	goto onError;
				4693	}
				4694
				4695	if (args_owned) {
				4696	Py_DECREF(args);
				4697	}
				4698	Py_DECREF(uformat);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4699	if (_PyUnicode_Resize(result, reslen - rescnt))
				4700	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4701	return (PyObject *)result;
				4702
				4703	onError:
				4704	Py_XDECREF(result);
				4705	Py_DECREF(uformat);
				4706	if (args_owned) {
				4707	Py_DECREF(args);
				4708	}
				4709	return NULL;
				4710	}
				4711
				4712	static PyBufferProcs unicode_as_buffer = {
				4713	(getreadbufferproc) unicode_buffer_getreadbuf,
				4714	(getwritebufferproc) unicode_buffer_getwritebuf,
				4715	(getsegcountproc) unicode_buffer_getsegcount,
				4716	(getcharbufferproc) unicode_buffer_getcharbuf,
				4717	};
				4718
				4719	PyTypeObject PyUnicode_Type = {
				4720	PyObject_HEAD_INIT(&PyType_Type)
				4721	0, /* ob_size */
				4722	"unicode", /* tp_name */
				4723	sizeof(PyUnicodeObject), /* tp_size */
				4724	0, /* tp_itemsize */
				4725	/* Slots */
				4726	(destructor)_PyUnicode_Free, /* tp_dealloc */
				4727	0, /* tp_print */
				4728	(getattrfunc)unicode_getattr, /* tp_getattr */
				4729	0, /* tp_setattr */
				4730	(cmpfunc) unicode_compare, /* tp_compare */
				4731	(reprfunc) unicode_repr, /* tp_repr */
				4732	0, /* tp_as_number */
				4733	&unicode_as_sequence, /* tp_as_sequence */
				4734	0, /* tp_as_mapping */
				4735	(hashfunc) unicode_hash, /* tp_hash*/
				4736	0, /* tp_call*/
				4737	(reprfunc) unicode_str, /* tp_str */
				4738	(getattrofunc) NULL, /* tp_getattro */
				4739	(setattrofunc) NULL, /* tp_setattro */
				4740	&unicode_as_buffer, /* tp_as_buffer */
				4741	Py_TPFLAGS_DEFAULT, /* tp_flags */
				4742	};
				4743
				4744	/* Initialize the Unicode implementation */
				4745
				4746	void _PyUnicode_Init()
				4747	{
				4748	/* Doublecheck the configuration... */
				4749	if (sizeof(Py_UNICODE) != 2)
				4750	Py_FatalError("Unicode configuration error: "
				4751	"sizeof(Py_UNICODE) != 2 bytes");
				4752
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4753	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4754	unicode_freelist = NULL;
				4755	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4756	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	4757	strcpy(unicode_default_encoding, "ascii");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4758	}
				4759
				4760	/* Finalize the Unicode implementation */
				4761
				4762	void
				4763	_PyUnicode_Fini()
				4764	{
				4765	PyUnicodeObject *u = unicode_freelist;
				4766
				4767	while (u != NULL) {
				4768	PyUnicodeObject *v = u;
				4769	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4770	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4771	PyMem_DEL(v->str);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	4772	Py_XDECREF(v->utf8str);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	4773	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4774	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4775	unicode_freelist = NULL;
				4776	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4777	Py_XDECREF(unicode_empty);
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4778	unicode_empty = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4779	}