Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 172c61caba05776eee2ad57f0ad1c0d5fae983de [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	401	const char *s;
				402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	405
				406	if (obj == NULL) {
				407	PyErr_BadInternalCall();
				408	return NULL;
				409	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410
				411	/* Coerce object */
				412	if (PyInstance_Check(obj)) {
				413	PyObject *func;
				414	func = PyObject_GetAttrString(obj, "__str__");
				415	if (func == NULL) {
				416	PyErr_SetString(PyExc_TypeError,
				417	"coercing to Unicode: instance doesn't define __str__");
				418	return NULL;
				419	}
				420	obj = PyEval_CallObject(func, NULL);
				421	Py_DECREF(func);
				422	if (obj == NULL)
				423	return NULL;
				424	owned = 1;
				425	}
				426	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	428	v = obj;
				429	if (encoding) {
				430	PyErr_SetString(PyExc_TypeError,
				431	"decoding Unicode is not supported");
				432	return NULL;
				433	}
				434	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	435	}
				436	else if (PyString_Check(obj)) {
				437	s = PyString_AS_STRING(obj);
				438	len = PyString_GET_SIZE(obj);
				439	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	440	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				441	/* Overwrite the error message with something more useful in
				442	case of a TypeError. */
				443	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	444	PyErr_Format(PyExc_TypeError,
				445	"coercing to Unicode: need string or buffer, "
				446	"%.80s found",
				447	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	448	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	449	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	450
				451	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452	if (len == 0) {
				453	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	454	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	455	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	456	else
				457	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	458
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	459	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	460	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	461	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	462	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	463	return v;
				464
				465	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	466	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	467	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	468	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
				471
				472	PyObject PyUnicode_Decode(const char s,
				473	int size,
				474	const char *encoding,
				475	const char *errors)
				476	{
				477	PyObject buffer = NULL, unicode;
				478
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	479	if (encoding == NULL)
				480	encoding = PyUnicode_GetDefaultEncoding();
				481
				482	/* Shortcuts for common default encodings */
				483	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	485	else if (strcmp(encoding, "latin-1") == 0)
				486	return PyUnicode_DecodeLatin1(s, size, errors);
				487	else if (strcmp(encoding, "ascii") == 0)
				488	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	489
				490	/* Decode via the codec registry */
				491	buffer = PyBuffer_FromMemory((void *)s, size);
				492	if (buffer == NULL)
				493	goto onError;
				494	unicode = PyCodec_Decode(buffer, encoding, errors);
				495	if (unicode == NULL)
				496	goto onError;
				497	if (!PyUnicode_Check(unicode)) {
				498	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	499	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	500	unicode->ob_type->tp_name);
				501	Py_DECREF(unicode);
				502	goto onError;
				503	}
				504	Py_DECREF(buffer);
				505	return unicode;
				506
				507	onError:
				508	Py_XDECREF(buffer);
				509	return NULL;
				510	}
				511
				512	PyObject PyUnicode_Encode(const Py_UNICODE s,
				513	int size,
				514	const char *encoding,
				515	const char *errors)
				516	{
				517	PyObject v, unicode;
				518
				519	unicode = PyUnicode_FromUnicode(s, size);
				520	if (unicode == NULL)
				521	return NULL;
				522	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				523	Py_DECREF(unicode);
				524	return v;
				525	}
				526
				527	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject *v;
				532
				533	if (!PyUnicode_Check(unicode)) {
				534	PyErr_BadArgument();
				535	goto onError;
				536	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	537
				538	if (encoding == NULL)
				539	encoding = PyUnicode_GetDefaultEncoding();
				540
				541	/* Shortcuts for common default encodings */
				542	if (errors == NULL) {
				543	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	544	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	545	else if (strcmp(encoding, "latin-1") == 0)
				546	return PyUnicode_AsLatin1String(unicode);
				547	else if (strcmp(encoding, "ascii") == 0)
				548	return PyUnicode_AsASCIIString(unicode);
				549	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	550
				551	/* Encode via the codec registry */
				552	v = PyCodec_Encode(unicode, encoding, errors);
				553	if (v == NULL)
				554	goto onError;
				555	/* XXX Should we really enforce this ? */
				556	if (!PyString_Check(v)) {
				557	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	558	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	v->ob_type->tp_name);
				560	Py_DECREF(v);
				561	goto onError;
				562	}
				563	return v;
				564
				565	onError:
				566	return NULL;
				567	}
				568
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	569	/* Return a Python string holding the default encoded value of the
				570	Unicode object.
				571
				572	The resulting string is cached in the Unicode object for subsequent
				573	usage by this function. The cached version is needed to implement
				574	the character buffer interface and will live (at least) as long as
				575	the Unicode object itself.
				576
				577	The refcount of the string is not incremented.
				578
				579	* Exported for internal use by the interpreter only !!! *
				580
				581	*/
				582
				583	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				584	const char *errors)
				585	{
				586	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				587
				588	if (v)
				589	return v;
				590	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				591	if (v && errors == NULL)
				592	((PyUnicodeObject *)unicode)->defenc = v;
				593	return v;
				594	}
				595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	596	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_AS_UNICODE(unicode);
				603
				604	onError:
				605	return NULL;
				606	}
				607
				608	int PyUnicode_GetSize(PyObject *unicode)
				609	{
				610	if (!PyUnicode_Check(unicode)) {
				611	PyErr_BadArgument();
				612	goto onError;
				613	}
				614	return PyUnicode_GET_SIZE(unicode);
				615
				616	onError:
				617	return -1;
				618	}
				619
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	620	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	621	{
				622	return unicode_default_encoding;
				623	}
				624
				625	int PyUnicode_SetDefaultEncoding(const char *encoding)
				626	{
				627	PyObject *v;
				628
				629	/* Make sure the encoding is valid. As side effect, this also
				630	loads the encoding into the codec registry cache. */
				631	v = _PyCodec_Lookup(encoding);
				632	if (v == NULL)
				633	goto onError;
				634	Py_DECREF(v);
				635	strncpy(unicode_default_encoding,
				636	encoding,
				637	sizeof(unicode_default_encoding));
				638	return 0;
				639
				640	onError:
				641	return -1;
				642	}
				643
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	644	/* --- UTF-8 Codec -------------------------------------------------------- */
				645
				646	static
				647	char utf8_code_length[256] = {
				648	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				649	illegal prefix. see RFC 2279 for details */
				650	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				651	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				652	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				653	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				654	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				655	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				656	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				657	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				658	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				659	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				660	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				661	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				662	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				663	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				664	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				665	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				666	};
				667
				668	static
				669	int utf8_decoding_error(const char **source,
				670	Py_UNICODE **dest,
				671	const char *errors,
				672	const char *details)
				673	{
				674	if ((errors == NULL) \|\|
				675	(strcmp(errors,"strict") == 0)) {
				676	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	677	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	678	details);
				679	return -1;
				680	}
				681	else if (strcmp(errors,"ignore") == 0) {
				682	(*source)++;
				683	return 0;
				684	}
				685	else if (strcmp(errors,"replace") == 0) {
				686	(*source)++;
				687	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				688	(*dest)++;
				689	return 0;
				690	}
				691	else {
				692	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	693	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	694	errors);
				695	return -1;
				696	}
				697	}
				698
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	PyObject PyUnicode_DecodeUTF8(const char s,
				700	int size,
				701	const char *errors)
				702	{
				703	int n;
				704	const char *e;
				705	PyUnicodeObject *unicode;
				706	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	708
				709	/* Note: size will always be longer than the resulting Unicode
				710	character count */
				711	unicode = _PyUnicode_New(size);
				712	if (!unicode)
				713	return NULL;
				714	if (size == 0)
				715	return (PyObject *)unicode;
				716
				717	/* Unpack UTF-8 encoded data */
				718	p = unicode->str;
				719	e = s + size;
				720
				721	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	722	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	723
				724	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	725	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	s++;
				727	continue;
				728	}
				729
				730	n = utf8_code_length[ch];
				731
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	732	if (s + n > e) {
				733	errmsg = "unexpected end of data";
				734	goto utf8Error;
				735	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	736
				737	switch (n) {
				738
				739	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	740	errmsg = "unexpected code byte";
				741	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	742
				743	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	errmsg = "internal error";
				745	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	746
				747	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	748	if ((s[1] & 0xc0) != 0x80) {
				749	errmsg = "invalid data";
				750	goto utf8Error;
				751	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	752	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	753	if (ch < 0x80) {
				754	errmsg = "illegal encoding";
				755	goto utf8Error;
				756	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	757	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	758	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	759	break;
				760
				761	case 3:
				762	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	763	(s[2] & 0xc0) != 0x80) {
				764	errmsg = "invalid data";
				765	goto utf8Error;
				766	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	767	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	768	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				769	errmsg = "illegal encoding";
				770	goto utf8Error;
				771	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	772	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	773	*p++ = (Py_UNICODE)ch;
				774	break;
				775
				776	case 4:
				777	if ((s[1] & 0xc0) != 0x80 \|\|
				778	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	779	(s[3] & 0xc0) != 0x80) {
				780	errmsg = "invalid data";
				781	goto utf8Error;
				782	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	783	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				784	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				785	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	786	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	787	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	788	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	789	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	790	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	791	errmsg = "illegal encoding";
				792	goto utf8Error;
				793	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	794	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	795	*p++ = (Py_UNICODE)ch;
				796	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	797	/* compute and append the two surrogates: */
				798
				799	/* translate from 10000..10FFFF to 0..FFFF */
				800	ch -= 0x10000;
				801
				802	/* high surrogate = top 10 bits added to D800 */
				803	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				804
				805	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	806	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	807	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	808	break;
				809
				810	default:
				811	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	812	errmsg = "unsupported Unicode code range";
				813	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	814	}
				815	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	816	continue;
				817
				818	utf8Error:
				819	if (utf8_decoding_error(&s, &p, errors, errmsg))
				820	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	821	}
				822
				823	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	824	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	825	goto onError;
				826
				827	return (PyObject *)unicode;
				828
				829	onError:
				830	Py_DECREF(unicode);
				831	return NULL;
				832	}
				833
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Error hook for the UTF-8 encoder.

   errors == NULL or "strict": raise UnicodeError mentioning details.
   "ignore":  do nothing.
   "replace": write '?' to **dest and advance *dest.
   Any other value raises ValueError.

   Returns 0 if encoding may continue, -1 if an exception was set.
   source is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	867
				868	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				869	int size,
				870	const char *errors)
				871	{
				872	PyObject *v;
				873	char *p;
				874	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	875	Py_UCS4 ch2;
				876	unsigned int cbAllocated = 3 * size;
				877	unsigned int cbWritten = 0;
				878	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	879
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	880	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	881	if (v == NULL)
				882	return NULL;
				883	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	884	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	885
				886	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	887	while (i < size) {
				888	Py_UCS4 ch = s[i++];
				889	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	891	cbWritten++;
				892	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	893	else if (ch < 0x0800) {
				894	*p++ = 0xc0 \| (ch >> 6);
				895	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	896	cbWritten += 2;
				897	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	898	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	899	/* Check for high surrogate */
				900	if (0xD800 <= ch && ch <= 0xDBFF) {
				901	if (i != size) {
				902	ch2 = s[i];
				903	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				904
				905	if (cbWritten >= (cbAllocated - 4)) {
				906	/* Provide enough room for some more
				907	surrogates */
				908	cbAllocated += 4*10;
				909	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	910	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	911	}
				912
				913	/* combine the two values */
				914	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				915
				916	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	917	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	918	i++;
				919	cbWritten += 4;
				920	}
				921	}
				922	}
				923	else {
				924	*p++ = (char)(0xe0 \| (ch >> 12));
				925	cbWritten += 3;
				926	}
				927	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				928	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	929	} else {
				930	*p++ = 0xf0 \| (ch>>18);
				931	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				932	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				933	*p++ = 0x80 \| (ch & 0x3f);
				934	cbWritten += 4;
				935	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	}
				937	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	938	if (_PyString_Resize(&v, p - q))
				939	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	940	return v;
				941
				942	onError:
				943	Py_DECREF(v);
				944	return NULL;
				945	}
				946
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	947	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				948	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	949	if (!PyUnicode_Check(unicode)) {
				950	PyErr_BadArgument();
				951	return NULL;
				952	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	953	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				954	PyUnicode_GET_SIZE(unicode),
				955	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	956	}
				957
				958	/* --- UTF-16 Codec ------------------------------------------------------- */
				959
				960	static
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	961	int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	962	Py_UNICODE **dest,
				963	const char *errors,
				964	const char *details)
				965	{
				966	if ((errors == NULL) \|\|
				967	(strcmp(errors,"strict") == 0)) {
				968	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	969	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	970	details);
				971	return -1;
				972	}
				973	else if (strcmp(errors,"ignore") == 0) {
				974	return 0;
				975	}
				976	else if (strcmp(errors,"replace") == 0) {
				977	if (dest) {
				978	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				979	(*dest)++;
				980	}
				981	return 0;
				982	}
				983	else {
				984	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	985	"UTF-16 decoding error; "
				986	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	987	errors);
				988	return -1;
				989	}
				990	}
				991
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	PyObject PyUnicode_DecodeUTF16(const char s,
				993	int size,
				994	const char *errors,
				995	int *byteorder)
				996	{
				997	PyUnicodeObject *unicode;
				998	Py_UNICODE *p;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	999	const Py_UCS2 q, e;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1000	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1001	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1002
				1003	/* size should be an even number */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1004	if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1005	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				1006	return NULL;
				1007	/* The remaining input chars are ignored if we fall through
				1008	here... */
				1009	}
				1010
				1011	/* Note: size will always be longer than the resulting Unicode
				1012	character count */
				1013	unicode = _PyUnicode_New(size);
				1014	if (!unicode)
				1015	return NULL;
				1016	if (size == 0)
				1017	return (PyObject *)unicode;
				1018
				1019	/* Unpack UTF-16 encoded data */
				1020	p = unicode->str;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1021	q = (Py_UCS2 *)s;
				1022	e = q + (size / sizeof(Py_UCS2));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023
				1024	if (byteorder)
				1025	bo = *byteorder;
				1026
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1027	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1028	byte order setting accordingly. In native mode, the leading BOM
				1029	mark is skipped, in all other modes, it is copied to the output
				1030	stream as-is (giving a ZWNBSP character). */
				1031	if (bo == 0) {
				1032	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1033	if (*q == 0xFEFF) {
				1034	q++;
				1035	bo = -1;
				1036	} else if (*q == 0xFFFE) {
				1037	q++;
				1038	bo = 1;
				1039	}
				1040	#else
				1041	if (*q == 0xFEFF) {
				1042	q++;
				1043	bo = 1;
				1044	} else if (*q == 0xFFFE) {
				1045	q++;
				1046	bo = -1;
				1047	}
				1048	#endif
				1049	}
				1050
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1051	while (q < e) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1052	register Py_UCS2 ch = *q++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1054	/* Swap input bytes if needed. (This assumes
				1055	sizeof(Py_UNICODE) == 2 !) */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1056	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	if (bo == 1)
				1058	ch = (ch >> 8) \| (ch << 8);
				1059	#else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	if (bo == -1)
				1061	ch = (ch >> 8) \| (ch << 8);
				1062	#endif
				1063	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1064	*p++ = ch;
				1065	continue;
				1066	}
				1067
				1068	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1069	if (q >= e) {
				1070	errmsg = "unexpected end of data";
				1071	goto utf16Error;
				1072	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1073	if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1074	Py_UCS2 ch2 = *q++;
				1075	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1076	if (bo == 1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1077	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1078	#else
				1079	if (bo == -1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1080	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1081	#endif
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1082	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1083	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1084	*p++ = ch;
				1085	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1086	#else
				1087	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1088	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1089	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1090	}
				1091	else {
				1092	errmsg = "illegal UTF-16 surrogate";
				1093	goto utf16Error;
				1094	}
				1095
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1096	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1097	errmsg = "illegal encoding";
				1098	/* Fall through to report the error */
				1099
				1100	utf16Error:
				1101	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1102	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1103	}
				1104
				1105	if (byteorder)
				1106	*byteorder = bo;
				1107
				1108	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1109	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1110	goto onError;
				1111
				1112	return (PyObject *)unicode;
				1113
				1114	onError:
				1115	Py_DECREF(unicode);
				1116	return NULL;
				1117	}
				1118
				1119	#undef UTF16_ERROR
				1120
				1121	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1122	int size,
				1123	const char *errors,
				1124	int byteorder)
				1125	{
				1126	PyObject *v;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1127	Py_UCS2 *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	char *q;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1129	int i, pairs, doswap = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1131	for (i = pairs = 0; i < size; i++)
				1132	if (s[i] >= 0x10000)
				1133	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1134	v = PyString_FromStringAndSize(NULL,
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1135	sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1136	if (v == NULL)
				1137	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1138
				1139	q = PyString_AS_STRING(v);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1140	p = (Py_UCS2 *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1141	if (byteorder == 0)
				1142	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1143	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1144	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1145	if (byteorder == 0 \|\|
				1146	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1147	byteorder == -1
				1148	#else
				1149	byteorder == 1
				1150	#endif
				1151	)
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1152	doswap = 0;
				1153	while (size-- > 0) {
				1154	Py_UNICODE ch = *s++;
				1155	Py_UNICODE ch2 = 0;
				1156	if (ch >= 0x10000) {
				1157	ch2 = 0xDC00\|((ch-0x10000) & 0x3FF);
				1158	ch = 0xD800\|((ch-0x10000)>>10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1160	if (doswap){
				1161	*p++ = (ch >> 8) \| (ch << 8);
				1162	if (ch2)
				1163	*p++ = (ch2 >> 8) \| (ch2 << 8);
				1164	}else{
				1165	*p++ = ch;
				1166	if(ch2)
				1167	*p++ = ch2;
				1168	}
				1169	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1170	return v;
				1171	}
				1172
				1173	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1174	{
				1175	if (!PyUnicode_Check(unicode)) {
				1176	PyErr_BadArgument();
				1177	return NULL;
				1178	}
				1179	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1180	PyUnicode_GET_SIZE(unicode),
				1181	NULL,
				1182	0);
				1183	}
				1184
				1185	/* --- Unicode Escape Codec ----------------------------------------------- */
				1186
				1187	static
				1188	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1189	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1190	const char *errors,
				1191	const char *details)
				1192	{
				1193	if ((errors == NULL) \|\|
				1194	(strcmp(errors,"strict") == 0)) {
				1195	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1196	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1197	details);
				1198	return -1;
				1199	}
				1200	else if (strcmp(errors,"ignore") == 0) {
				1201	return 0;
				1202	}
				1203	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1204	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1205	return 0;
				1206	}
				1207	else {
				1208	PyErr_Format(PyExc_ValueError,
				1209	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1210	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1211	errors);
				1212	return -1;
				1213	}
				1214	}
				1215
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1216	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1217
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1218	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1219	int size,
				1220	const char *errors)
				1221	{
				1222	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1223	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1224	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1225	char* message;
				1226	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1227
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1228	/* Escaped strings will always be longer than the resulting
				1229	Unicode string, so we start with size here and then reduce the
				1230	length after conversion to the true value. */
				1231	v = _PyUnicode_New(size);
				1232	if (v == NULL)
				1233	goto onError;
				1234	if (size == 0)
				1235	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1236
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1237	p = buf = PyUnicode_AS_UNICODE(v);
				1238	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1239
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1240	while (s < end) {
				1241	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1242	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1243	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1244
				1245	/* Non-escape characters are interpreted as Unicode ordinals */
				1246	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1247	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1248	continue;
				1249	}
				1250
				1251	/* \ - Escapes */
				1252	s++;
				1253	switch (*s++) {
				1254
				1255	/* \x escapes */
				1256	case '\n': break;
				1257	case '\\': *p++ = '\\'; break;
				1258	case '\'': *p++ = '\''; break;
				1259	case '\"': *p++ = '\"'; break;
				1260	case 'b': *p++ = '\b'; break;
				1261	case 'f': p++ = '\014'; break; / FF */
				1262	case 't': *p++ = '\t'; break;
				1263	case 'n': *p++ = '\n'; break;
				1264	case 'r': *p++ = '\r'; break;
				1265	case 'v': p++ = '\013'; break; / VT */
				1266	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1267
				1268	/* \OOO (octal) escapes */
				1269	case '0': case '1': case '2': case '3':
				1270	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1271	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1272	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1273	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1274	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1275	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1276	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1277	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1278	break;
				1279
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1280	/* hex escapes */
				1281	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1282	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1283	digits = 2;
				1284	message = "truncated \\xXX escape";
				1285	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1286
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1287	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1289	digits = 4;
				1290	message = "truncated \\uXXXX escape";
				1291	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1292
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1293	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1294	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1295	digits = 8;
				1296	message = "truncated \\UXXXXXXXX escape";
				1297	hexescape:
				1298	chr = 0;
				1299	for (i = 0; i < digits; i++) {
				1300	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1301	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1302	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1303	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1304	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1305	i++;
				1306	break;
				1307	}
				1308	chr = (chr<<4) & ~0xF;
				1309	if (c >= '0' && c <= '9')
				1310	chr += c - '0';
				1311	else if (c >= 'a' && c <= 'f')
				1312	chr += 10 + c - 'a';
				1313	else
				1314	chr += 10 + c - 'A';
				1315	}
				1316	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1317	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1318	/* when we get here, chr is a 32-bit unicode character */
				1319	if (chr <= 0xffff)
				1320	/* UCS-2 character */
				1321	*p++ = (Py_UNICODE) chr;
				1322	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1323	/* UCS-4 character. Either store directly, or as
				1324	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1325	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1326	*p++ = chr;
				1327	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1328	chr -= 0x10000L;
				1329	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1330	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1331	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1332	} else {
				1333	if (unicodeescape_decoding_error(
				1334	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1335	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1336	)
				1337	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1338	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1339	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1340	break;
				1341
				1342	/* \N{name} */
				1343	case 'N':
				1344	message = "malformed \\N character escape";
				1345	if (ucnhash_CAPI == NULL) {
				1346	/* load the unicode data module */
				1347	PyObject m, v;
				1348	m = PyImport_ImportModule("unicodedata");
				1349	if (m == NULL)
				1350	goto ucnhashError;
				1351	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1352	Py_DECREF(m);
				1353	if (v == NULL)
				1354	goto ucnhashError;
				1355	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1356	Py_DECREF(v);
				1357	if (ucnhash_CAPI == NULL)
				1358	goto ucnhashError;
				1359	}
				1360	if (*s == '{') {
				1361	const char *start = s+1;
				1362	/* look for the closing brace */
				1363	while (*s != '}' && s < end)
				1364	s++;
				1365	if (s > start && s < end && *s == '}') {
				1366	/* found a name. look it up in the unicode database */
				1367	message = "unknown Unicode character name";
				1368	s++;
				1369	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1370	goto store;
				1371	}
				1372	}
				1373	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1374	goto onError;
				1375	*p++ = x;
				1376	break;
				1377
				1378	default:
				1379	*p++ = '\\';
				1380	*p++ = (unsigned char)s[-1];
				1381	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1382	}
				1383	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1384	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1385	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1386	return (PyObject *)v;
				1387
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1388	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1389	PyErr_SetString(
				1390	PyExc_UnicodeError,
				1391	"\\N escapes not supported (can't load unicodedata module)"
				1392	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1393	return NULL;
				1394
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1395	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1396	Py_XDECREF(v);
				1397	return NULL;
				1398	}
				1399
				1400	/* Return a Unicode-Escape string version of the Unicode object.
				1401
				1402	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1403	appropriate.
				1404
				1405	*/
				1406
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1407	static const Py_UNICODE findchar(const Py_UNICODE s,
				1408	int size,
				1409	Py_UNICODE ch);
				1410
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1411	static
				1412	PyObject unicodeescape_string(const Py_UNICODE s,
				1413	int size,
				1414	int quotes)
				1415	{
				1416	PyObject *repr;
				1417	char *p;
				1418	char *q;
				1419
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1420	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1421
				1422	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1423	if (repr == NULL)
				1424	return NULL;
				1425
				1426	p = q = PyString_AS_STRING(repr);
				1427
				1428	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1429	*p++ = 'u';
				1430	*p++ = (findchar(s, size, '\'') &&
				1431	!findchar(s, size, '"')) ? '"' : '\'';
				1432	}
				1433	while (size-- > 0) {
				1434	Py_UNICODE ch = *s++;
				1435	/* Escape quotes */
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1436	if (quotes && (ch == (Py_UNICODE) q[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1437	*p++ = '\\';
				1438	*p++ = (char) ch;
				1439	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1440	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1441	/* Map 21-bit characters to '\U00xxxxxx' */
				1442	else if (ch >= 0x10000) {
				1443	*p++ = '\\';
				1444	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1445	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1446	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1447	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1448	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1449	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1450	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1451	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1452	*p++ = hexdigit[ch & 15];
				1453	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1454	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1455	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1456	else if (ch >= 0xD800 && ch < 0xDC00) {
				1457	Py_UNICODE ch2;
				1458	Py_UCS4 ucs;
				1459
				1460	ch2 = *s++;
				1461	size--;
				1462	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1463	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1464	*p++ = '\\';
				1465	*p++ = 'U';
				1466	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1467	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1468	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1469	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1470	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1471	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1472	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1473	*p++ = hexdigit[ucs & 0x0000000F];
				1474	continue;
				1475	}
				1476	/* Fall through: isolated surrogates are copied as-is */
				1477	s--;
				1478	size++;
				1479	}
				1480
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1481	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1482	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1483	*p++ = '\\';
				1484	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1485	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1486	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1487	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1488	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1489	}
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1490	/* Map special whitespace to '\t', \n', '\r' */
				1491	else if (ch == '\t') {
				1492	*p++ = '\\';
				1493	*p++ = 't';
				1494	}
				1495	else if (ch == '\n') {
				1496	*p++ = '\\';
				1497	*p++ = 'n';
				1498	}
				1499	else if (ch == '\r') {
				1500	*p++ = '\\';
				1501	*p++ = 'r';
				1502	}
				1503	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1504	else if (ch < ' ' \|\| ch >= 128) {
				1505	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1506	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame^]	1507	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1508	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1509	}
				1510	/* Copy everything else as-is */
				1511	else
				1512	*p++ = (char) ch;
				1513	}
				1514	if (quotes)
				1515	*p++ = q[1];
				1516
				1517	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1518	if (_PyString_Resize(&repr, p - q))
				1519	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1520
				1521	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1522
				1523	onError:
				1524	Py_DECREF(repr);
				1525	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1526	}
				1527
				1528	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1529	int size)
				1530	{
				1531	return unicodeescape_string(s, size, 0);
				1532	}
				1533
				1534	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1535	{
				1536	if (!PyUnicode_Check(unicode)) {
				1537	PyErr_BadArgument();
				1538	return NULL;
				1539	}
				1540	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1541	PyUnicode_GET_SIZE(unicode));
				1542	}
				1543
				1544	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1545
				1546	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1547	int size,
				1548	const char *errors)
				1549	{
				1550	PyUnicodeObject *v;
				1551	Py_UNICODE p, buf;
				1552	const char *end;
				1553	const char *bs;
				1554
				1555	/* Escaped strings will always be longer than the resulting
				1556	Unicode string, so we start with size here and then reduce the
				1557	length after conversion to the true value. */
				1558	v = _PyUnicode_New(size);
				1559	if (v == NULL)
				1560	goto onError;
				1561	if (size == 0)
				1562	return (PyObject *)v;
				1563	p = buf = PyUnicode_AS_UNICODE(v);
				1564	end = s + size;
				1565	while (s < end) {
				1566	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1567	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1568	int i;
				1569
				1570	/* Non-escape characters are interpreted as Unicode ordinals */
				1571	if (*s != '\\') {
				1572	p++ = (unsigned char)s++;
				1573	continue;
				1574	}
				1575
				1576	/* \u-escapes are only interpreted iff the number of leading
				1577	backslashes if odd */
				1578	bs = s;
				1579	for (;s < end;) {
				1580	if (*s != '\\')
				1581	break;
				1582	p++ = (unsigned char)s++;
				1583	}
				1584	if (((s - bs) & 1) == 0 \|\|
				1585	s >= end \|\|
				1586	*s != 'u') {
				1587	continue;
				1588	}
				1589	p--;
				1590	s++;
				1591
				1592	/* \uXXXX with 4 hex digits */
				1593	for (x = 0, i = 0; i < 4; i++) {
				1594	c = (unsigned char)s[i];
				1595	if (!isxdigit(c)) {
				1596	if (unicodeescape_decoding_error(&s, &x, errors,
				1597	"truncated \\uXXXX"))
				1598	goto onError;
				1599	i++;
				1600	break;
				1601	}
				1602	x = (x<<4) & ~0xF;
				1603	if (c >= '0' && c <= '9')
				1604	x += c - '0';
				1605	else if (c >= 'a' && c <= 'f')
				1606	x += 10 + c - 'a';
				1607	else
				1608	x += 10 + c - 'A';
				1609	}
				1610	s += i;
				1611	*p++ = x;
				1612	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1613	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1614	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1615	return (PyObject *)v;
				1616
				1617	onError:
				1618	Py_XDECREF(v);
				1619	return NULL;
				1620	}
				1621
				1622	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1623	int size)
				1624	{
				1625	PyObject *repr;
				1626	char *p;
				1627	char *q;
				1628
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1629	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1630
				1631	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1632	if (repr == NULL)
				1633	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1634	if (size == 0)
				1635	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1636
				1637	p = q = PyString_AS_STRING(repr);
				1638	while (size-- > 0) {
				1639	Py_UNICODE ch = *s++;
				1640	/* Map 16-bit characters to '\uxxxx' */
				1641	if (ch >= 256) {
				1642	*p++ = '\\';
				1643	*p++ = 'u';
				1644	*p++ = hexdigit[(ch >> 12) & 0xf];
				1645	*p++ = hexdigit[(ch >> 8) & 0xf];
				1646	*p++ = hexdigit[(ch >> 4) & 0xf];
				1647	*p++ = hexdigit[ch & 15];
				1648	}
				1649	/* Copy everything else as-is */
				1650	else
				1651	*p++ = (char) ch;
				1652	}
				1653	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1654	if (_PyString_Resize(&repr, p - q))
				1655	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1656
				1657	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1658
				1659	onError:
				1660	Py_DECREF(repr);
				1661	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1662	}
				1663
				1664	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1665	{
				1666	if (!PyUnicode_Check(unicode)) {
				1667	PyErr_BadArgument();
				1668	return NULL;
				1669	}
				1670	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1671	PyUnicode_GET_SIZE(unicode));
				1672	}
				1673
				1674	/* --- Latin-1 Codec ------------------------------------------------------ */
				1675
				1676	PyObject PyUnicode_DecodeLatin1(const char s,
				1677	int size,
				1678	const char *errors)
				1679	{
				1680	PyUnicodeObject *v;
				1681	Py_UNICODE *p;
				1682
				1683	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1684	if (size == 1 && (unsigned char)s < 256) {
				1685	Py_UNICODE r = (unsigned char)s;
				1686	return PyUnicode_FromUnicode(&r, 1);
				1687	}
				1688
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1689	v = _PyUnicode_New(size);
				1690	if (v == NULL)
				1691	goto onError;
				1692	if (size == 0)
				1693	return (PyObject *)v;
				1694	p = PyUnicode_AS_UNICODE(v);
				1695	while (size-- > 0)
				1696	p++ = (unsigned char)s++;
				1697	return (PyObject *)v;
				1698
				1699	onError:
				1700	Py_XDECREF(v);
				1701	return NULL;
				1702	}
				1703
				1704	static
				1705	int latin1_encoding_error(const Py_UNICODE **source,
				1706	char **dest,
				1707	const char *errors,
				1708	const char *details)
				1709	{
				1710	if ((errors == NULL) \|\|
				1711	(strcmp(errors,"strict") == 0)) {
				1712	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1713	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1714	details);
				1715	return -1;
				1716	}
				1717	else if (strcmp(errors,"ignore") == 0) {
				1718	return 0;
				1719	}
				1720	else if (strcmp(errors,"replace") == 0) {
				1721	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1722	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1723	return 0;
				1724	}
				1725	else {
				1726	PyErr_Format(PyExc_ValueError,
				1727	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1728	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1729	errors);
				1730	return -1;
				1731	}
				1732	}
				1733
				1734	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1735	int size,
				1736	const char *errors)
				1737	{
				1738	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1739	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1740
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1741	repr = PyString_FromStringAndSize(NULL, size);
				1742	if (repr == NULL)
				1743	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1744	if (size == 0)
				1745	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1746
				1747	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1748	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1749	while (size-- > 0) {
				1750	Py_UNICODE ch = *p++;
				1751	if (ch >= 256) {
				1752	if (latin1_encoding_error(&p, &s, errors,
				1753	"ordinal not in range(256)"))
				1754	goto onError;
				1755	}
				1756	else
				1757	*s++ = (char)ch;
				1758	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1759	/* Resize if error handling skipped some characters */
				1760	if (s - start < PyString_GET_SIZE(repr))
				1761	if (_PyString_Resize(&repr, s - start))
				1762	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	return repr;
				1764
				1765	onError:
				1766	Py_DECREF(repr);
				1767	return NULL;
				1768	}
				1769
				1770	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1771	{
				1772	if (!PyUnicode_Check(unicode)) {
				1773	PyErr_BadArgument();
				1774	return NULL;
				1775	}
				1776	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1777	PyUnicode_GET_SIZE(unicode),
				1778	NULL);
				1779	}
				1780
				1781	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1782
				1783	static
				1784	int ascii_decoding_error(const char **source,
				1785	Py_UNICODE **dest,
				1786	const char *errors,
				1787	const char *details)
				1788	{
				1789	if ((errors == NULL) \|\|
				1790	(strcmp(errors,"strict") == 0)) {
				1791	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1792	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1793	details);
				1794	return -1;
				1795	}
				1796	else if (strcmp(errors,"ignore") == 0) {
				1797	return 0;
				1798	}
				1799	else if (strcmp(errors,"replace") == 0) {
				1800	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1801	(*dest)++;
				1802	return 0;
				1803	}
				1804	else {
				1805	PyErr_Format(PyExc_ValueError,
				1806	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1807	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1808	errors);
				1809	return -1;
				1810	}
				1811	}
				1812
				1813	PyObject PyUnicode_DecodeASCII(const char s,
				1814	int size,
				1815	const char *errors)
				1816	{
				1817	PyUnicodeObject *v;
				1818	Py_UNICODE *p;
				1819
				1820	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1821	if (size == 1 && (unsigned char)s < 128) {
				1822	Py_UNICODE r = (unsigned char)s;
				1823	return PyUnicode_FromUnicode(&r, 1);
				1824	}
				1825
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1826	v = _PyUnicode_New(size);
				1827	if (v == NULL)
				1828	goto onError;
				1829	if (size == 0)
				1830	return (PyObject *)v;
				1831	p = PyUnicode_AS_UNICODE(v);
				1832	while (size-- > 0) {
				1833	register unsigned char c;
				1834
				1835	c = (unsigned char)*s++;
				1836	if (c < 128)
				1837	*p++ = c;
				1838	else if (ascii_decoding_error(&s, &p, errors,
				1839	"ordinal not in range(128)"))
				1840	goto onError;
				1841	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1842	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1843	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1844	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1845	return (PyObject *)v;
				1846
				1847	onError:
				1848	Py_XDECREF(v);
				1849	return NULL;
				1850	}
				1851
				1852	static
				1853	int ascii_encoding_error(const Py_UNICODE **source,
				1854	char **dest,
				1855	const char *errors,
				1856	const char *details)
				1857	{
				1858	if ((errors == NULL) \|\|
				1859	(strcmp(errors,"strict") == 0)) {
				1860	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1861	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1862	details);
				1863	return -1;
				1864	}
				1865	else if (strcmp(errors,"ignore") == 0) {
				1866	return 0;
				1867	}
				1868	else if (strcmp(errors,"replace") == 0) {
				1869	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1870	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1871	return 0;
				1872	}
				1873	else {
				1874	PyErr_Format(PyExc_ValueError,
				1875	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1876	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1877	errors);
				1878	return -1;
				1879	}
				1880	}
				1881
				1882	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1883	int size,
				1884	const char *errors)
				1885	{
				1886	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1887	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1888
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1889	repr = PyString_FromStringAndSize(NULL, size);
				1890	if (repr == NULL)
				1891	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1892	if (size == 0)
				1893	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1894
				1895	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1896	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1897	while (size-- > 0) {
				1898	Py_UNICODE ch = *p++;
				1899	if (ch >= 128) {
				1900	if (ascii_encoding_error(&p, &s, errors,
				1901	"ordinal not in range(128)"))
				1902	goto onError;
				1903	}
				1904	else
				1905	*s++ = (char)ch;
				1906	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1907	/* Resize if error handling skipped some characters */
				1908	if (s - start < PyString_GET_SIZE(repr))
				1909	if (_PyString_Resize(&repr, s - start))
				1910	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1911	return repr;
				1912
				1913	onError:
				1914	Py_DECREF(repr);
				1915	return NULL;
				1916	}
				1917
				1918	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1919	{
				1920	if (!PyUnicode_Check(unicode)) {
				1921	PyErr_BadArgument();
				1922	return NULL;
				1923	}
				1924	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1925	PyUnicode_GET_SIZE(unicode),
				1926	NULL);
				1927	}
				1928
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1929	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1930
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1931	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1932
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1933	PyObject PyUnicode_DecodeMBCS(const char s,
				1934	int size,
				1935	const char *errors)
				1936	{
				1937	PyUnicodeObject *v;
				1938	Py_UNICODE *p;
				1939
				1940	/* First get the size of the result */
				1941	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1942	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1943	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1944
				1945	v = _PyUnicode_New(usize);
				1946	if (v == NULL)
				1947	return NULL;
				1948	if (usize == 0)
				1949	return (PyObject *)v;
				1950	p = PyUnicode_AS_UNICODE(v);
				1951	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1952	Py_DECREF(v);
				1953	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1954	}
				1955
				1956	return (PyObject *)v;
				1957	}
				1958
				1959	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1960	int size,
				1961	const char *errors)
				1962	{
				1963	PyObject *repr;
				1964	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1965	DWORD mbcssize;
				1966
				1967	/* If there are no characters, bail now! */
				1968	if (size==0)
				1969	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1970
				1971	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1972	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1973	if (mbcssize==0)
				1974	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1975
				1976	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1977	if (repr == NULL)
				1978	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1979	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1980	return repr;
				1981
				1982	/* Do the conversion */
				1983	s = PyString_AS_STRING(repr);
				1984	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1985	Py_DECREF(repr);
				1986	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1987	}
				1988	return repr;
				1989	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1990
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1991	#endif /* MS_WIN32 */
				1992
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1993	/* --- Character Mapping Codec -------------------------------------------- */
				1994
				1995	static
				1996	int charmap_decoding_error(const char **source,
				1997	Py_UNICODE **dest,
				1998	const char *errors,
				1999	const char *details)
				2000	{
				2001	if ((errors == NULL) \|\|
				2002	(strcmp(errors,"strict") == 0)) {
				2003	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2004	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2005	details);
				2006	return -1;
				2007	}
				2008	else if (strcmp(errors,"ignore") == 0) {
				2009	return 0;
				2010	}
				2011	else if (strcmp(errors,"replace") == 0) {
				2012	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2013	(*dest)++;
				2014	return 0;
				2015	}
				2016	else {
				2017	PyErr_Format(PyExc_ValueError,
				2018	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2019	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2020	errors);
				2021	return -1;
				2022	}
				2023	}
				2024
				2025	PyObject PyUnicode_DecodeCharmap(const char s,
				2026	int size,
				2027	PyObject *mapping,
				2028	const char *errors)
				2029	{
				2030	PyUnicodeObject *v;
				2031	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2032	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2033
				2034	/* Default to Latin-1 */
				2035	if (mapping == NULL)
				2036	return PyUnicode_DecodeLatin1(s, size, errors);
				2037
				2038	v = _PyUnicode_New(size);
				2039	if (v == NULL)
				2040	goto onError;
				2041	if (size == 0)
				2042	return (PyObject *)v;
				2043	p = PyUnicode_AS_UNICODE(v);
				2044	while (size-- > 0) {
				2045	unsigned char ch = *s++;
				2046	PyObject w, x;
				2047
				2048	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2049	w = PyInt_FromLong((long)ch);
				2050	if (w == NULL)
				2051	goto onError;
				2052	x = PyObject_GetItem(mapping, w);
				2053	Py_DECREF(w);
				2054	if (x == NULL) {
				2055	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2056	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2057	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2058	x = Py_None;
				2059	Py_INCREF(x);
				2060	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2061	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2062	}
				2063
				2064	/* Apply mapping */
				2065	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2066	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2067	if (value < 0 \|\| value > 65535) {
				2068	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2069	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2070	Py_DECREF(x);
				2071	goto onError;
				2072	}
				2073	*p++ = (Py_UNICODE)value;
				2074	}
				2075	else if (x == Py_None) {
				2076	/* undefined mapping */
				2077	if (charmap_decoding_error(&s, &p, errors,
				2078	"character maps to <undefined>")) {
				2079	Py_DECREF(x);
				2080	goto onError;
				2081	}
				2082	}
				2083	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2084	int targetsize = PyUnicode_GET_SIZE(x);
				2085
				2086	if (targetsize == 1)
				2087	/* 1-1 mapping */
				2088	p++ = PyUnicode_AS_UNICODE(x);
				2089
				2090	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2091	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2092	if (targetsize > extrachars) {
				2093	/* resize first */
				2094	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2095	int needed = (targetsize - extrachars) + \
				2096	(targetsize << 2);
				2097	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2098	if (_PyUnicode_Resize(&v,
				2099	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2100	Py_DECREF(x);
				2101	goto onError;
				2102	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2103	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2104	}
				2105	Py_UNICODE_COPY(p,
				2106	PyUnicode_AS_UNICODE(x),
				2107	targetsize);
				2108	p += targetsize;
				2109	extrachars -= targetsize;
				2110	}
				2111	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2112	}
				2113	else {
				2114	/* wrong return value */
				2115	PyErr_SetString(PyExc_TypeError,
				2116	"character mapping must return integer, None or unicode");
				2117	Py_DECREF(x);
				2118	goto onError;
				2119	}
				2120	Py_DECREF(x);
				2121	}
				2122	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2123	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2124	goto onError;
				2125	return (PyObject *)v;
				2126
				2127	onError:
				2128	Py_XDECREF(v);
				2129	return NULL;
				2130	}
				2131
				2132	static
				2133	int charmap_encoding_error(const Py_UNICODE **source,
				2134	char **dest,
				2135	const char *errors,
				2136	const char *details)
				2137	{
				2138	if ((errors == NULL) \|\|
				2139	(strcmp(errors,"strict") == 0)) {
				2140	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2141	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2142	details);
				2143	return -1;
				2144	}
				2145	else if (strcmp(errors,"ignore") == 0) {
				2146	return 0;
				2147	}
				2148	else if (strcmp(errors,"replace") == 0) {
				2149	**dest = '?';
				2150	(*dest)++;
				2151	return 0;
				2152	}
				2153	else {
				2154	PyErr_Format(PyExc_ValueError,
				2155	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2156	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2157	errors);
				2158	return -1;
				2159	}
				2160	}
				2161
				2162	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2163	int size,
				2164	PyObject *mapping,
				2165	const char *errors)
				2166	{
				2167	PyObject *v;
				2168	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2169	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2170
				2171	/* Default to Latin-1 */
				2172	if (mapping == NULL)
				2173	return PyUnicode_EncodeLatin1(p, size, errors);
				2174
				2175	v = PyString_FromStringAndSize(NULL, size);
				2176	if (v == NULL)
				2177	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2178	if (size == 0)
				2179	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2180	s = PyString_AS_STRING(v);
				2181	while (size-- > 0) {
				2182	Py_UNICODE ch = *p++;
				2183	PyObject w, x;
				2184
				2185	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2186	w = PyInt_FromLong((long)ch);
				2187	if (w == NULL)
				2188	goto onError;
				2189	x = PyObject_GetItem(mapping, w);
				2190	Py_DECREF(w);
				2191	if (x == NULL) {
				2192	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2193	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2194	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2195	x = Py_None;
				2196	Py_INCREF(x);
				2197	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2198	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2199	}
				2200
				2201	/* Apply mapping */
				2202	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2203	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2204	if (value < 0 \|\| value > 255) {
				2205	PyErr_SetString(PyExc_TypeError,
				2206	"character mapping must be in range(256)");
				2207	Py_DECREF(x);
				2208	goto onError;
				2209	}
				2210	*s++ = (char)value;
				2211	}
				2212	else if (x == Py_None) {
				2213	/* undefined mapping */
				2214	if (charmap_encoding_error(&p, &s, errors,
				2215	"character maps to <undefined>")) {
				2216	Py_DECREF(x);
				2217	goto onError;
				2218	}
				2219	}
				2220	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2221	int targetsize = PyString_GET_SIZE(x);
				2222
				2223	if (targetsize == 1)
				2224	/* 1-1 mapping */
				2225	s++ = PyString_AS_STRING(x);
				2226
				2227	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2228	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2229	if (targetsize > extrachars) {
				2230	/* resize first */
				2231	int oldpos = (int)(s - PyString_AS_STRING(v));
				2232	int needed = (targetsize - extrachars) + \
				2233	(targetsize << 2);
				2234	extrachars += needed;
				2235	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2236	Py_DECREF(x);
				2237	goto onError;
				2238	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2239	s = PyString_AS_STRING(v) + oldpos;
				2240	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2241	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2242	s += targetsize;
				2243	extrachars -= targetsize;
				2244	}
				2245	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2246	}
				2247	else {
				2248	/* wrong return value */
				2249	PyErr_SetString(PyExc_TypeError,
				2250	"character mapping must return integer, None or unicode");
				2251	Py_DECREF(x);
				2252	goto onError;
				2253	}
				2254	Py_DECREF(x);
				2255	}
				2256	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2257	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2258	goto onError;
				2259	return v;
				2260
				2261	onError:
				2262	Py_DECREF(v);
				2263	return NULL;
				2264	}
				2265
				2266	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2267	PyObject *mapping)
				2268	{
				2269	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2270	PyErr_BadArgument();
				2271	return NULL;
				2272	}
				2273	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2274	PyUnicode_GET_SIZE(unicode),
				2275	mapping,
				2276	NULL);
				2277	}
				2278
				2279	static
				2280	int translate_error(const Py_UNICODE **source,
				2281	Py_UNICODE **dest,
				2282	const char *errors,
				2283	const char *details)
				2284	{
				2285	if ((errors == NULL) \|\|
				2286	(strcmp(errors,"strict") == 0)) {
				2287	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2288	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2289	details);
				2290	return -1;
				2291	}
				2292	else if (strcmp(errors,"ignore") == 0) {
				2293	return 0;
				2294	}
				2295	else if (strcmp(errors,"replace") == 0) {
				2296	**dest = '?';
				2297	(*dest)++;
				2298	return 0;
				2299	}
				2300	else {
				2301	PyErr_Format(PyExc_ValueError,
				2302	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2303	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2304	errors);
				2305	return -1;
				2306	}
				2307	}
				2308
				2309	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2310	int size,
				2311	PyObject *mapping,
				2312	const char *errors)
				2313	{
				2314	PyUnicodeObject *v;
				2315	Py_UNICODE *p;
				2316
				2317	if (mapping == NULL) {
				2318	PyErr_BadArgument();
				2319	return NULL;
				2320	}
				2321
				2322	/* Output will never be longer than input */
				2323	v = _PyUnicode_New(size);
				2324	if (v == NULL)
				2325	goto onError;
				2326	if (size == 0)
				2327	goto done;
				2328	p = PyUnicode_AS_UNICODE(v);
				2329	while (size-- > 0) {
				2330	Py_UNICODE ch = *s++;
				2331	PyObject w, x;
				2332
				2333	/* Get mapping */
				2334	w = PyInt_FromLong(ch);
				2335	if (w == NULL)
				2336	goto onError;
				2337	x = PyObject_GetItem(mapping, w);
				2338	Py_DECREF(w);
				2339	if (x == NULL) {
				2340	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2341	/* No mapping found: default to 1-1 mapping */
				2342	PyErr_Clear();
				2343	*p++ = ch;
				2344	continue;
				2345	}
				2346	goto onError;
				2347	}
				2348
				2349	/* Apply mapping */
				2350	if (PyInt_Check(x))
				2351	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2352	else if (x == Py_None) {
				2353	/* undefined mapping */
				2354	if (translate_error(&s, &p, errors,
				2355	"character maps to <undefined>")) {
				2356	Py_DECREF(x);
				2357	goto onError;
				2358	}
				2359	}
				2360	else if (PyUnicode_Check(x)) {
				2361	if (PyUnicode_GET_SIZE(x) != 1) {
				2362	/* 1-n mapping */
				2363	PyErr_SetString(PyExc_NotImplementedError,
				2364	"1-n mappings are currently not implemented");
				2365	Py_DECREF(x);
				2366	goto onError;
				2367	}
				2368	p++ = PyUnicode_AS_UNICODE(x);
				2369	}
				2370	else {
				2371	/* wrong return value */
				2372	PyErr_SetString(PyExc_TypeError,
				2373	"translate mapping must return integer, None or unicode");
				2374	Py_DECREF(x);
				2375	goto onError;
				2376	}
				2377	Py_DECREF(x);
				2378	}
				2379	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2380	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2381	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2382
				2383	done:
				2384	return (PyObject *)v;
				2385
				2386	onError:
				2387	Py_XDECREF(v);
				2388	return NULL;
				2389	}
				2390
				2391	PyObject PyUnicode_Translate(PyObject str,
				2392	PyObject *mapping,
				2393	const char *errors)
				2394	{
				2395	PyObject *result;
				2396
				2397	str = PyUnicode_FromObject(str);
				2398	if (str == NULL)
				2399	goto onError;
				2400	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2401	PyUnicode_GET_SIZE(str),
				2402	mapping,
				2403	errors);
				2404	Py_DECREF(str);
				2405	return result;
				2406
				2407	onError:
				2408	Py_XDECREF(str);
				2409	return NULL;
				2410	}
				2411
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2412	/* --- Decimal Encoder ---------------------------------------------------- */
				2413
				2414	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2415	int length,
				2416	char *output,
				2417	const char *errors)
				2418	{
				2419	Py_UNICODE p, end;
				2420
				2421	if (output == NULL) {
				2422	PyErr_BadArgument();
				2423	return -1;
				2424	}
				2425
				2426	p = s;
				2427	end = s + length;
				2428	while (p < end) {
				2429	register Py_UNICODE ch = *p++;
				2430	int decimal;
				2431
				2432	if (Py_UNICODE_ISSPACE(ch)) {
				2433	*output++ = ' ';
				2434	continue;
				2435	}
				2436	decimal = Py_UNICODE_TODECIMAL(ch);
				2437	if (decimal >= 0) {
				2438	*output++ = '0' + decimal;
				2439	continue;
				2440	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2441	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2442	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2443	continue;
				2444	}
				2445	/* All other characters are considered invalid */
				2446	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2447	PyErr_SetString(PyExc_ValueError,
				2448	"invalid decimal Unicode string");
				2449	goto onError;
				2450	}
				2451	else if (strcmp(errors, "ignore") == 0)
				2452	continue;
				2453	else if (strcmp(errors, "replace") == 0) {
				2454	*output++ = '?';
				2455	continue;
				2456	}
				2457	}
				2458	/* 0-terminate the output string */
				2459	*output++ = '\0';
				2460	return 0;
				2461
				2462	onError:
				2463	return -1;
				2464	}
				2465
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2466	/* --- Helpers ------------------------------------------------------------ */
				2467
				2468	static
				2469	int count(PyUnicodeObject *self,
				2470	int start,
				2471	int end,
				2472	PyUnicodeObject *substring)
				2473	{
				2474	int count = 0;
				2475
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2476	if (start < 0)
				2477	start += self->length;
				2478	if (start < 0)
				2479	start = 0;
				2480	if (end > self->length)
				2481	end = self->length;
				2482	if (end < 0)
				2483	end += self->length;
				2484	if (end < 0)
				2485	end = 0;
				2486
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2487	if (substring->length == 0)
				2488	return (end - start + 1);
				2489
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2490	end -= substring->length;
				2491
				2492	while (start <= end)
				2493	if (Py_UNICODE_MATCH(self, start, substring)) {
				2494	count++;
				2495	start += substring->length;
				2496	} else
				2497	start++;
				2498
				2499	return count;
				2500	}
				2501
				2502	int PyUnicode_Count(PyObject *str,
				2503	PyObject *substr,
				2504	int start,
				2505	int end)
				2506	{
				2507	int result;
				2508
				2509	str = PyUnicode_FromObject(str);
				2510	if (str == NULL)
				2511	return -1;
				2512	substr = PyUnicode_FromObject(substr);
				2513	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2514	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2515	return -1;
				2516	}
				2517
				2518	result = count((PyUnicodeObject *)str,
				2519	start, end,
				2520	(PyUnicodeObject *)substr);
				2521
				2522	Py_DECREF(str);
				2523	Py_DECREF(substr);
				2524	return result;
				2525	}
				2526
				2527	static
				2528	int findstring(PyUnicodeObject *self,
				2529	PyUnicodeObject *substring,
				2530	int start,
				2531	int end,
				2532	int direction)
				2533	{
				2534	if (start < 0)
				2535	start += self->length;
				2536	if (start < 0)
				2537	start = 0;
				2538
				2539	if (substring->length == 0)
				2540	return start;
				2541
				2542	if (end > self->length)
				2543	end = self->length;
				2544	if (end < 0)
				2545	end += self->length;
				2546	if (end < 0)
				2547	end = 0;
				2548
				2549	end -= substring->length;
				2550
				2551	if (direction < 0) {
				2552	for (; end >= start; end--)
				2553	if (Py_UNICODE_MATCH(self, end, substring))
				2554	return end;
				2555	} else {
				2556	for (; start <= end; start++)
				2557	if (Py_UNICODE_MATCH(self, start, substring))
				2558	return start;
				2559	}
				2560
				2561	return -1;
				2562	}
				2563
				2564	int PyUnicode_Find(PyObject *str,
				2565	PyObject *substr,
				2566	int start,
				2567	int end,
				2568	int direction)
				2569	{
				2570	int result;
				2571
				2572	str = PyUnicode_FromObject(str);
				2573	if (str == NULL)
				2574	return -1;
				2575	substr = PyUnicode_FromObject(substr);
				2576	if (substr == NULL) {
				2577	Py_DECREF(substr);
				2578	return -1;
				2579	}
				2580
				2581	result = findstring((PyUnicodeObject *)str,
				2582	(PyUnicodeObject *)substr,
				2583	start, end, direction);
				2584	Py_DECREF(str);
				2585	Py_DECREF(substr);
				2586	return result;
				2587	}
				2588
				2589	static
				2590	int tailmatch(PyUnicodeObject *self,
				2591	PyUnicodeObject *substring,
				2592	int start,
				2593	int end,
				2594	int direction)
				2595	{
				2596	if (start < 0)
				2597	start += self->length;
				2598	if (start < 0)
				2599	start = 0;
				2600
				2601	if (substring->length == 0)
				2602	return 1;
				2603
				2604	if (end > self->length)
				2605	end = self->length;
				2606	if (end < 0)
				2607	end += self->length;
				2608	if (end < 0)
				2609	end = 0;
				2610
				2611	end -= substring->length;
				2612	if (end < start)
				2613	return 0;
				2614
				2615	if (direction > 0) {
				2616	if (Py_UNICODE_MATCH(self, end, substring))
				2617	return 1;
				2618	} else {
				2619	if (Py_UNICODE_MATCH(self, start, substring))
				2620	return 1;
				2621	}
				2622
				2623	return 0;
				2624	}
				2625
				2626	int PyUnicode_Tailmatch(PyObject *str,
				2627	PyObject *substr,
				2628	int start,
				2629	int end,
				2630	int direction)
				2631	{
				2632	int result;
				2633
				2634	str = PyUnicode_FromObject(str);
				2635	if (str == NULL)
				2636	return -1;
				2637	substr = PyUnicode_FromObject(substr);
				2638	if (substr == NULL) {
				2639	Py_DECREF(substr);
				2640	return -1;
				2641	}
				2642
				2643	result = tailmatch((PyUnicodeObject *)str,
				2644	(PyUnicodeObject *)substr,
				2645	start, end, direction);
				2646	Py_DECREF(str);
				2647	Py_DECREF(substr);
				2648	return result;
				2649	}
				2650
				2651	static
				2652	const Py_UNICODE findchar(const Py_UNICODE s,
				2653	int size,
				2654	Py_UNICODE ch)
				2655	{
				2656	/* like wcschr, but doesn't stop at NULL characters */
				2657
				2658	while (size-- > 0) {
				2659	if (*s == ch)
				2660	return s;
				2661	s++;
				2662	}
				2663
				2664	return NULL;
				2665	}
				2666
				2667	/* Apply fixfct filter to the Unicode object self and return a
				2668	reference to the modified object */
				2669
				2670	static
				2671	PyObject fixup(PyUnicodeObject self,
				2672	int (fixfct)(PyUnicodeObject s))
				2673	{
				2674
				2675	PyUnicodeObject *u;
				2676
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2677	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2678	if (u == NULL)
				2679	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2680
				2681	Py_UNICODE_COPY(u->str, self->str, self->length);
				2682
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2683	if (!fixfct(u)) {
				2684	/* fixfct should return TRUE if it modified the buffer. If
				2685	FALSE, return a reference to the original buffer instead
				2686	(to save space, not time) */
				2687	Py_INCREF(self);
				2688	Py_DECREF(u);
				2689	return (PyObject*) self;
				2690	}
				2691	return (PyObject*) u;
				2692	}
				2693
				2694	static
				2695	int fixupper(PyUnicodeObject *self)
				2696	{
				2697	int len = self->length;
				2698	Py_UNICODE *s = self->str;
				2699	int status = 0;
				2700
				2701	while (len-- > 0) {
				2702	register Py_UNICODE ch;
				2703
				2704	ch = Py_UNICODE_TOUPPER(*s);
				2705	if (ch != *s) {
				2706	status = 1;
				2707	*s = ch;
				2708	}
				2709	s++;
				2710	}
				2711
				2712	return status;
				2713	}
				2714
				2715	static
				2716	int fixlower(PyUnicodeObject *self)
				2717	{
				2718	int len = self->length;
				2719	Py_UNICODE *s = self->str;
				2720	int status = 0;
				2721
				2722	while (len-- > 0) {
				2723	register Py_UNICODE ch;
				2724
				2725	ch = Py_UNICODE_TOLOWER(*s);
				2726	if (ch != *s) {
				2727	status = 1;
				2728	*s = ch;
				2729	}
				2730	s++;
				2731	}
				2732
				2733	return status;
				2734	}
				2735
				2736	static
				2737	int fixswapcase(PyUnicodeObject *self)
				2738	{
				2739	int len = self->length;
				2740	Py_UNICODE *s = self->str;
				2741	int status = 0;
				2742
				2743	while (len-- > 0) {
				2744	if (Py_UNICODE_ISUPPER(*s)) {
				2745	s = Py_UNICODE_TOLOWER(s);
				2746	status = 1;
				2747	} else if (Py_UNICODE_ISLOWER(*s)) {
				2748	s = Py_UNICODE_TOUPPER(s);
				2749	status = 1;
				2750	}
				2751	s++;
				2752	}
				2753
				2754	return status;
				2755	}
				2756
				2757	static
				2758	int fixcapitalize(PyUnicodeObject *self)
				2759	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2760	int len = self->length;
				2761	Py_UNICODE *s = self->str;
				2762	int status = 0;
				2763
				2764	if (len == 0)
				2765	return 0;
				2766	if (Py_UNICODE_ISLOWER(*s)) {
				2767	s = Py_UNICODE_TOUPPER(s);
				2768	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2769	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2770	s++;
				2771	while (--len > 0) {
				2772	if (Py_UNICODE_ISUPPER(*s)) {
				2773	s = Py_UNICODE_TOLOWER(s);
				2774	status = 1;
				2775	}
				2776	s++;
				2777	}
				2778	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2779	}
				2780
				2781	static
				2782	int fixtitle(PyUnicodeObject *self)
				2783	{
				2784	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2785	register Py_UNICODE *e;
				2786	int previous_is_cased;
				2787
				2788	/* Shortcut for single character strings */
				2789	if (PyUnicode_GET_SIZE(self) == 1) {
				2790	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2791	if (*p != ch) {
				2792	*p = ch;
				2793	return 1;
				2794	}
				2795	else
				2796	return 0;
				2797	}
				2798
				2799	e = p + PyUnicode_GET_SIZE(self);
				2800	previous_is_cased = 0;
				2801	for (; p < e; p++) {
				2802	register const Py_UNICODE ch = *p;
				2803
				2804	if (previous_is_cased)
				2805	*p = Py_UNICODE_TOLOWER(ch);
				2806	else
				2807	*p = Py_UNICODE_TOTITLE(ch);
				2808
				2809	if (Py_UNICODE_ISLOWER(ch) \|\|
				2810	Py_UNICODE_ISUPPER(ch) \|\|
				2811	Py_UNICODE_ISTITLE(ch))
				2812	previous_is_cased = 1;
				2813	else
				2814	previous_is_cased = 0;
				2815	}
				2816	return 1;
				2817	}
				2818
				2819	PyObject PyUnicode_Join(PyObject separator,
				2820	PyObject *seq)
				2821	{
				2822	Py_UNICODE *sep;
				2823	int seplen;
				2824	PyUnicodeObject *res = NULL;
				2825	int reslen = 0;
				2826	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2827	int sz = 100;
				2828	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2829	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2830
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2831	it = PyObject_GetIter(seq);
				2832	if (it == NULL)
				2833	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2834
				2835	if (separator == NULL) {
				2836	Py_UNICODE blank = ' ';
				2837	sep = &blank;
				2838	seplen = 1;
				2839	}
				2840	else {
				2841	separator = PyUnicode_FromObject(separator);
				2842	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2843	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2844	sep = PyUnicode_AS_UNICODE(separator);
				2845	seplen = PyUnicode_GET_SIZE(separator);
				2846	}
				2847
				2848	res = _PyUnicode_New(sz);
				2849	if (res == NULL)
				2850	goto onError;
				2851	p = PyUnicode_AS_UNICODE(res);
				2852	reslen = 0;
				2853
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2854	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2855	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2856	PyObject *item = PyIter_Next(it);
				2857	if (item == NULL) {
				2858	if (PyErr_Occurred())
				2859	goto onError;
				2860	break;
				2861	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2862	if (!PyUnicode_Check(item)) {
				2863	PyObject *v;
				2864	v = PyUnicode_FromObject(item);
				2865	Py_DECREF(item);
				2866	item = v;
				2867	if (item == NULL)
				2868	goto onError;
				2869	}
				2870	itemlen = PyUnicode_GET_SIZE(item);
				2871	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2872	if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2873	goto onError;
				2874	sz *= 2;
				2875	p = PyUnicode_AS_UNICODE(res) + reslen;
				2876	}
				2877	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2878	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2879	p += seplen;
				2880	reslen += seplen;
				2881	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2882	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2883	p += itemlen;
				2884	reslen += itemlen;
				2885	Py_DECREF(item);
				2886	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2887	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2888	goto onError;
				2889
				2890	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2891	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2892	return (PyObject *)res;
				2893
				2894	onError:
				2895	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2896	Py_XDECREF(res);
				2897	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2898	return NULL;
				2899	}
				2900
				2901	static
				2902	PyUnicodeObject pad(PyUnicodeObject self,
				2903	int left,
				2904	int right,
				2905	Py_UNICODE fill)
				2906	{
				2907	PyUnicodeObject *u;
				2908
				2909	if (left < 0)
				2910	left = 0;
				2911	if (right < 0)
				2912	right = 0;
				2913
				2914	if (left == 0 && right == 0) {
				2915	Py_INCREF(self);
				2916	return self;
				2917	}
				2918
				2919	u = _PyUnicode_New(left + self->length + right);
				2920	if (u) {
				2921	if (left)
				2922	Py_UNICODE_FILL(u->str, fill, left);
				2923	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2924	if (right)
				2925	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2926	}
				2927
				2928	return u;
				2929	}
				2930
				2931	#define SPLIT_APPEND(data, left, right) \
				2932	str = PyUnicode_FromUnicode(data + left, right - left); \
				2933	if (!str) \
				2934	goto onError; \
				2935	if (PyList_Append(list, str)) { \
				2936	Py_DECREF(str); \
				2937	goto onError; \
				2938	} \
				2939	else \
				2940	Py_DECREF(str);
				2941
				2942	static
				2943	PyObject split_whitespace(PyUnicodeObject self,
				2944	PyObject *list,
				2945	int maxcount)
				2946	{
				2947	register int i;
				2948	register int j;
				2949	int len = self->length;
				2950	PyObject *str;
				2951
				2952	for (i = j = 0; i < len; ) {
				2953	/* find a token */
				2954	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2955	i++;
				2956	j = i;
				2957	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2958	i++;
				2959	if (j < i) {
				2960	if (maxcount-- <= 0)
				2961	break;
				2962	SPLIT_APPEND(self->str, j, i);
				2963	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2964	i++;
				2965	j = i;
				2966	}
				2967	}
				2968	if (j < len) {
				2969	SPLIT_APPEND(self->str, j, len);
				2970	}
				2971	return list;
				2972
				2973	onError:
				2974	Py_DECREF(list);
				2975	return NULL;
				2976	}
				2977
				2978	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2979	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2980	{
				2981	register int i;
				2982	register int j;
				2983	int len;
				2984	PyObject *list;
				2985	PyObject *str;
				2986	Py_UNICODE *data;
				2987
				2988	string = PyUnicode_FromObject(string);
				2989	if (string == NULL)
				2990	return NULL;
				2991	data = PyUnicode_AS_UNICODE(string);
				2992	len = PyUnicode_GET_SIZE(string);
				2993
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2994	list = PyList_New(0);
				2995	if (!list)
				2996	goto onError;
				2997
				2998	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2999	int eol;
				3000
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3001	/* Find a line and append it */
				3002	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3003	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3004
				3005	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3006	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3007	if (i < len) {
				3008	if (data[i] == '\r' && i + 1 < len &&
				3009	data[i+1] == '\n')
				3010	i += 2;
				3011	else
				3012	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3013	if (keepends)
				3014	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3015	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3016	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3017	j = i;
				3018	}
				3019	if (j < len) {
				3020	SPLIT_APPEND(data, j, len);
				3021	}
				3022
				3023	Py_DECREF(string);
				3024	return list;
				3025
				3026	onError:
				3027	Py_DECREF(list);
				3028	Py_DECREF(string);
				3029	return NULL;
				3030	}
				3031
				3032	static
				3033	PyObject split_char(PyUnicodeObject self,
				3034	PyObject *list,
				3035	Py_UNICODE ch,
				3036	int maxcount)
				3037	{
				3038	register int i;
				3039	register int j;
				3040	int len = self->length;
				3041	PyObject *str;
				3042
				3043	for (i = j = 0; i < len; ) {
				3044	if (self->str[i] == ch) {
				3045	if (maxcount-- <= 0)
				3046	break;
				3047	SPLIT_APPEND(self->str, j, i);
				3048	i = j = i + 1;
				3049	} else
				3050	i++;
				3051	}
				3052	if (j <= len) {
				3053	SPLIT_APPEND(self->str, j, len);
				3054	}
				3055	return list;
				3056
				3057	onError:
				3058	Py_DECREF(list);
				3059	return NULL;
				3060	}
				3061
				3062	static
				3063	PyObject split_substring(PyUnicodeObject self,
				3064	PyObject *list,
				3065	PyUnicodeObject *substring,
				3066	int maxcount)
				3067	{
				3068	register int i;
				3069	register int j;
				3070	int len = self->length;
				3071	int sublen = substring->length;
				3072	PyObject *str;
				3073
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3074	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3075	if (Py_UNICODE_MATCH(self, i, substring)) {
				3076	if (maxcount-- <= 0)
				3077	break;
				3078	SPLIT_APPEND(self->str, j, i);
				3079	i = j = i + sublen;
				3080	} else
				3081	i++;
				3082	}
				3083	if (j <= len) {
				3084	SPLIT_APPEND(self->str, j, len);
				3085	}
				3086	return list;
				3087
				3088	onError:
				3089	Py_DECREF(list);
				3090	return NULL;
				3091	}
				3092
				3093	#undef SPLIT_APPEND
				3094
				3095	static
				3096	PyObject split(PyUnicodeObject self,
				3097	PyUnicodeObject *substring,
				3098	int maxcount)
				3099	{
				3100	PyObject *list;
				3101
				3102	if (maxcount < 0)
				3103	maxcount = INT_MAX;
				3104
				3105	list = PyList_New(0);
				3106	if (!list)
				3107	return NULL;
				3108
				3109	if (substring == NULL)
				3110	return split_whitespace(self,list,maxcount);
				3111
				3112	else if (substring->length == 1)
				3113	return split_char(self,list,substring->str[0],maxcount);
				3114
				3115	else if (substring->length == 0) {
				3116	Py_DECREF(list);
				3117	PyErr_SetString(PyExc_ValueError, "empty separator");
				3118	return NULL;
				3119	}
				3120	else
				3121	return split_substring(self,list,substring,maxcount);
				3122	}
				3123
				3124	static
				3125	PyObject strip(PyUnicodeObject self,
				3126	int left,
				3127	int right)
				3128	{
				3129	Py_UNICODE *p = self->str;
				3130	int start = 0;
				3131	int end = self->length;
				3132
				3133	if (left)
				3134	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3135	start++;
				3136
				3137	if (right)
				3138	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3139	end--;
				3140
				3141	if (start == 0 && end == self->length) {
				3142	/* couldn't strip anything off, return original string */
				3143	Py_INCREF(self);
				3144	return (PyObject*) self;
				3145	}
				3146
				3147	return (PyObject*) PyUnicode_FromUnicode(
				3148	self->str + start,
				3149	end - start
				3150	);
				3151	}
				3152
				3153	static
				3154	PyObject replace(PyUnicodeObject self,
				3155	PyUnicodeObject *str1,
				3156	PyUnicodeObject *str2,
				3157	int maxcount)
				3158	{
				3159	PyUnicodeObject *u;
				3160
				3161	if (maxcount < 0)
				3162	maxcount = INT_MAX;
				3163
				3164	if (str1->length == 1 && str2->length == 1) {
				3165	int i;
				3166
				3167	/* replace characters */
				3168	if (!findchar(self->str, self->length, str1->str[0])) {
				3169	/* nothing to replace, return original string */
				3170	Py_INCREF(self);
				3171	u = self;
				3172	} else {
				3173	Py_UNICODE u1 = str1->str[0];
				3174	Py_UNICODE u2 = str2->str[0];
				3175
				3176	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3177	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3178	self->length
				3179	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3180	if (u != NULL) {
				3181	Py_UNICODE_COPY(u->str, self->str,
				3182	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3183	for (i = 0; i < u->length; i++)
				3184	if (u->str[i] == u1) {
				3185	if (--maxcount < 0)
				3186	break;
				3187	u->str[i] = u2;
				3188	}
				3189	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3190	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3191
				3192	} else {
				3193	int n, i;
				3194	Py_UNICODE *p;
				3195
				3196	/* replace strings */
				3197	n = count(self, 0, self->length, str1);
				3198	if (n > maxcount)
				3199	n = maxcount;
				3200	if (n == 0) {
				3201	/* nothing to replace, return original string */
				3202	Py_INCREF(self);
				3203	u = self;
				3204	} else {
				3205	u = _PyUnicode_New(
				3206	self->length + n * (str2->length - str1->length));
				3207	if (u) {
				3208	i = 0;
				3209	p = u->str;
				3210	while (i <= self->length - str1->length)
				3211	if (Py_UNICODE_MATCH(self, i, str1)) {
				3212	/* replace string segment */
				3213	Py_UNICODE_COPY(p, str2->str, str2->length);
				3214	p += str2->length;
				3215	i += str1->length;
				3216	if (--n <= 0) {
				3217	/* copy remaining part */
				3218	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3219	break;
				3220	}
				3221	} else
				3222	*p++ = self->str[i++];
				3223	}
				3224	}
				3225	}
				3226
				3227	return (PyObject *) u;
				3228	}
				3229
				3230	/* --- Unicode Object Methods --------------------------------------------- */
				3231
				3232	static char title__doc__[] =
				3233	"S.title() -> unicode\n\
				3234	\n\
				3235	Return a titlecased version of S, i.e. words start with title case\n\
				3236	characters, all remaining cased characters have lower case.";
				3237
				3238	static PyObject*
				3239	unicode_title(PyUnicodeObject self, PyObject args)
				3240	{
				3241	if (!PyArg_NoArgs(args))
				3242	return NULL;
				3243	return fixup(self, fixtitle);
				3244	}
				3245
				3246	static char capitalize__doc__[] =
				3247	"S.capitalize() -> unicode\n\
				3248	\n\
				3249	Return a capitalized version of S, i.e. make the first character\n\
				3250	have upper case.";
				3251
				3252	static PyObject*
				3253	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3254	{
				3255	if (!PyArg_NoArgs(args))
				3256	return NULL;
				3257	return fixup(self, fixcapitalize);
				3258	}
				3259
				3260	#if 0
				3261	static char capwords__doc__[] =
				3262	"S.capwords() -> unicode\n\
				3263	\n\
				3264	Apply .capitalize() to all words in S and return the result with\n\
				3265	normalized whitespace (all whitespace strings are replaced by ' ').";
				3266
				3267	static PyObject*
				3268	unicode_capwords(PyUnicodeObject self, PyObject args)
				3269	{
				3270	PyObject *list;
				3271	PyObject *item;
				3272	int i;
				3273
				3274	if (!PyArg_NoArgs(args))
				3275	return NULL;
				3276
				3277	/* Split into words */
				3278	list = split(self, NULL, -1);
				3279	if (!list)
				3280	return NULL;
				3281
				3282	/* Capitalize each word */
				3283	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3284	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3285	fixcapitalize);
				3286	if (item == NULL)
				3287	goto onError;
				3288	Py_DECREF(PyList_GET_ITEM(list, i));
				3289	PyList_SET_ITEM(list, i, item);
				3290	}
				3291
				3292	/* Join the words to form a new string */
				3293	item = PyUnicode_Join(NULL, list);
				3294
				3295	onError:
				3296	Py_DECREF(list);
				3297	return (PyObject *)item;
				3298	}
				3299	#endif
				3300
				3301	static char center__doc__[] =
				3302	"S.center(width) -> unicode\n\
				3303	\n\
				3304	Return S centered in a Unicode string of length width. Padding is done\n\
				3305	using spaces.";
				3306
				3307	static PyObject *
				3308	unicode_center(PyUnicodeObject self, PyObject args)
				3309	{
				3310	int marg, left;
				3311	int width;
				3312
				3313	if (!PyArg_ParseTuple(args, "i:center", &width))
				3314	return NULL;
				3315
				3316	if (self->length >= width) {
				3317	Py_INCREF(self);
				3318	return (PyObject*) self;
				3319	}
				3320
				3321	marg = width - self->length;
				3322	left = marg / 2 + (marg & width & 1);
				3323
				3324	return (PyObject*) pad(self, left, marg - left, ' ');
				3325	}
				3326
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3327	#if 0
				3328
				3329	/* This code should go into some future Unicode collation support
				3330	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3331	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3332
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3333	/* speedy UTF-16 code point order comparison */
				3334	/* gleaned from: */
				3335	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3336
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3337	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3338	{
				3339	0, 0, 0, 0, 0, 0, 0, 0,
				3340	0, 0, 0, 0, 0, 0, 0, 0,
				3341	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3342	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3343	};
				3344
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3345	static int
				3346	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3347	{
				3348	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3349
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3350	Py_UNICODE *s1 = str1->str;
				3351	Py_UNICODE *s2 = str2->str;
				3352
				3353	len1 = str1->length;
				3354	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3355
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3356	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3357	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3358
				3359	c1 = *s1++;
				3360	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3361
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3362	if (c1 > (1<<11) * 26)
				3363	c1 += utf16Fixup[c1>>11];
				3364	if (c2 > (1<<11) * 26)
				3365	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3366	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3367
				3368	if (c1 != c2)
				3369	return (c1 < c2) ? -1 : 1;
				3370
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3371	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3372	}
				3373
				3374	return (len1 < len2) ? -1 : (len1 != len2);
				3375	}
				3376
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3377	#else
				3378
				3379	static int
				3380	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3381	{
				3382	register int len1, len2;
				3383
				3384	Py_UNICODE *s1 = str1->str;
				3385	Py_UNICODE *s2 = str2->str;
				3386
				3387	len1 = str1->length;
				3388	len2 = str2->length;
				3389
				3390	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3391	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3392
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3393	c1 = *s1++;
				3394	c2 = *s2++;
				3395
				3396	if (c1 != c2)
				3397	return (c1 < c2) ? -1 : 1;
				3398
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3399	len1--; len2--;
				3400	}
				3401
				3402	return (len1 < len2) ? -1 : (len1 != len2);
				3403	}
				3404
				3405	#endif
				3406
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3407	int PyUnicode_Compare(PyObject *left,
				3408	PyObject *right)
				3409	{
				3410	PyUnicodeObject u = NULL, v = NULL;
				3411	int result;
				3412
				3413	/* Coerce the two arguments */
				3414	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3415	if (u == NULL)
				3416	goto onError;
				3417	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3418	if (v == NULL)
				3419	goto onError;
				3420
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3421	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3422	if (v == u) {
				3423	Py_DECREF(u);
				3424	Py_DECREF(v);
				3425	return 0;
				3426	}
				3427
				3428	result = unicode_compare(u, v);
				3429
				3430	Py_DECREF(u);
				3431	Py_DECREF(v);
				3432	return result;
				3433
				3434	onError:
				3435	Py_XDECREF(u);
				3436	Py_XDECREF(v);
				3437	return -1;
				3438	}
				3439
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3440	int PyUnicode_Contains(PyObject *container,
				3441	PyObject *element)
				3442	{
				3443	PyUnicodeObject u = NULL, v = NULL;
				3444	int result;
				3445	register const Py_UNICODE p, e;
				3446	register Py_UNICODE ch;
				3447
				3448	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3449	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3450	if (v == NULL) {
				3451	PyErr_SetString(PyExc_TypeError,
				3452	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3453	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3454	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3455	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3456	if (u == NULL) {
				3457	Py_DECREF(v);
				3458	goto onError;
				3459	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3460
				3461	/* Check v in u */
				3462	if (PyUnicode_GET_SIZE(v) != 1) {
				3463	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3464	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3465	goto onError;
				3466	}
				3467	ch = *PyUnicode_AS_UNICODE(v);
				3468	p = PyUnicode_AS_UNICODE(u);
				3469	e = p + PyUnicode_GET_SIZE(u);
				3470	result = 0;
				3471	while (p < e) {
				3472	if (*p++ == ch) {
				3473	result = 1;
				3474	break;
				3475	}
				3476	}
				3477
				3478	Py_DECREF(u);
				3479	Py_DECREF(v);
				3480	return result;
				3481
				3482	onError:
				3483	Py_XDECREF(u);
				3484	Py_XDECREF(v);
				3485	return -1;
				3486	}
				3487
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3488	/* Concat to string or Unicode object giving a new Unicode object. */
				3489
				3490	PyObject PyUnicode_Concat(PyObject left,
				3491	PyObject *right)
				3492	{
				3493	PyUnicodeObject u = NULL, v = NULL, *w;
				3494
				3495	/* Coerce the two arguments */
				3496	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3497	if (u == NULL)
				3498	goto onError;
				3499	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3500	if (v == NULL)
				3501	goto onError;
				3502
				3503	/* Shortcuts */
				3504	if (v == unicode_empty) {
				3505	Py_DECREF(v);
				3506	return (PyObject *)u;
				3507	}
				3508	if (u == unicode_empty) {
				3509	Py_DECREF(u);
				3510	return (PyObject *)v;
				3511	}
				3512
				3513	/* Concat the two Unicode strings */
				3514	w = _PyUnicode_New(u->length + v->length);
				3515	if (w == NULL)
				3516	goto onError;
				3517	Py_UNICODE_COPY(w->str, u->str, u->length);
				3518	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3519
				3520	Py_DECREF(u);
				3521	Py_DECREF(v);
				3522	return (PyObject *)w;
				3523
				3524	onError:
				3525	Py_XDECREF(u);
				3526	Py_XDECREF(v);
				3527	return NULL;
				3528	}
				3529
				3530	static char count__doc__[] =
				3531	"S.count(sub[, start[, end]]) -> int\n\
				3532	\n\
				3533	Return the number of occurrences of substring sub in Unicode string\n\
				3534	S[start:end]. Optional arguments start and end are\n\
				3535	interpreted as in slice notation.";
				3536
				3537	static PyObject *
				3538	unicode_count(PyUnicodeObject self, PyObject args)
				3539	{
				3540	PyUnicodeObject *substring;
				3541	int start = 0;
				3542	int end = INT_MAX;
				3543	PyObject *result;
				3544
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3545	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3546	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3547	return NULL;
				3548
				3549	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3550	(PyObject *)substring);
				3551	if (substring == NULL)
				3552	return NULL;
				3553
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3554	if (start < 0)
				3555	start += self->length;
				3556	if (start < 0)
				3557	start = 0;
				3558	if (end > self->length)
				3559	end = self->length;
				3560	if (end < 0)
				3561	end += self->length;
				3562	if (end < 0)
				3563	end = 0;
				3564
				3565	result = PyInt_FromLong((long) count(self, start, end, substring));
				3566
				3567	Py_DECREF(substring);
				3568	return result;
				3569	}
				3570
				3571	static char encode__doc__[] =
				3572	"S.encode([encoding[,errors]]) -> string\n\
				3573	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3574	Return an encoded string version of S. Default encoding is the current\n\
				3575	default string encoding. errors may be given to set a different error\n\
				3576	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3577	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3578
				3579	static PyObject *
				3580	unicode_encode(PyUnicodeObject self, PyObject args)
				3581	{
				3582	char *encoding = NULL;
				3583	char *errors = NULL;
				3584	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3585	return NULL;
				3586	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3587	}
				3588
				3589	static char expandtabs__doc__[] =
				3590	"S.expandtabs([tabsize]) -> unicode\n\
				3591	\n\
				3592	Return a copy of S where all tab characters are expanded using spaces.\n\
				3593	If tabsize is not given, a tab size of 8 characters is assumed.";
				3594
				3595	static PyObject*
				3596	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3597	{
				3598	Py_UNICODE *e;
				3599	Py_UNICODE *p;
				3600	Py_UNICODE *q;
				3601	int i, j;
				3602	PyUnicodeObject *u;
				3603	int tabsize = 8;
				3604
				3605	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3606	return NULL;
				3607
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3608	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3609	i = j = 0;
				3610	e = self->str + self->length;
				3611	for (p = self->str; p < e; p++)
				3612	if (*p == '\t') {
				3613	if (tabsize > 0)
				3614	j += tabsize - (j % tabsize);
				3615	}
				3616	else {
				3617	j++;
				3618	if (p == '\n' \|\| p == '\r') {
				3619	i += j;
				3620	j = 0;
				3621	}
				3622	}
				3623
				3624	/* Second pass: create output string and fill it */
				3625	u = _PyUnicode_New(i + j);
				3626	if (!u)
				3627	return NULL;
				3628
				3629	j = 0;
				3630	q = u->str;
				3631
				3632	for (p = self->str; p < e; p++)
				3633	if (*p == '\t') {
				3634	if (tabsize > 0) {
				3635	i = tabsize - (j % tabsize);
				3636	j += i;
				3637	while (i--)
				3638	*q++ = ' ';
				3639	}
				3640	}
				3641	else {
				3642	j++;
				3643	q++ = p;
				3644	if (p == '\n' \|\| p == '\r')
				3645	j = 0;
				3646	}
				3647
				3648	return (PyObject*) u;
				3649	}
				3650
				3651	static char find__doc__[] =
				3652	"S.find(sub [,start [,end]]) -> int\n\
				3653	\n\
				3654	Return the lowest index in S where substring sub is found,\n\
				3655	such that sub is contained within s[start,end]. Optional\n\
				3656	arguments start and end are interpreted as in slice notation.\n\
				3657	\n\
				3658	Return -1 on failure.";
				3659
				3660	static PyObject *
				3661	unicode_find(PyUnicodeObject self, PyObject args)
				3662	{
				3663	PyUnicodeObject *substring;
				3664	int start = 0;
				3665	int end = INT_MAX;
				3666	PyObject *result;
				3667
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3668	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3669	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3670	return NULL;
				3671	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3672	(PyObject *)substring);
				3673	if (substring == NULL)
				3674	return NULL;
				3675
				3676	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3677
				3678	Py_DECREF(substring);
				3679	return result;
				3680	}
				3681
				3682	static PyObject *
				3683	unicode_getitem(PyUnicodeObject *self, int index)
				3684	{
				3685	if (index < 0 \|\| index >= self->length) {
				3686	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3687	return NULL;
				3688	}
				3689
				3690	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3691	}
				3692
				3693	static long
				3694	unicode_hash(PyUnicodeObject *self)
				3695	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3696	/* Since Unicode objects compare equal to their ASCII string
				3697	counterparts, they should use the individual character values
				3698	as basis for their hash value. This is needed to assure that
				3699	strings and Unicode objects behave in the same way as
				3700	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3701
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3702	register int len;
				3703	register Py_UNICODE *p;
				3704	register long x;
				3705
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3706	if (self->hash != -1)
				3707	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3708	len = PyUnicode_GET_SIZE(self);
				3709	p = PyUnicode_AS_UNICODE(self);
				3710	x = *p << 7;
				3711	while (--len >= 0)
				3712	x = (1000003x) ^ p++;
				3713	x ^= PyUnicode_GET_SIZE(self);
				3714	if (x == -1)
				3715	x = -2;
				3716	self->hash = x;
				3717	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3718	}
				3719
				3720	static char index__doc__[] =
				3721	"S.index(sub [,start [,end]]) -> int\n\
				3722	\n\
				3723	Like S.find() but raise ValueError when the substring is not found.";
				3724
				3725	static PyObject *
				3726	unicode_index(PyUnicodeObject self, PyObject args)
				3727	{
				3728	int result;
				3729	PyUnicodeObject *substring;
				3730	int start = 0;
				3731	int end = INT_MAX;
				3732
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3733	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3734	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3735	return NULL;
				3736
				3737	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3738	(PyObject *)substring);
				3739	if (substring == NULL)
				3740	return NULL;
				3741
				3742	result = findstring(self, substring, start, end, 1);
				3743
				3744	Py_DECREF(substring);
				3745	if (result < 0) {
				3746	PyErr_SetString(PyExc_ValueError, "substring not found");
				3747	return NULL;
				3748	}
				3749	return PyInt_FromLong(result);
				3750	}
				3751
				3752	static char islower__doc__[] =
				3753	"S.islower() -> int\n\
				3754	\n\
				3755	Return 1 if all cased characters in S are lowercase and there is\n\
				3756	at least one cased character in S, 0 otherwise.";
				3757
				3758	static PyObject*
				3759	unicode_islower(PyUnicodeObject self, PyObject args)
				3760	{
				3761	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3762	register const Py_UNICODE *e;
				3763	int cased;
				3764
				3765	if (!PyArg_NoArgs(args))
				3766	return NULL;
				3767
				3768	/* Shortcut for single character strings */
				3769	if (PyUnicode_GET_SIZE(self) == 1)
				3770	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3771
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3772	/* Special case for empty strings */
				3773	if (PyString_GET_SIZE(self) == 0)
				3774	return PyInt_FromLong(0);
				3775
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3776	e = p + PyUnicode_GET_SIZE(self);
				3777	cased = 0;
				3778	for (; p < e; p++) {
				3779	register const Py_UNICODE ch = *p;
				3780
				3781	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3782	return PyInt_FromLong(0);
				3783	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3784	cased = 1;
				3785	}
				3786	return PyInt_FromLong(cased);
				3787	}
				3788
				3789	static char isupper__doc__[] =
				3790	"S.isupper() -> int\n\
				3791	\n\
				3792	Return 1 if all cased characters in S are uppercase and there is\n\
				3793	at least one cased character in S, 0 otherwise.";
				3794
				3795	static PyObject*
				3796	unicode_isupper(PyUnicodeObject self, PyObject args)
				3797	{
				3798	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3799	register const Py_UNICODE *e;
				3800	int cased;
				3801
				3802	if (!PyArg_NoArgs(args))
				3803	return NULL;
				3804
				3805	/* Shortcut for single character strings */
				3806	if (PyUnicode_GET_SIZE(self) == 1)
				3807	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3808
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3809	/* Special case for empty strings */
				3810	if (PyString_GET_SIZE(self) == 0)
				3811	return PyInt_FromLong(0);
				3812
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3813	e = p + PyUnicode_GET_SIZE(self);
				3814	cased = 0;
				3815	for (; p < e; p++) {
				3816	register const Py_UNICODE ch = *p;
				3817
				3818	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3819	return PyInt_FromLong(0);
				3820	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3821	cased = 1;
				3822	}
				3823	return PyInt_FromLong(cased);
				3824	}
				3825
				3826	static char istitle__doc__[] =
				3827	"S.istitle() -> int\n\
				3828	\n\
				3829	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3830	may only follow uncased characters and lowercase characters only cased\n\
				3831	ones. Return 0 otherwise.";
				3832
				3833	static PyObject*
				3834	unicode_istitle(PyUnicodeObject self, PyObject args)
				3835	{
				3836	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3837	register const Py_UNICODE *e;
				3838	int cased, previous_is_cased;
				3839
				3840	if (!PyArg_NoArgs(args))
				3841	return NULL;
				3842
				3843	/* Shortcut for single character strings */
				3844	if (PyUnicode_GET_SIZE(self) == 1)
				3845	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3846	(Py_UNICODE_ISUPPER(*p) != 0));
				3847
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3848	/* Special case for empty strings */
				3849	if (PyString_GET_SIZE(self) == 0)
				3850	return PyInt_FromLong(0);
				3851
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3852	e = p + PyUnicode_GET_SIZE(self);
				3853	cased = 0;
				3854	previous_is_cased = 0;
				3855	for (; p < e; p++) {
				3856	register const Py_UNICODE ch = *p;
				3857
				3858	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3859	if (previous_is_cased)
				3860	return PyInt_FromLong(0);
				3861	previous_is_cased = 1;
				3862	cased = 1;
				3863	}
				3864	else if (Py_UNICODE_ISLOWER(ch)) {
				3865	if (!previous_is_cased)
				3866	return PyInt_FromLong(0);
				3867	previous_is_cased = 1;
				3868	cased = 1;
				3869	}
				3870	else
				3871	previous_is_cased = 0;
				3872	}
				3873	return PyInt_FromLong(cased);
				3874	}
				3875
				3876	static char isspace__doc__[] =
				3877	"S.isspace() -> int\n\
				3878	\n\
				3879	Return 1 if there are only whitespace characters in S,\n\
				3880	0 otherwise.";
				3881
				3882	static PyObject*
				3883	unicode_isspace(PyUnicodeObject self, PyObject args)
				3884	{
				3885	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3886	register const Py_UNICODE *e;
				3887
				3888	if (!PyArg_NoArgs(args))
				3889	return NULL;
				3890
				3891	/* Shortcut for single character strings */
				3892	if (PyUnicode_GET_SIZE(self) == 1 &&
				3893	Py_UNICODE_ISSPACE(*p))
				3894	return PyInt_FromLong(1);
				3895
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3896	/* Special case for empty strings */
				3897	if (PyString_GET_SIZE(self) == 0)
				3898	return PyInt_FromLong(0);
				3899
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3900	e = p + PyUnicode_GET_SIZE(self);
				3901	for (; p < e; p++) {
				3902	if (!Py_UNICODE_ISSPACE(*p))
				3903	return PyInt_FromLong(0);
				3904	}
				3905	return PyInt_FromLong(1);
				3906	}
				3907
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3908	static char isalpha__doc__[] =
				3909	"S.isalpha() -> int\n\
				3910	\n\
				3911	Return 1 if all characters in S are alphabetic\n\
				3912	and there is at least one character in S, 0 otherwise.";
				3913
				3914	static PyObject*
				3915	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3916	{
				3917	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3918	register const Py_UNICODE *e;
				3919
				3920	if (!PyArg_NoArgs(args))
				3921	return NULL;
				3922
				3923	/* Shortcut for single character strings */
				3924	if (PyUnicode_GET_SIZE(self) == 1 &&
				3925	Py_UNICODE_ISALPHA(*p))
				3926	return PyInt_FromLong(1);
				3927
				3928	/* Special case for empty strings */
				3929	if (PyString_GET_SIZE(self) == 0)
				3930	return PyInt_FromLong(0);
				3931
				3932	e = p + PyUnicode_GET_SIZE(self);
				3933	for (; p < e; p++) {
				3934	if (!Py_UNICODE_ISALPHA(*p))
				3935	return PyInt_FromLong(0);
				3936	}
				3937	return PyInt_FromLong(1);
				3938	}
				3939
				3940	static char isalnum__doc__[] =
				3941	"S.isalnum() -> int\n\
				3942	\n\
				3943	Return 1 if all characters in S are alphanumeric\n\
				3944	and there is at least one character in S, 0 otherwise.";
				3945
				3946	static PyObject*
				3947	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3948	{
				3949	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3950	register const Py_UNICODE *e;
				3951
				3952	if (!PyArg_NoArgs(args))
				3953	return NULL;
				3954
				3955	/* Shortcut for single character strings */
				3956	if (PyUnicode_GET_SIZE(self) == 1 &&
				3957	Py_UNICODE_ISALNUM(*p))
				3958	return PyInt_FromLong(1);
				3959
				3960	/* Special case for empty strings */
				3961	if (PyString_GET_SIZE(self) == 0)
				3962	return PyInt_FromLong(0);
				3963
				3964	e = p + PyUnicode_GET_SIZE(self);
				3965	for (; p < e; p++) {
				3966	if (!Py_UNICODE_ISALNUM(*p))
				3967	return PyInt_FromLong(0);
				3968	}
				3969	return PyInt_FromLong(1);
				3970	}
				3971
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3972	static char isdecimal__doc__[] =
				3973	"S.isdecimal() -> int\n\
				3974	\n\
				3975	Return 1 if there are only decimal characters in S,\n\
				3976	0 otherwise.";
				3977
				3978	static PyObject*
				3979	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3980	{
				3981	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3982	register const Py_UNICODE *e;
				3983
				3984	if (!PyArg_NoArgs(args))
				3985	return NULL;
				3986
				3987	/* Shortcut for single character strings */
				3988	if (PyUnicode_GET_SIZE(self) == 1 &&
				3989	Py_UNICODE_ISDECIMAL(*p))
				3990	return PyInt_FromLong(1);
				3991
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3992	/* Special case for empty strings */
				3993	if (PyString_GET_SIZE(self) == 0)
				3994	return PyInt_FromLong(0);
				3995
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3996	e = p + PyUnicode_GET_SIZE(self);
				3997	for (; p < e; p++) {
				3998	if (!Py_UNICODE_ISDECIMAL(*p))
				3999	return PyInt_FromLong(0);
				4000	}
				4001	return PyInt_FromLong(1);
				4002	}
				4003
				4004	static char isdigit__doc__[] =
				4005	"S.isdigit() -> int\n\
				4006	\n\
				4007	Return 1 if there are only digit characters in S,\n\
				4008	0 otherwise.";
				4009
				4010	static PyObject*
				4011	unicode_isdigit(PyUnicodeObject self, PyObject args)
				4012	{
				4013	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4014	register const Py_UNICODE *e;
				4015
				4016	if (!PyArg_NoArgs(args))
				4017	return NULL;
				4018
				4019	/* Shortcut for single character strings */
				4020	if (PyUnicode_GET_SIZE(self) == 1 &&
				4021	Py_UNICODE_ISDIGIT(*p))
				4022	return PyInt_FromLong(1);
				4023
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4024	/* Special case for empty strings */
				4025	if (PyString_GET_SIZE(self) == 0)
				4026	return PyInt_FromLong(0);
				4027
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4028	e = p + PyUnicode_GET_SIZE(self);
				4029	for (; p < e; p++) {
				4030	if (!Py_UNICODE_ISDIGIT(*p))
				4031	return PyInt_FromLong(0);
				4032	}
				4033	return PyInt_FromLong(1);
				4034	}
				4035
				4036	static char isnumeric__doc__[] =
				4037	"S.isnumeric() -> int\n\
				4038	\n\
				4039	Return 1 if there are only numeric characters in S,\n\
				4040	0 otherwise.";
				4041
				4042	static PyObject*
				4043	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				4044	{
				4045	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4046	register const Py_UNICODE *e;
				4047
				4048	if (!PyArg_NoArgs(args))
				4049	return NULL;
				4050
				4051	/* Shortcut for single character strings */
				4052	if (PyUnicode_GET_SIZE(self) == 1 &&
				4053	Py_UNICODE_ISNUMERIC(*p))
				4054	return PyInt_FromLong(1);
				4055
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4056	/* Special case for empty strings */
				4057	if (PyString_GET_SIZE(self) == 0)
				4058	return PyInt_FromLong(0);
				4059
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4060	e = p + PyUnicode_GET_SIZE(self);
				4061	for (; p < e; p++) {
				4062	if (!Py_UNICODE_ISNUMERIC(*p))
				4063	return PyInt_FromLong(0);
				4064	}
				4065	return PyInt_FromLong(1);
				4066	}
				4067
				4068	static char join__doc__[] =
				4069	"S.join(sequence) -> unicode\n\
				4070	\n\
				4071	Return a string which is the concatenation of the strings in the\n\
				4072	sequence. The separator between elements is S.";
				4073
				4074	static PyObject*
				4075	unicode_join(PyUnicodeObject self, PyObject args)
				4076	{
				4077	PyObject *data;
				4078	if (!PyArg_ParseTuple(args, "O:join", &data))
				4079	return NULL;
				4080
				4081	return PyUnicode_Join((PyObject *)self, data);
				4082	}
				4083
				4084	static int
				4085	unicode_length(PyUnicodeObject *self)
				4086	{
				4087	return self->length;
				4088	}
				4089
				4090	static char ljust__doc__[] =
				4091	"S.ljust(width) -> unicode\n\
				4092	\n\
				4093	Return S left justified in a Unicode string of length width. Padding is\n\
				4094	done using spaces.";
				4095
				4096	static PyObject *
				4097	unicode_ljust(PyUnicodeObject self, PyObject args)
				4098	{
				4099	int width;
				4100	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4101	return NULL;
				4102
				4103	if (self->length >= width) {
				4104	Py_INCREF(self);
				4105	return (PyObject*) self;
				4106	}
				4107
				4108	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4109	}
				4110
				4111	static char lower__doc__[] =
				4112	"S.lower() -> unicode\n\
				4113	\n\
				4114	Return a copy of the string S converted to lowercase.";
				4115
				4116	static PyObject*
				4117	unicode_lower(PyUnicodeObject self, PyObject args)
				4118	{
				4119	if (!PyArg_NoArgs(args))
				4120	return NULL;
				4121	return fixup(self, fixlower);
				4122	}
				4123
				4124	static char lstrip__doc__[] =
				4125	"S.lstrip() -> unicode\n\
				4126	\n\
				4127	Return a copy of the string S with leading whitespace removed.";
				4128
				4129	static PyObject *
				4130	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4131	{
				4132	if (!PyArg_NoArgs(args))
				4133	return NULL;
				4134	return strip(self, 1, 0);
				4135	}
				4136
				4137	static PyObject*
				4138	unicode_repeat(PyUnicodeObject *str, int len)
				4139	{
				4140	PyUnicodeObject *u;
				4141	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4142	int nchars;
				4143	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4144
				4145	if (len < 0)
				4146	len = 0;
				4147
				4148	if (len == 1) {
				4149	/* no repeat, return original string */
				4150	Py_INCREF(str);
				4151	return (PyObject*) str;
				4152	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4153
				4154	/* ensure # of chars needed doesn't overflow int and # of bytes
				4155	* needed doesn't overflow size_t
				4156	*/
				4157	nchars = len * str->length;
				4158	if (len && nchars / len != str->length) {
				4159	PyErr_SetString(PyExc_OverflowError,
				4160	"repeated string is too long");
				4161	return NULL;
				4162	}
				4163	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4164	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4165	PyErr_SetString(PyExc_OverflowError,
				4166	"repeated string is too long");
				4167	return NULL;
				4168	}
				4169	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4170	if (!u)
				4171	return NULL;
				4172
				4173	p = u->str;
				4174
				4175	while (len-- > 0) {
				4176	Py_UNICODE_COPY(p, str->str, str->length);
				4177	p += str->length;
				4178	}
				4179
				4180	return (PyObject*) u;
				4181	}
				4182
				4183	PyObject PyUnicode_Replace(PyObject obj,
				4184	PyObject *subobj,
				4185	PyObject *replobj,
				4186	int maxcount)
				4187	{
				4188	PyObject *self;
				4189	PyObject *str1;
				4190	PyObject *str2;
				4191	PyObject *result;
				4192
				4193	self = PyUnicode_FromObject(obj);
				4194	if (self == NULL)
				4195	return NULL;
				4196	str1 = PyUnicode_FromObject(subobj);
				4197	if (str1 == NULL) {
				4198	Py_DECREF(self);
				4199	return NULL;
				4200	}
				4201	str2 = PyUnicode_FromObject(replobj);
				4202	if (str2 == NULL) {
				4203	Py_DECREF(self);
				4204	Py_DECREF(str1);
				4205	return NULL;
				4206	}
				4207	result = replace((PyUnicodeObject *)self,
				4208	(PyUnicodeObject *)str1,
				4209	(PyUnicodeObject *)str2,
				4210	maxcount);
				4211	Py_DECREF(self);
				4212	Py_DECREF(str1);
				4213	Py_DECREF(str2);
				4214	return result;
				4215	}
				4216
				4217	static char replace__doc__[] =
				4218	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4219	\n\
				4220	Return a copy of S with all occurrences of substring\n\
				4221	old replaced by new. If the optional argument maxsplit is\n\
				4222	given, only the first maxsplit occurrences are replaced.";
				4223
				4224	static PyObject*
				4225	unicode_replace(PyUnicodeObject self, PyObject args)
				4226	{
				4227	PyUnicodeObject *str1;
				4228	PyUnicodeObject *str2;
				4229	int maxcount = -1;
				4230	PyObject *result;
				4231
				4232	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4233	return NULL;
				4234	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4235	if (str1 == NULL)
				4236	return NULL;
				4237	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4238	if (str2 == NULL)
				4239	return NULL;
				4240
				4241	result = replace(self, str1, str2, maxcount);
				4242
				4243	Py_DECREF(str1);
				4244	Py_DECREF(str2);
				4245	return result;
				4246	}
				4247
				4248	static
				4249	PyObject unicode_repr(PyObject unicode)
				4250	{
				4251	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4252	PyUnicode_GET_SIZE(unicode),
				4253	1);
				4254	}
				4255
				4256	static char rfind__doc__[] =
				4257	"S.rfind(sub [,start [,end]]) -> int\n\
				4258	\n\
				4259	Return the highest index in S where substring sub is found,\n\
				4260	such that sub is contained within s[start,end]. Optional\n\
				4261	arguments start and end are interpreted as in slice notation.\n\
				4262	\n\
				4263	Return -1 on failure.";
				4264
				4265	static PyObject *
				4266	unicode_rfind(PyUnicodeObject self, PyObject args)
				4267	{
				4268	PyUnicodeObject *substring;
				4269	int start = 0;
				4270	int end = INT_MAX;
				4271	PyObject *result;
				4272
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4273	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4274	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4275	return NULL;
				4276	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4277	(PyObject *)substring);
				4278	if (substring == NULL)
				4279	return NULL;
				4280
				4281	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4282
				4283	Py_DECREF(substring);
				4284	return result;
				4285	}
				4286
				4287	static char rindex__doc__[] =
				4288	"S.rindex(sub [,start [,end]]) -> int\n\
				4289	\n\
				4290	Like S.rfind() but raise ValueError when the substring is not found.";
				4291
				4292	static PyObject *
				4293	unicode_rindex(PyUnicodeObject self, PyObject args)
				4294	{
				4295	int result;
				4296	PyUnicodeObject *substring;
				4297	int start = 0;
				4298	int end = INT_MAX;
				4299
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4300	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4301	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4302	return NULL;
				4303	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4304	(PyObject *)substring);
				4305	if (substring == NULL)
				4306	return NULL;
				4307
				4308	result = findstring(self, substring, start, end, -1);
				4309
				4310	Py_DECREF(substring);
				4311	if (result < 0) {
				4312	PyErr_SetString(PyExc_ValueError, "substring not found");
				4313	return NULL;
				4314	}
				4315	return PyInt_FromLong(result);
				4316	}
				4317
				4318	static char rjust__doc__[] =
				4319	"S.rjust(width) -> unicode\n\
				4320	\n\
				4321	Return S right justified in a Unicode string of length width. Padding is\n\
				4322	done using spaces.";
				4323
				4324	static PyObject *
				4325	unicode_rjust(PyUnicodeObject self, PyObject args)
				4326	{
				4327	int width;
				4328	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4329	return NULL;
				4330
				4331	if (self->length >= width) {
				4332	Py_INCREF(self);
				4333	return (PyObject*) self;
				4334	}
				4335
				4336	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4337	}
				4338
				4339	static char rstrip__doc__[] =
				4340	"S.rstrip() -> unicode\n\
				4341	\n\
				4342	Return a copy of the string S with trailing whitespace removed.";
				4343
				4344	static PyObject *
				4345	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4346	{
				4347	if (!PyArg_NoArgs(args))
				4348	return NULL;
				4349	return strip(self, 0, 1);
				4350	}
				4351
				4352	static PyObject*
				4353	unicode_slice(PyUnicodeObject *self, int start, int end)
				4354	{
				4355	/* standard clamping */
				4356	if (start < 0)
				4357	start = 0;
				4358	if (end < 0)
				4359	end = 0;
				4360	if (end > self->length)
				4361	end = self->length;
				4362	if (start == 0 && end == self->length) {
				4363	/* full slice, return original string */
				4364	Py_INCREF(self);
				4365	return (PyObject*) self;
				4366	}
				4367	if (start > end)
				4368	start = end;
				4369	/* copy slice */
				4370	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4371	end - start);
				4372	}
				4373
				4374	PyObject PyUnicode_Split(PyObject s,
				4375	PyObject *sep,
				4376	int maxsplit)
				4377	{
				4378	PyObject *result;
				4379
				4380	s = PyUnicode_FromObject(s);
				4381	if (s == NULL)
				4382	return NULL;
				4383	if (sep != NULL) {
				4384	sep = PyUnicode_FromObject(sep);
				4385	if (sep == NULL) {
				4386	Py_DECREF(s);
				4387	return NULL;
				4388	}
				4389	}
				4390
				4391	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4392
				4393	Py_DECREF(s);
				4394	Py_XDECREF(sep);
				4395	return result;
				4396	}
				4397
				4398	static char split__doc__[] =
				4399	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4400	\n\
				4401	Return a list of the words in S, using sep as the\n\
				4402	delimiter string. If maxsplit is given, at most maxsplit\n\
				4403	splits are done. If sep is not specified, any whitespace string\n\
				4404	is a separator.";
				4405
				4406	static PyObject*
				4407	unicode_split(PyUnicodeObject self, PyObject args)
				4408	{
				4409	PyObject *substring = Py_None;
				4410	int maxcount = -1;
				4411
				4412	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4413	return NULL;
				4414
				4415	if (substring == Py_None)
				4416	return split(self, NULL, maxcount);
				4417	else if (PyUnicode_Check(substring))
				4418	return split(self, (PyUnicodeObject *)substring, maxcount);
				4419	else
				4420	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4421	}
				4422
				4423	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4424	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4425	\n\
				4426	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4427	Line breaks are not included in the resulting list unless keepends\n\
				4428	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4429
				4430	static PyObject*
				4431	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4432	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4433	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4434
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4435	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4436	return NULL;
				4437
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4438	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4439	}
				4440
				4441	static
				4442	PyObject unicode_str(PyUnicodeObject self)
				4443	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4444	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4445	}
				4446
				4447	static char strip__doc__[] =
				4448	"S.strip() -> unicode\n\
				4449	\n\
				4450	Return a copy of S with leading and trailing whitespace removed.";
				4451
				4452	static PyObject *
				4453	unicode_strip(PyUnicodeObject self, PyObject args)
				4454	{
				4455	if (!PyArg_NoArgs(args))
				4456	return NULL;
				4457	return strip(self, 1, 1);
				4458	}
				4459
				4460	static char swapcase__doc__[] =
				4461	"S.swapcase() -> unicode\n\
				4462	\n\
				4463	Return a copy of S with uppercase characters converted to lowercase\n\
				4464	and vice versa.";
				4465
				4466	static PyObject*
				4467	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4468	{
				4469	if (!PyArg_NoArgs(args))
				4470	return NULL;
				4471	return fixup(self, fixswapcase);
				4472	}
				4473
				4474	static char translate__doc__[] =
				4475	"S.translate(table) -> unicode\n\
				4476	\n\
				4477	Return a copy of the string S, where all characters have been mapped\n\
				4478	through the given translation table, which must be a mapping of\n\
				4479	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4480	are left untouched. Characters mapped to None are deleted.";
				4481
				4482	static PyObject*
				4483	unicode_translate(PyUnicodeObject self, PyObject args)
				4484	{
				4485	PyObject *table;
				4486
				4487	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4488	return NULL;
				4489	return PyUnicode_TranslateCharmap(self->str,
				4490	self->length,
				4491	table,
				4492	"ignore");
				4493	}
				4494
				4495	static char upper__doc__[] =
				4496	"S.upper() -> unicode\n\
				4497	\n\
				4498	Return a copy of S converted to uppercase.";
				4499
				4500	static PyObject*
				4501	unicode_upper(PyUnicodeObject self, PyObject args)
				4502	{
				4503	if (!PyArg_NoArgs(args))
				4504	return NULL;
				4505	return fixup(self, fixupper);
				4506	}
				4507
				4508	#if 0
				4509	static char zfill__doc__[] =
				4510	"S.zfill(width) -> unicode\n\
				4511	\n\
				4512	Pad a numeric string x with zeros on the left, to fill a field\n\
				4513	of the specified width. The string x is never truncated.";
				4514
				4515	static PyObject *
				4516	unicode_zfill(PyUnicodeObject self, PyObject args)
				4517	{
				4518	int fill;
				4519	PyUnicodeObject *u;
				4520
				4521	int width;
				4522	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4523	return NULL;
				4524
				4525	if (self->length >= width) {
				4526	Py_INCREF(self);
				4527	return (PyObject*) self;
				4528	}
				4529
				4530	fill = width - self->length;
				4531
				4532	u = pad(self, fill, 0, '0');
				4533
				4534	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4535	/* move sign to beginning of string */
				4536	u->str[0] = u->str[fill];
				4537	u->str[fill] = '0';
				4538	}
				4539
				4540	return (PyObject*) u;
				4541	}
				4542	#endif
				4543
				4544	#if 0
				4545	static PyObject*
				4546	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4547	{
				4548	if (!PyArg_NoArgs(args))
				4549	return NULL;
				4550	return PyInt_FromLong(unicode_freelist_size);
				4551	}
				4552	#endif
				4553
				4554	static char startswith__doc__[] =
				4555	"S.startswith(prefix[, start[, end]]) -> int\n\
				4556	\n\
				4557	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4558	optional start, test S beginning at that position. With optional end, stop\n\
				4559	comparing S at that position.";
				4560
				4561	static PyObject *
				4562	unicode_startswith(PyUnicodeObject *self,
				4563	PyObject *args)
				4564	{
				4565	PyUnicodeObject *substring;
				4566	int start = 0;
				4567	int end = INT_MAX;
				4568	PyObject *result;
				4569
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4570	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4571	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4572	return NULL;
				4573	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4574	(PyObject *)substring);
				4575	if (substring == NULL)
				4576	return NULL;
				4577
				4578	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4579
				4580	Py_DECREF(substring);
				4581	return result;
				4582	}
				4583
				4584
				4585	static char endswith__doc__[] =
				4586	"S.endswith(suffix[, start[, end]]) -> int\n\
				4587	\n\
				4588	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4589	optional start, test S beginning at that position. With optional end, stop\n\
				4590	comparing S at that position.";
				4591
				4592	static PyObject *
				4593	unicode_endswith(PyUnicodeObject *self,
				4594	PyObject *args)
				4595	{
				4596	PyUnicodeObject *substring;
				4597	int start = 0;
				4598	int end = INT_MAX;
				4599	PyObject *result;
				4600
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4601	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4602	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4603	return NULL;
				4604	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4605	(PyObject *)substring);
				4606	if (substring == NULL)
				4607	return NULL;
				4608
				4609	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4610
				4611	Py_DECREF(substring);
				4612	return result;
				4613	}
				4614
				4615
				4616	static PyMethodDef unicode_methods[] = {
				4617	
				4618	/* Order is according to common usage: often used methods should
				4619	appear first, since lookup is done sequentially. */
				/* Each entry is {name, C function, flag, docstring}.  Among the
				functions defined in this part of the file, those registered with
				flag 1 parse their arguments with PyArg_ParseTuple() and those
				registered with flag 0 check them with PyArg_NoArgs().  The table
				is terminated by the {NULL, NULL} sentinel and is searched by
				unicode_getattr() via Py_FindMethod(). */
				4620	
				4621	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4622	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4623	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4624	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4625	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4626	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4627	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4628	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4629	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4630	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4631	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4632	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4633	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4634	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4635	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4636	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4637	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4638	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4639	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4640	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4641	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4642	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4643	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4644	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4645	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4646	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4647	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4648	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4649	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4650	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4651	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4652	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4653	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4654	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4655	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4656	#if 0
				4657	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4658	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4659	#endif
				4660	
				4661	#if 0
				4662	/* This one is just used for debugging the implementation. */
				4663	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4664	#endif
				4665	
				4666	{NULL, NULL}
				4667	};
				4668
				4669	static PyObject *
				4670	unicode_getattr(PyUnicodeObject self, char name)
				4671	{
				4672	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4673	}
				4674
				4675	static PySequenceMethods unicode_as_sequence = {
				4676	(inquiry) unicode_length, /* sq_length */
				4677	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4678	(intargfunc) unicode_repeat, /* sq_repeat */
				4679	(intargfunc) unicode_getitem, /* sq_item */
				4680	(intintargfunc) unicode_slice, /* sq_slice */
				4681	0, /* sq_ass_item */
				4682	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4683	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4684	};
				4685
				4686	static int
				4687	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4688	int index,
				4689	const void **ptr)
				4690	{
				4691	if (index != 0) {
				4692	PyErr_SetString(PyExc_SystemError,
				4693	"accessing non-existent unicode segment");
				4694	return -1;
				4695	}
				4696	ptr = (void ) self->str;
				4697	return PyUnicode_GET_DATA_SIZE(self);
				4698	}
				4699
				4700	static int
				4701	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4702	const void **ptr)
				4703	{
				4704	PyErr_SetString(PyExc_TypeError,
				4705	"cannot use unicode as modifyable buffer");
				4706	return -1;
				4707	}
				4708
				4709	static int
				4710	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4711	int *lenp)
				4712	{
				4713	if (lenp)
				4714	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4715	return 1;
				4716	}
				4717
				4718	static int
				4719	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4720	int index,
				4721	const void **ptr)
				4722	{
				4723	PyObject *str;
				4724
				4725	if (index != 0) {
				4726	PyErr_SetString(PyExc_SystemError,
				4727	"accessing non-existent unicode segment");
				4728	return -1;
				4729	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4730	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4731	if (str == NULL)
				4732	return -1;
				4733	ptr = (void ) PyString_AS_STRING(str);
				4734	return PyString_GET_SIZE(str);
				4735	}
				4736
				4737	/* Helpers for PyUnicode_Format() */
				4738
				4739	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4740	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4741	{
				4742	int argidx = *p_argidx;
				4743	if (argidx < arglen) {
				4744	(*p_argidx)++;
				4745	if (arglen < 0)
				4746	return args;
				4747	else
				4748	return PyTuple_GetItem(args, argidx);
				4749	}
				4750	PyErr_SetString(PyExc_TypeError,
				4751	"not enough arguments for format string");
				4752	return NULL;
				4753	}
				4754
				4755	#define F_LJUST (1<<0)
				4756	#define F_SIGN (1<<1)
				4757	#define F_BLANK (1<<2)
				4758	#define F_ALT (1<<3)
				4759	#define F_ZERO (1<<4)
				4760
				4761	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4762	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4763	{
				4764	register int i;
				4765	int len;
				4766	va_list va;
				4767	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4768	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4769
				4770	/* First, format the string as char array, then expand to Py_UNICODE
				4771	array. */
				4772	charbuffer = (char *)buffer;
				4773	len = vsprintf(charbuffer, format, va);
				4774	for (i = len - 1; i >= 0; i--)
				4775	buffer[i] = (Py_UNICODE) charbuffer[i];
				4776
				4777	va_end(va);
				4778	return len;
				4779	}
				4780
				4781	static int
				4782	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4783	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4784	int flags,
				4785	int prec,
				4786	int type,
				4787	PyObject *v)
				4788	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4789	/* fmt = '%#.' + `prec` + `type`
				4790	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4791	char fmt[20];
				4792	double x;
				4793
				4794	x = PyFloat_AsDouble(v);
				4795	if (x == -1.0 && PyErr_Occurred())
				4796	return -1;
				4797	if (prec < 0)
				4798	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4799	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4800	type = 'g';
				4801	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4802	/* worst case length calc to ensure no buffer overrun:
				4803	fmt = %#.<prec>g
				4804	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4805	for any double rep.)
				4806	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4807	If prec=0 the effective precision is 1 (the leading digit is
				4808	always given), therefore increase by one to 10+prec. */
				4809	if (buflen <= (size_t)10 + (size_t)prec) {
				4810	PyErr_SetString(PyExc_OverflowError,
				4811	"formatted float is too long (precision too long?)");
				4812	return -1;
				4813	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4814	return usprintf(buf, fmt, x);
				4815	}
				4816
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4817	static PyObject*
				4818	formatlong(PyObject *val, int flags, int prec, int type)
				4819	{
				4820	char *buf;
				4821	int i, len;
				4822	PyObject str; / temporary string object. */
				4823	PyUnicodeObject *result;
				4824
				4825	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4826	if (!str)
				4827	return NULL;
				4828	result = _PyUnicode_New(len);
				4829	for (i = 0; i < len; i++)
				4830	result->str[i] = buf[i];
				4831	result->str[len] = 0;
				4832	Py_DECREF(str);
				4833	return (PyObject*)result;
				4834	}
				4835
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4836	static int
				4837	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4838	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4839	int flags,
				4840	int prec,
				4841	int type,
				4842	PyObject *v)
				4843	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4844	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4845	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4846	+ 1 + 1 = 24*/
				4847	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4848	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4849	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4850
				4851	x = PyInt_AsLong(v);
				4852	if (x == -1 && PyErr_Occurred())
				4853	return -1;
				4854	if (prec < 0)
				4855	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4856	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4857	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4858	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4859	PyErr_SetString(PyExc_OverflowError,
				4860	"formatted integer is too long (precision too long?)");
				4861	return -1;
				4862	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4863	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				4864	* but we want it (for consistency with other %#x conversions, and
				4865	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4866	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				4867	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				4868	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4869	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4870	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				4871	/* Only way to know what the platform does is to try it. */
				4872	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				4873	if (fmt[1] != (char)type) {
				4874	/* Supply our own leading 0x/0X -- needed under std C */
				4875	use_native_c_format = 0;
				4876	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				4877	}
				4878	}
				4879	if (use_native_c_format)
				4880	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4881	return usprintf(buf, fmt, x);
				4882	}
				4883
				4884	static int
				4885	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4886	size_t buflen,
				4887	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4888	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4889	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4890	if (PyUnicode_Check(v)) {
				4891	if (PyUnicode_GET_SIZE(v) != 1)
				4892	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4893	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4894	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4895
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4896	else if (PyString_Check(v)) {
				4897	if (PyString_GET_SIZE(v) != 1)
				4898	goto onError;
				4899	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4900	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4901
				4902	else {
				4903	/* Integer input truncated to a character */
				4904	long x;
				4905	x = PyInt_AsLong(v);
				4906	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4907	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4908	buf[0] = (char) x;
				4909	}
				4910	buf[1] = '\0';
				4911	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4912
				4913	onError:
				4914	PyErr_SetString(PyExc_TypeError,
				4915	"%c requires int or char");
				4916	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4917	}
				4918
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4919	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4920
				4921	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4922	chars are formatted. XXX This is a magic number. Each formatting
				4923	routine does bounds checking to ensure no overflow, but a better
				4924	solution may be to malloc a buffer of appropriate size for each
				4925	format. For now, the current solution is sufficient.
				4926	*/
				4927	#define FORMATBUFLEN (size_t)120
				4928
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4929	PyObject PyUnicode_Format(PyObject format,
				4930	PyObject *args)
				4931	{
				4932	Py_UNICODE fmt, res;
				4933	int fmtcnt, rescnt, reslen, arglen, argidx;
				4934	int args_owned = 0;
				4935	PyUnicodeObject *result = NULL;
				4936	PyObject *dict = NULL;
				4937	PyObject *uformat;
				4938
				4939	if (format == NULL \|\| args == NULL) {
				4940	PyErr_BadInternalCall();
				4941	return NULL;
				4942	}
				4943	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4944	if (uformat == NULL)
				4945	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4946	fmt = PyUnicode_AS_UNICODE(uformat);
				4947	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4948
				4949	reslen = rescnt = fmtcnt + 100;
				4950	result = _PyUnicode_New(reslen);
				4951	if (result == NULL)
				4952	goto onError;
				4953	res = PyUnicode_AS_UNICODE(result);
				4954
				4955	if (PyTuple_Check(args)) {
				4956	arglen = PyTuple_Size(args);
				4957	argidx = 0;
				4958	}
				4959	else {
				4960	arglen = -1;
				4961	argidx = -2;
				4962	}
				4963	if (args->ob_type->tp_as_mapping)
				4964	dict = args;
				4965
				4966	while (--fmtcnt >= 0) {
				4967	if (*fmt != '%') {
				4968	if (--rescnt < 0) {
				4969	rescnt = fmtcnt + 100;
				4970	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	4971	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4972	return NULL;
				4973	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4974	--rescnt;
				4975	}
				4976	res++ = fmt++;
				4977	}
				4978	else {
				4979	/* Got a format specifier */
				4980	int flags = 0;
				4981	int width = -1;
				4982	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4983	Py_UNICODE c = '\0';
				4984	Py_UNICODE fill;
				4985	PyObject *v = NULL;
				4986	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4987	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4988	Py_UNICODE sign;
				4989	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4990	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4991
				4992	fmt++;
				4993	if (*fmt == '(') {
				4994	Py_UNICODE *keystart;
				4995	int keylen;
				4996	PyObject *key;
				4997	int pcount = 1;
				4998
				4999	if (dict == NULL) {
				5000	PyErr_SetString(PyExc_TypeError,
				5001	"format requires a mapping");
				5002	goto onError;
				5003	}
				5004	++fmt;
				5005	--fmtcnt;
				5006	keystart = fmt;
				5007	/* Skip over balanced parentheses */
				5008	while (pcount > 0 && --fmtcnt >= 0) {
				5009	if (*fmt == ')')
				5010	--pcount;
				5011	else if (*fmt == '(')
				5012	++pcount;
				5013	fmt++;
				5014	}
				5015	keylen = fmt - keystart - 1;
				5016	if (fmtcnt < 0 \|\| pcount > 0) {
				5017	PyErr_SetString(PyExc_ValueError,
				5018	"incomplete format key");
				5019	goto onError;
				5020	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5021	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5022	then looked up since Python uses strings to hold
				5023	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5024	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5025	key = PyUnicode_EncodeUTF8(keystart,
				5026	keylen,
				5027	NULL);
				5028	if (key == NULL)
				5029	goto onError;
				5030	if (args_owned) {
				5031	Py_DECREF(args);
				5032	args_owned = 0;
				5033	}
				5034	args = PyObject_GetItem(dict, key);
				5035	Py_DECREF(key);
				5036	if (args == NULL) {
				5037	goto onError;
				5038	}
				5039	args_owned = 1;
				5040	arglen = -1;
				5041	argidx = -2;
				5042	}
				5043	while (--fmtcnt >= 0) {
				5044	switch (c = *fmt++) {
				5045	case '-': flags \|= F_LJUST; continue;
				5046	case '+': flags \|= F_SIGN; continue;
				5047	case ' ': flags \|= F_BLANK; continue;
				5048	case '#': flags \|= F_ALT; continue;
				5049	case '0': flags \|= F_ZERO; continue;
				5050	}
				5051	break;
				5052	}
				5053	if (c == '*') {
				5054	v = getnextarg(args, arglen, &argidx);
				5055	if (v == NULL)
				5056	goto onError;
				5057	if (!PyInt_Check(v)) {
				5058	PyErr_SetString(PyExc_TypeError,
				5059	"* wants int");
				5060	goto onError;
				5061	}
				5062	width = PyInt_AsLong(v);
				5063	if (width < 0) {
				5064	flags \|= F_LJUST;
				5065	width = -width;
				5066	}
				5067	if (--fmtcnt >= 0)
				5068	c = *fmt++;
				5069	}
				5070	else if (c >= '0' && c <= '9') {
				5071	width = c - '0';
				5072	while (--fmtcnt >= 0) {
				5073	c = *fmt++;
				5074	if (c < '0' \|\| c > '9')
				5075	break;
				5076	if ((width*10) / 10 != width) {
				5077	PyErr_SetString(PyExc_ValueError,
				5078	"width too big");
				5079	goto onError;
				5080	}
				5081	width = width*10 + (c - '0');
				5082	}
				5083	}
				5084	if (c == '.') {
				5085	prec = 0;
				5086	if (--fmtcnt >= 0)
				5087	c = *fmt++;
				5088	if (c == '*') {
				5089	v = getnextarg(args, arglen, &argidx);
				5090	if (v == NULL)
				5091	goto onError;
				5092	if (!PyInt_Check(v)) {
				5093	PyErr_SetString(PyExc_TypeError,
				5094	"* wants int");
				5095	goto onError;
				5096	}
				5097	prec = PyInt_AsLong(v);
				5098	if (prec < 0)
				5099	prec = 0;
				5100	if (--fmtcnt >= 0)
				5101	c = *fmt++;
				5102	}
				5103	else if (c >= '0' && c <= '9') {
				5104	prec = c - '0';
				5105	while (--fmtcnt >= 0) {
				5106	c = Py_CHARMASK(*fmt++);
				5107	if (c < '0' \|\| c > '9')
				5108	break;
				5109	if ((prec*10) / 10 != prec) {
				5110	PyErr_SetString(PyExc_ValueError,
				5111	"prec too big");
				5112	goto onError;
				5113	}
				5114	prec = prec*10 + (c - '0');
				5115	}
				5116	}
				5117	} /* prec */
				5118	if (fmtcnt >= 0) {
				5119	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5120	if (--fmtcnt >= 0)
				5121	c = *fmt++;
				5122	}
				5123	}
				5124	if (fmtcnt < 0) {
				5125	PyErr_SetString(PyExc_ValueError,
				5126	"incomplete format");
				5127	goto onError;
				5128	}
				5129	if (c != '%') {
				5130	v = getnextarg(args, arglen, &argidx);
				5131	if (v == NULL)
				5132	goto onError;
				5133	}
				5134	sign = 0;
				5135	fill = ' ';
				5136	switch (c) {
				5137
				5138	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5139	pbuf = formatbuf;
				5140	/* presume that buffer length is at least 1 */
				5141	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5142	len = 1;
				5143	break;
				5144
				5145	case 's':
				5146	case 'r':
				5147	if (PyUnicode_Check(v) && c == 's') {
				5148	temp = v;
				5149	Py_INCREF(temp);
				5150	}
				5151	else {
				5152	PyObject *unicode;
				5153	if (c == 's')
				5154	temp = PyObject_Str(v);
				5155	else
				5156	temp = PyObject_Repr(v);
				5157	if (temp == NULL)
				5158	goto onError;
				5159	if (!PyString_Check(temp)) {
				5160	/* XXX Note: this should never happen, since
				5161	PyObject_Repr() and PyObject_Str() assure
				5162	this */
				5163	Py_DECREF(temp);
				5164	PyErr_SetString(PyExc_TypeError,
				5165	"%s argument has non-string str()");
				5166	goto onError;
				5167	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5168	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5169	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5170	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5171	"strict");
				5172	Py_DECREF(temp);
				5173	temp = unicode;
				5174	if (temp == NULL)
				5175	goto onError;
				5176	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5177	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5178	len = PyUnicode_GET_SIZE(temp);
				5179	if (prec >= 0 && len > prec)
				5180	len = prec;
				5181	break;
				5182
				5183	case 'i':
				5184	case 'd':
				5185	case 'u':
				5186	case 'o':
				5187	case 'x':
				5188	case 'X':
				5189	if (c == 'i')
				5190	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5191	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5192	temp = formatlong(v, flags, prec, c);
				5193	if (!temp)
				5194	goto onError;
				5195	pbuf = PyUnicode_AS_UNICODE(temp);
				5196	len = PyUnicode_GET_SIZE(temp);
				5197	/* unbounded ints can always produce
				5198	a sign character! */
				5199	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5200	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5201	else {
				5202	pbuf = formatbuf;
				5203	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5204	flags, prec, c, v);
				5205	if (len < 0)
				5206	goto onError;
				5207	/* only d conversion is signed */
				5208	sign = c == 'd';
				5209	}
				5210	if (flags & F_ZERO)
				5211	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5212	break;
				5213
				5214	case 'e':
				5215	case 'E':
				5216	case 'f':
				5217	case 'g':
				5218	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5219	pbuf = formatbuf;
				5220	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5221	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5222	if (len < 0)
				5223	goto onError;
				5224	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5225	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5226	fill = '0';
				5227	break;
				5228
				5229	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5230	pbuf = formatbuf;
				5231	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5232	if (len < 0)
				5233	goto onError;
				5234	break;
				5235
				5236	default:
				5237	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5238	"unsupported format character '%c' (0x%x) "
				5239	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5240	(31<=c && c<=126) ? c : '?',
				5241	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5242	goto onError;
				5243	}
				5244	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5245	if (pbuf == '-' \|\| pbuf == '+') {
				5246	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5247	len--;
				5248	}
				5249	else if (flags & F_SIGN)
				5250	sign = '+';
				5251	else if (flags & F_BLANK)
				5252	sign = ' ';
				5253	else
				5254	sign = 0;
				5255	}
				5256	if (width < len)
				5257	width = len;
				5258	if (rescnt < width + (sign != 0)) {
				5259	reslen -= rescnt;
				5260	rescnt = width + fmtcnt + 100;
				5261	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5262	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5263	return NULL;
				5264	res = PyUnicode_AS_UNICODE(result)
				5265	+ reslen - rescnt;
				5266	}
				5267	if (sign) {
				5268	if (fill != ' ')
				5269	*res++ = sign;
				5270	rescnt--;
				5271	if (width > len)
				5272	width--;
				5273	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5274	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5275	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5276	assert(pbuf[1] == c);
				5277	if (fill != ' ') {
				5278	res++ = pbuf++;
				5279	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5280	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5281	rescnt -= 2;
				5282	width -= 2;
				5283	if (width < 0)
				5284	width = 0;
				5285	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5286	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5287	if (width > len && !(flags & F_LJUST)) {
				5288	do {
				5289	--rescnt;
				5290	*res++ = fill;
				5291	} while (--width > len);
				5292	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5293	if (fill == ' ') {
				5294	if (sign)
				5295	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5296	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5297	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5298	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5299	res++ = pbuf++;
				5300	res++ = pbuf++;
				5301	}
				5302	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5303	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5304	res += len;
				5305	rescnt -= len;
				5306	while (--width >= len) {
				5307	--rescnt;
				5308	*res++ = ' ';
				5309	}
				5310	if (dict && (argidx < arglen) && c != '%') {
				5311	PyErr_SetString(PyExc_TypeError,
				5312	"not all arguments converted");
				5313	goto onError;
				5314	}
				5315	Py_XDECREF(temp);
				5316	} /* '%' */
				5317	} /* until end */
				5318	if (argidx < arglen && !dict) {
				5319	PyErr_SetString(PyExc_TypeError,
				5320	"not all arguments converted");
				5321	goto onError;
				5322	}
				5323
				5324	if (args_owned) {
				5325	Py_DECREF(args);
				5326	}
				5327	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5328	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5329	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5330	return (PyObject *)result;
				5331
				5332	onError:
				5333	Py_XDECREF(result);
				5334	Py_DECREF(uformat);
				5335	if (args_owned) {
				5336	Py_DECREF(args);
				5337	}
				5338	return NULL;
				5339	}
				5340
				5341	static PyBufferProcs unicode_as_buffer = {
				5342	(getreadbufferproc) unicode_buffer_getreadbuf,
				5343	(getwritebufferproc) unicode_buffer_getwritebuf,
				5344	(getsegcountproc) unicode_buffer_getsegcount,
				5345	(getcharbufferproc) unicode_buffer_getcharbuf,
				5346	};
				5347
				5348	PyTypeObject PyUnicode_Type = {
				5349	PyObject_HEAD_INIT(&PyType_Type)
				5350	0, /* ob_size */
				5351	"unicode", /* tp_name */
				5352	sizeof(PyUnicodeObject), /* tp_size */
				5353	0, /* tp_itemsize */
				5354	/* Slots */
				5355	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5356	0, /* tp_print */
				5357	(getattrfunc)unicode_getattr, /* tp_getattr */
				5358	0, /* tp_setattr */
				5359	(cmpfunc) unicode_compare, /* tp_compare */
				5360	(reprfunc) unicode_repr, /* tp_repr */
				5361	0, /* tp_as_number */
				5362	&unicode_as_sequence, /* tp_as_sequence */
				5363	0, /* tp_as_mapping */
				5364	(hashfunc) unicode_hash, /* tp_hash*/
				5365	0, /* tp_call*/
				5366	(reprfunc) unicode_str, /* tp_str */
				5367	(getattrofunc) NULL, /* tp_getattro */
				5368	(setattrofunc) NULL, /* tp_setattro */
				5369	&unicode_as_buffer, /* tp_as_buffer */
				5370	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5371	};
				5372
				5373	/* Initialize the Unicode implementation */
				5374
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5375	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5376	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5377	int i;
				5378
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5379	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5380	unicode_freelist = NULL;
				5381	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5382	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5383	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5384	for (i = 0; i < 256; i++)
				5385	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5386	}
				5387
				5388	/* Finalize the Unicode implementation */
				5389
				5390	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5391	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5392	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5393	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5394	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5395
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5396	Py_XDECREF(unicode_empty);
				5397	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5398
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5399	for (i = 0; i < 256; i++) {
				5400	if (unicode_latin1[i]) {
				5401	Py_DECREF(unicode_latin1[i]);
				5402	unicode_latin1[i] = NULL;
				5403	}
				5404	}
				5405
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5406	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5407	PyUnicodeObject *v = u;
				5408	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5409	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5410	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5411	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5412	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5413	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5414	unicode_freelist = NULL;
				5415	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5416	}