Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: a29c75b5a34bc900b6161152cc5379137c7172b8 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	227	void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	{
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	229	if (!PyUnicode_CheckExact(unicode)) {
				230	unicode->ob_type->tp_free((PyObject *)unicode);
				231	return;
				232	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	234	/* Keep-Alive optimization */
				235	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	236	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str = NULL;
				238	unicode->length = 0;
				239	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	240	if (unicode->defenc) {
				241	Py_DECREF(unicode->defenc);
				242	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	243	}
				244	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	245	(PyUnicodeObject *)unicode = unicode_freelist;
				246	unicode_freelist = unicode;
				247	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	248	}
				249	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	250	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	251	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	252	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254	}
				255
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	256	int PyUnicode_Resize(PyObject **unicode,
				257	int length)
				258	{
				259	register PyUnicodeObject *v;
				260
				261	/* Argument checks */
				262	if (unicode == NULL) {
				263	PyErr_BadInternalCall();
				264	return -1;
				265	}
				266	v = (PyUnicodeObject )unicode;
				267	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				268	PyErr_BadInternalCall();
				269	return -1;
				270	}
				271
				272	/* Resizing unicode_empty and single character objects is not
				273	possible since these are being shared. We simply return a fresh
				274	copy with the same Unicode content. */
				275	if (v->length != length &&
				276	(v == unicode_empty \|\| v->length == 1)) {
				277	PyUnicodeObject *w = _PyUnicode_New(length);
				278	if (w == NULL)
				279	return -1;
				280	Py_UNICODE_COPY(w->str, v->str,
				281	length < v->length ? length : v->length);
				282	unicode = (PyObject )w;
				283	return 0;
				284	}
				285
				286	/* Note that we don't have to modify *unicode for unshared Unicode
				287	objects, since we can modify them in-place. */
				288	return unicode_resize(v, length);
				289	}
				290
				291	/* Internal API for use in unicodeobject.c only ! */
				292	#define _PyUnicode_Resize(unicodevar, length) \
				293	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				294
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	295	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				296	int size)
				297	{
				298	PyUnicodeObject *unicode;
				299
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	300	/* If the Unicode data is known at construction time, we can apply
				301	some optimizations which share commonly used objects. */
				302	if (u != NULL) {
				303
				304	/* Optimization for empty strings */
				305	if (size == 0 && unicode_empty != NULL) {
				306	Py_INCREF(unicode_empty);
				307	return (PyObject *)unicode_empty;
				308	}
				309
				310	/* Single character Unicode objects in the Latin-1 range are
				311	shared when using this constructor */
				312	if (size == 1 && *u < 256) {
				313	unicode = unicode_latin1[*u];
				314	if (!unicode) {
				315	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	316	if (!unicode)
				317	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	318	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	319	unicode_latin1[*u] = unicode;
				320	}
				321	Py_INCREF(unicode);
				322	return (PyObject *)unicode;
				323	}
				324	}
				325
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	326	unicode = _PyUnicode_New(size);
				327	if (!unicode)
				328	return NULL;
				329
				330	/* Copy the Unicode data into the new object */
				331	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	332	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333
				334	return (PyObject *)unicode;
				335	}
				336
				337	#ifdef HAVE_WCHAR_H
				338
				339	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				340	int size)
				341	{
				342	PyUnicodeObject *unicode;
				343
				344	if (w == NULL) {
				345	PyErr_BadInternalCall();
				346	return NULL;
				347	}
				348
				349	unicode = _PyUnicode_New(size);
				350	if (!unicode)
				351	return NULL;
				352
				353	/* Copy the wchar_t data into the new object */
				354	#ifdef HAVE_USABLE_WCHAR_T
				355	memcpy(unicode->str, w, size * sizeof(wchar_t));
				356	#else
				357	{
				358	register Py_UNICODE *u;
				359	register int i;
				360	u = PyUnicode_AS_UNICODE(unicode);
				361	for (i = size; i >= 0; i--)
				362	u++ = w++;
				363	}
				364	#endif
				365
				366	return (PyObject *)unicode;
				367	}
				368
				369	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				370	register wchar_t *w,
				371	int size)
				372	{
				373	if (unicode == NULL) {
				374	PyErr_BadInternalCall();
				375	return -1;
				376	}
				377	if (size > PyUnicode_GET_SIZE(unicode))
				378	size = PyUnicode_GET_SIZE(unicode);
				379	#ifdef HAVE_USABLE_WCHAR_T
				380	memcpy(w, unicode->str, size * sizeof(wchar_t));
				381	#else
				382	{
				383	register Py_UNICODE *u;
				384	register int i;
				385	u = PyUnicode_AS_UNICODE(unicode);
				386	for (i = size; i >= 0; i--)
				387	w++ = u++;
				388	}
				389	#endif
				390
				391	return size;
				392	}
				393
				394	#endif
				395
				396	PyObject PyUnicode_FromObject(register PyObject obj)
				397	{
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	398	/* XXX Perhaps we should make this API an alias of
				399	PyObject_Unicode() instead ?! */
				400	if (PyUnicode_CheckExact(obj)) {
				401	Py_INCREF(obj);
				402	return obj;
				403	}
				404	if (PyUnicode_Check(obj)) {
				405	/* For a Unicode subtype that's not a Unicode object,
				406	return a true Unicode object with the same data. */
				407	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				408	PyUnicode_GET_SIZE(obj));
				409	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				411	}
				412
				413	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				414	const char *encoding,
				415	const char *errors)
				416	{
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	417	const char *s = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	418	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	419	int owned = 0;
				420	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	421
				422	if (obj == NULL) {
				423	PyErr_BadInternalCall();
				424	return NULL;
				425	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	426
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	427	#if 0
				428	/* For b/w compatibility we also accept Unicode objects provided
				429	that no encodings is given and then redirect to PyObject_Unicode()
				430	which then applies the additional logic for Unicode subclasses.
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	431
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	432	NOTE: This API should really only be used for object which
				433	represent encoded Unicode !
				434
				435	*/
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	436	if (PyUnicode_Check(obj)) {
				437	if (encoding) {
				438	PyErr_SetString(PyExc_TypeError,
				439	"decoding Unicode is not supported");
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	440	return NULL;
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	441	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	442	return PyObject_Unicode(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	443	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	444	#else
				445	if (PyUnicode_Check(obj)) {
				446	PyErr_SetString(PyExc_TypeError,
				447	"decoding Unicode is not supported");
				448	return NULL;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	449	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	450	#endif
				451
				452	/* Coerce object */
				453	if (PyString_Check(obj)) {
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	454	s = PyString_AS_STRING(obj);
				455	len = PyString_GET_SIZE(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	456	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	457	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				458	/* Overwrite the error message with something more useful in
				459	case of a TypeError. */
				460	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	461	PyErr_Format(PyExc_TypeError,
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	462	"coercing to Unicode: need string or buffer, "
				463	"%.80s found",
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	464	obj->ob_type->tp_name);
				465	goto onError;
				466	}
				467
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	468	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	469	if (len == 0) {
				470	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	471	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	472	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	473	else
				474	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	475
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	476	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	477	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	478	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	479	return v;
				480
				481	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	482	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	483	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	484	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	485	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	486	}
				487
				488	PyObject PyUnicode_Decode(const char s,
				489	int size,
				490	const char *encoding,
				491	const char *errors)
				492	{
				493	PyObject buffer = NULL, unicode;
				494
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	495	if (encoding == NULL)
				496	encoding = PyUnicode_GetDefaultEncoding();
				497
				498	/* Shortcuts for common default encodings */
				499	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	500	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	501	else if (strcmp(encoding, "latin-1") == 0)
				502	return PyUnicode_DecodeLatin1(s, size, errors);
				503	else if (strcmp(encoding, "ascii") == 0)
				504	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	505
				506	/* Decode via the codec registry */
				507	buffer = PyBuffer_FromMemory((void *)s, size);
				508	if (buffer == NULL)
				509	goto onError;
				510	unicode = PyCodec_Decode(buffer, encoding, errors);
				511	if (unicode == NULL)
				512	goto onError;
				513	if (!PyUnicode_Check(unicode)) {
				514	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	515	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	516	unicode->ob_type->tp_name);
				517	Py_DECREF(unicode);
				518	goto onError;
				519	}
				520	Py_DECREF(buffer);
				521	return unicode;
				522
				523	onError:
				524	Py_XDECREF(buffer);
				525	return NULL;
				526	}
				527
				528	PyObject PyUnicode_Encode(const Py_UNICODE s,
				529	int size,
				530	const char *encoding,
				531	const char *errors)
				532	{
				533	PyObject v, unicode;
				534
				535	unicode = PyUnicode_FromUnicode(s, size);
				536	if (unicode == NULL)
				537	return NULL;
				538	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				539	Py_DECREF(unicode);
				540	return v;
				541	}
				542
				543	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				544	const char *encoding,
				545	const char *errors)
				546	{
				547	PyObject *v;
				548
				549	if (!PyUnicode_Check(unicode)) {
				550	PyErr_BadArgument();
				551	goto onError;
				552	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553
				554	if (encoding == NULL)
				555	encoding = PyUnicode_GetDefaultEncoding();
				556
				557	/* Shortcuts for common default encodings */
				558	if (errors == NULL) {
				559	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	560	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	561	else if (strcmp(encoding, "latin-1") == 0)
				562	return PyUnicode_AsLatin1String(unicode);
				563	else if (strcmp(encoding, "ascii") == 0)
				564	return PyUnicode_AsASCIIString(unicode);
				565	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	566
				567	/* Encode via the codec registry */
				568	v = PyCodec_Encode(unicode, encoding, errors);
				569	if (v == NULL)
				570	goto onError;
				571	/* XXX Should we really enforce this ? */
				572	if (!PyString_Check(v)) {
				573	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	574	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	575	v->ob_type->tp_name);
				576	Py_DECREF(v);
				577	goto onError;
				578	}
				579	return v;
				580
				581	onError:
				582	return NULL;
				583	}
				584
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	585	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				586	const char *errors)
				587	{
				588	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				589
				590	if (v)
				591	return v;
				592	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				593	if (v && errors == NULL)
				594	((PyUnicodeObject *)unicode)->defenc = v;
				595	return v;
				596	}
				597
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	598	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				599	{
				600	if (!PyUnicode_Check(unicode)) {
				601	PyErr_BadArgument();
				602	goto onError;
				603	}
				604	return PyUnicode_AS_UNICODE(unicode);
				605
				606	onError:
				607	return NULL;
				608	}
				609
				610	int PyUnicode_GetSize(PyObject *unicode)
				611	{
				612	if (!PyUnicode_Check(unicode)) {
				613	PyErr_BadArgument();
				614	goto onError;
				615	}
				616	return PyUnicode_GET_SIZE(unicode);
				617
				618	onError:
				619	return -1;
				620	}
				621
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	622	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	623	{
				624	return unicode_default_encoding;
				625	}
				626
				627	int PyUnicode_SetDefaultEncoding(const char *encoding)
				628	{
				629	PyObject *v;
				630
				631	/* Make sure the encoding is valid. As side effect, this also
				632	loads the encoding into the codec registry cache. */
				633	v = _PyCodec_Lookup(encoding);
				634	if (v == NULL)
				635	goto onError;
				636	Py_DECREF(v);
				637	strncpy(unicode_default_encoding,
				638	encoding,
				639	sizeof(unicode_default_encoding));
				640	return 0;
				641
				642	onError:
				643	return -1;
				644	}
				645
Marc-André Lemburg	c60e6f7	2001-09-20 10:35:46 +0000	[diff] [blame]	646	/* --- UTF-7 Codec -------------------------------------------------------- */
				647
				648	/* see RFC2152 for details */
				649
				650	static
				651	char utf7_special[128] = {
				652	/* indicate whether a UTF-7 character is special i.e. cannot be directly
				653	encoded:
				654	0 - not special
				655	1 - special
				656	2 - whitespace (optional)
				657	3 - RFC2152 Set O (optional) */
				658	1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
				659	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				660	2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
				661	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
				662	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				663	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
				664	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				665	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
				666
				667	};
				668
				669	#define SPECIAL(c, encodeO, encodeWS) \
				670	(((c)>127 \|\| utf7_special[(c)] == 1) \|\| \
				671	(encodeWS && (utf7_special[(c)] == 2)) \|\| \
				672	(encodeO && (utf7_special[(c)] == 3)))
				673
				674	#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
				675	#define B64CHAR(c) (isalnum(c) \|\| (c) == '+' \|\| (c) == '/')
				676	#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
				677	(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
				678
				679	#define ENCODE(out, ch, bits) \
				680	while (bits >= 6) { \
				681	*out++ = B64(ch >> (bits-6)); \
				682	bits -= 6; \
				683	}
				684
				685	#define DECODE(out, ch, bits, surrogate) \
				686	while (bits >= 16) { \
				687	Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
				688	bits -= 16; \
				689	if (surrogate) { \
				690	/* We have already generated an error for the high surrogate
				691	so let's not bother seeing if the low surrogate is correct or not */\
				692	surrogate = 0; \
				693	} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
				694	/* This is a surrogate pair. Unfortunately we can't represent \
				695	it in a 16-bit character */ \
				696	surrogate = 1; \
				697	errmsg = "code pairs are not supported"; \
				698	goto utf7Error; \
				699	} else { \
				700	*out++ = outCh; \
				701	} \
				702	} \
				703
				704	static
				705	int utf7_decoding_error(Py_UNICODE **dest,
				706	const char *errors,
				707	const char *details)
				708	{
				709	if ((errors == NULL) \|\|
				710	(strcmp(errors,"strict") == 0)) {
				711	PyErr_Format(PyExc_UnicodeError,
				712	"UTF-7 decoding error: %.400s",
				713	details);
				714	return -1;
				715	}
				716	else if (strcmp(errors,"ignore") == 0) {
				717	return 0;
				718	}
				719	else if (strcmp(errors,"replace") == 0) {
				720	if (dest != NULL) {
				721	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				722	(*dest)++;
				723	}
				724	return 0;
				725	}
				726	else {
				727	PyErr_Format(PyExc_ValueError,
				728	"UTF-7 decoding error; unknown error handling code: %.400s",
				729	errors);
				730	return -1;
				731	}
				732	}
				733
				734	PyObject PyUnicode_DecodeUTF7(const char s,
				735	int size,
				736	const char *errors)
				737	{
				738	const char *e;
				739	PyUnicodeObject *unicode;
				740	Py_UNICODE *p;
				741	const char *errmsg = "";
				742	int inShift = 0;
				743	unsigned int bitsleft = 0;
				744	unsigned long charsleft = 0;
				745	int surrogate = 0;
				746
				747	unicode = _PyUnicode_New(size);
				748	if (!unicode)
				749	return NULL;
				750	if (size == 0)
				751	return (PyObject *)unicode;
				752
				753	p = unicode->str;
				754	e = s + size;
				755
				756	while (s < e) {
				757	Py_UNICODE ch = *s;
				758
				759	if (inShift) {
				760	if ((ch == '-') \|\| !B64CHAR(ch)) {
				761	inShift = 0;
				762	s++;
				763
				764	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				765	if (bitsleft >= 6) {
				766	/* The shift sequence has a partial character in it. If
				767	bitsleft < 6 then we could just classify it as padding
				768	but that is not the case here */
				769
				770	errmsg = "partial character in shift sequence";
				771	goto utf7Error;
				772	}
				773	/* According to RFC2152 the remaining bits should be zero. We
				774	choose to signal an error/insert a replacement character
				775	here so indicate the potential of a misencoded character. */
				776
				777	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				778	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				779	errmsg = "non-zero padding bits in shift sequence";
				780	goto utf7Error;
				781	}
				782
				783	if (ch == '-') {
				784	if ((s < e) && (*(s) == '-')) {
				785	*p++ = '-';
				786	inShift = 1;
				787	}
				788	} else if (SPECIAL(ch,0,0)) {
				789	errmsg = "unexpected special character";
				790	goto utf7Error;
				791	} else {
				792	*p++ = ch;
				793	}
				794	} else {
				795	charsleft = (charsleft << 6) \| UB64(ch);
				796	bitsleft += 6;
				797	s++;
				798	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				799	}
				800	}
				801	else if ( ch == '+' ) {
				802	s++;
				803	if (s < e && *s == '-') {
				804	s++;
				805	*p++ = '+';
				806	} else
				807	{
				808	inShift = 1;
				809	bitsleft = 0;
				810	}
				811	}
				812	else if (SPECIAL(ch,0,0)) {
				813	errmsg = "unexpected special character";
				814	s++;
				815	goto utf7Error;
				816	}
				817	else {
				818	*p++ = ch;
				819	s++;
				820	}
				821	continue;
				822	utf7Error:
				823	if (utf7_decoding_error(&p, errors, errmsg))
				824	goto onError;
				825	}
				826
				827	if (inShift) {
				828	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				829	goto onError;
				830	}
				831
				832	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				833	goto onError;
				834
				835	return (PyObject *)unicode;
				836
				837	onError:
				838	Py_DECREF(unicode);
				839	return NULL;
				840	}
				841
				842
				843	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				844	int size,
				845	int encodeSetO,
				846	int encodeWhiteSpace,
				847	const char *errors)
				848	{
				849	PyObject *v;
				850	/* It might be possible to tighten this worst case */
				851	unsigned int cbAllocated = 5 * size;
				852	int inShift = 0;
				853	int i = 0;
				854	unsigned int bitsleft = 0;
				855	unsigned long charsleft = 0;
				856	char * out;
				857	char * start;
				858
				859	if (size == 0)
				860	return PyString_FromStringAndSize(NULL, 0);
				861
				862	v = PyString_FromStringAndSize(NULL, cbAllocated);
				863	if (v == NULL)
				864	return NULL;
				865
				866	start = out = PyString_AS_STRING(v);
				867	for (;i < size; ++i) {
				868	Py_UNICODE ch = s[i];
				869
				870	if (!inShift) {
				871	if (ch == '+') {
				872	*out++ = '+';
				873	*out++ = '-';
				874	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				875	charsleft = ch;
				876	bitsleft = 16;
				877	*out++ = '+';
				878	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				879	inShift = bitsleft > 0;
				880	} else {
				881	*out++ = (char) ch;
				882	}
				883	} else {
				884	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				885	*out++ = B64(charsleft << (6-bitsleft));
				886	charsleft = 0;
				887	bitsleft = 0;
				888	/* Characters not in the BASE64 set implicitly unshift the sequence
				889	so no '-' is required, except if the character is itself a '-' */
				890	if (B64CHAR(ch) \|\| ch == '-') {
				891	*out++ = '-';
				892	}
				893	inShift = 0;
				894	*out++ = (char) ch;
				895	} else {
				896	bitsleft += 16;
				897	charsleft = (charsleft << 16) \| ch;
				898	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				899
				900	/* If the next character is special then we dont' need to terminate
				901	the shift sequence. If the next character is not a BASE64 character
				902	or '-' then the shift sequence will be terminated implicitly and we
				903	don't have to insert a '-'. */
				904
				905	if (bitsleft == 0) {
				906	if (i + 1 < size) {
				907	Py_UNICODE ch2 = s[i+1];
				908
				909	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				910
				911	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				912	*out++ = '-';
				913	inShift = 0;
				914	} else {
				915	inShift = 0;
				916	}
				917
				918	}
				919	else {
				920	*out++ = '-';
				921	inShift = 0;
				922	}
				923	}
				924	}
				925	}
				926	}
				927	if (bitsleft) {
				928	*out++= B64(charsleft << (6-bitsleft) );
				929	*out++ = '-';
				930	}
				931
				932	if (_PyString_Resize(&v, out - start)) {
				933	Py_DECREF(v);
				934	return NULL;
				935	}
				936	return v;
				937	}
				938
				939	#undef SPECIAL
				940	#undef B64
				941	#undef B64CHAR
				942	#undef UB64
				943	#undef ENCODE
				944	#undef DECODE
				945
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	946	/* --- UTF-8 Codec -------------------------------------------------------- */
				947
				948	static
				949	char utf8_code_length[256] = {
				950	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				951	illegal prefix. see RFC 2279 for details */
				952	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				953	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				954	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				955	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				956	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				957	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				958	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				959	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				960	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				961	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				962	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				963	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				964	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				965	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				966	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				967	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				968	};
				969
				970	static
				971	int utf8_decoding_error(const char **source,
				972	Py_UNICODE **dest,
				973	const char *errors,
				974	const char *details)
				975	{
				976	if ((errors == NULL) \|\|
				977	(strcmp(errors,"strict") == 0)) {
				978	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	979	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	980	details);
				981	return -1;
				982	}
				983	else if (strcmp(errors,"ignore") == 0) {
				984	(*source)++;
				985	return 0;
				986	}
				987	else if (strcmp(errors,"replace") == 0) {
				988	(*source)++;
				989	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				990	(*dest)++;
				991	return 0;
				992	}
				993	else {
				994	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	995	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	996	errors);
				997	return -1;
				998	}
				999	}
				1000
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1001	PyObject PyUnicode_DecodeUTF8(const char s,
				1002	int size,
				1003	const char *errors)
				1004	{
				1005	int n;
				1006	const char *e;
				1007	PyUnicodeObject *unicode;
				1008	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1009	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1010
				1011	/* Note: size will always be longer than the resulting Unicode
				1012	character count */
				1013	unicode = _PyUnicode_New(size);
				1014	if (!unicode)
				1015	return NULL;
				1016	if (size == 0)
				1017	return (PyObject *)unicode;
				1018
				1019	/* Unpack UTF-8 encoded data */
				1020	p = unicode->str;
				1021	e = s + size;
				1022
				1023	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1024	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1025
				1026	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1027	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1028	s++;
				1029	continue;
				1030	}
				1031
				1032	n = utf8_code_length[ch];
				1033
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1034	if (s + n > e) {
				1035	errmsg = "unexpected end of data";
				1036	goto utf8Error;
				1037	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1038
				1039	switch (n) {
				1040
				1041	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1042	errmsg = "unexpected code byte";
				1043	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1044
				1045	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1046	errmsg = "internal error";
				1047	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1048
				1049	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1050	if ((s[1] & 0xc0) != 0x80) {
				1051	errmsg = "invalid data";
				1052	goto utf8Error;
				1053	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1054	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1055	if (ch < 0x80) {
				1056	errmsg = "illegal encoding";
				1057	goto utf8Error;
				1058	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1059	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1060	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1061	break;
				1062
				1063	case 3:
				1064	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1065	(s[2] & 0xc0) != 0x80) {
				1066	errmsg = "invalid data";
				1067	goto utf8Error;
				1068	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1069	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1070	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				1071	errmsg = "illegal encoding";
				1072	goto utf8Error;
				1073	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1074	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1075	*p++ = (Py_UNICODE)ch;
				1076	break;
				1077
				1078	case 4:
				1079	if ((s[1] & 0xc0) != 0x80 \|\|
				1080	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1081	(s[3] & 0xc0) != 0x80) {
				1082	errmsg = "invalid data";
				1083	goto utf8Error;
				1084	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1085	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1086	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1087	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1088	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1089	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1090	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1091	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1092	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1093	errmsg = "illegal encoding";
				1094	goto utf8Error;
				1095	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1096	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1097	*p++ = (Py_UNICODE)ch;
				1098	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1099	/* compute and append the two surrogates: */
				1100
				1101	/* translate from 10000..10FFFF to 0..FFFF */
				1102	ch -= 0x10000;
				1103
				1104	/* high surrogate = top 10 bits added to D800 */
				1105	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1106
				1107	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1108	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1109	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1110	break;
				1111
				1112	default:
				1113	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1114	errmsg = "unsupported Unicode code range";
				1115	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1116	}
				1117	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1118	continue;
				1119
				1120	utf8Error:
				1121	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1122	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1123	}
				1124
				1125	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1126	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1127	goto onError;
				1128
				1129	return (PyObject *)unicode;
				1130
				1131	onError:
				1132	Py_DECREF(unicode);
				1133	return NULL;
				1134	}
				1135
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept under #if 0, so it is never compiled.

   Policy: NULL/"strict" raises UnicodeError (-1), "ignore" returns 0,
   "replace" writes '?' to **dest and advances *dest (0), anything else
   raises ValueError (-1). */
#if 0
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1169
				1170	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				1171	int size,
				1172	const char *errors)
				1173	{
				1174	PyObject *v;
				1175	char *p;
				1176	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1177	Py_UCS4 ch2;
				1178	unsigned int cbAllocated = 3 * size;
				1179	unsigned int cbWritten = 0;
				1180	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1181
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1182	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1183	if (v == NULL)
				1184	return NULL;
				1185	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1186	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1187
				1188	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1189	while (i < size) {
				1190	Py_UCS4 ch = s[i++];
				1191	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1192	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1193	cbWritten++;
				1194	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1195	else if (ch < 0x0800) {
				1196	*p++ = 0xc0 \| (ch >> 6);
				1197	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1198	cbWritten += 2;
				1199	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1200	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1201	/* Check for high surrogate */
				1202	if (0xD800 <= ch && ch <= 0xDBFF) {
				1203	if (i != size) {
				1204	ch2 = s[i];
				1205	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				1206
				1207	if (cbWritten >= (cbAllocated - 4)) {
				1208	/* Provide enough room for some more
				1209	surrogates */
				1210	cbAllocated += 4*10;
				1211	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1212	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1213	}
				1214
				1215	/* combine the two values */
				1216	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				1217
				1218	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1219	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1220	i++;
				1221	cbWritten += 4;
				1222	}
				1223	}
				1224	}
				1225	else {
				1226	*p++ = (char)(0xe0 \| (ch >> 12));
				1227	cbWritten += 3;
				1228	}
				1229	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1230	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1231	} else {
				1232	*p++ = 0xf0 \| (ch>>18);
				1233	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				1234	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				1235	*p++ = 0x80 \| (ch & 0x3f);
				1236	cbWritten += 4;
				1237	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1238	}
				1239	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1240	if (_PyString_Resize(&v, p - q))
				1241	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1242	return v;
				1243
				1244	onError:
				1245	Py_DECREF(v);
				1246	return NULL;
				1247	}
				1248
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1249	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1250	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1251	if (!PyUnicode_Check(unicode)) {
				1252	PyErr_BadArgument();
				1253	return NULL;
				1254	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1255	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1256	PyUnicode_GET_SIZE(unicode),
				1257	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1258	}
				1259
				1260	/* --- UTF-16 Codec ------------------------------------------------------- */
				1261
				1262	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1263	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1264	const char *errors,
				1265	const char *details)
				1266	{
				1267	if ((errors == NULL) \|\|
				1268	(strcmp(errors,"strict") == 0)) {
				1269	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1270	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1271	details);
				1272	return -1;
				1273	}
				1274	else if (strcmp(errors,"ignore") == 0) {
				1275	return 0;
				1276	}
				1277	else if (strcmp(errors,"replace") == 0) {
				1278	if (dest) {
				1279	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1280	(*dest)++;
				1281	}
				1282	return 0;
				1283	}
				1284	else {
				1285	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1286	"UTF-16 decoding error; "
				1287	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	errors);
				1289	return -1;
				1290	}
				1291	}
				1292
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1293	PyObject *
				1294	PyUnicode_DecodeUTF16(const char *s,
				1295	int size,
				1296	const char *errors,
				1297	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1298	{
				1299	PyUnicodeObject *unicode;
				1300	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1301	const unsigned char q, e;
				1302	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1303	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1304	/* Offsets from q for retrieving byte pairs in the right order. */
				1305	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1306	int ihi = 1, ilo = 0;
				1307	#else
				1308	int ihi = 0, ilo = 1;
				1309	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1310
				1311	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1312	if (size & 1) {
				1313	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1314	return NULL;
				1315	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1316	}
				1317
				1318	/* Note: size will always be longer than the resulting Unicode
				1319	character count */
				1320	unicode = _PyUnicode_New(size);
				1321	if (!unicode)
				1322	return NULL;
				1323	if (size == 0)
				1324	return (PyObject *)unicode;
				1325
				1326	/* Unpack UTF-16 encoded data */
				1327	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1328	q = (unsigned char *)s;
				1329	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1330
				1331	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1332	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1333
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1334	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1335	byte order setting accordingly. In native mode, the leading BOM
				1336	mark is skipped, in all other modes, it is copied to the output
				1337	stream as-is (giving a ZWNBSP character). */
				1338	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1339	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1340	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1341	if (bom == 0xFEFF) {
				1342	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1343	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1344	}
				1345	else if (bom == 0xFFFE) {
				1346	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1347	bo = 1;
				1348	}
				1349	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1350	if (bom == 0xFEFF) {
				1351	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1352	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1353	}
				1354	else if (bom == 0xFFFE) {
				1355	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1356	bo = -1;
				1357	}
				1358	#endif
				1359	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1360
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1361	if (bo == -1) {
				1362	/* force LE */
				1363	ihi = 1;
				1364	ilo = 0;
				1365	}
				1366	else if (bo == 1) {
				1367	/* force BE */
				1368	ihi = 0;
				1369	ilo = 1;
				1370	}
				1371
				1372	while (q < e) {
				1373	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1374	q += 2;
				1375
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1376	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1377	*p++ = ch;
				1378	continue;
				1379	}
				1380
				1381	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1382	if (q >= e) {
				1383	errmsg = "unexpected end of data";
				1384	goto utf16Error;
				1385	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1386	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1387	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1388	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1389	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1390	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1391	*p++ = ch;
				1392	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1393	#else
				1394	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1395	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1396	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1397	}
				1398	else {
				1399	errmsg = "illegal UTF-16 surrogate";
				1400	goto utf16Error;
				1401	}
				1402
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1403	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1404	errmsg = "illegal encoding";
				1405	/* Fall through to report the error */
				1406
				1407	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1408	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1409	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1410	}
				1411
				1412	if (byteorder)
				1413	*byteorder = bo;
				1414
				1415	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1416	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1417	goto onError;
				1418
				1419	return (PyObject *)unicode;
				1420
				1421	onError:
				1422	Py_DECREF(unicode);
				1423	return NULL;
				1424	}
				1425
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1426	PyObject *
				1427	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1428	int size,
				1429	const char *errors,
				1430	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1431	{
				1432	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1433	unsigned char *p;
				1434	int i, pairs;
				1435	/* Offsets from p for storing byte pairs in the right order. */
				1436	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1437	int ihi = 1, ilo = 0;
				1438	#else
				1439	int ihi = 0, ilo = 1;
				1440	#endif
				1441
				1442	#define STORECHAR(CH) \
				1443	do { \
				1444	p[ihi] = ((CH) >> 8) & 0xff; \
				1445	p[ilo] = (CH) & 0xff; \
				1446	p += 2; \
				1447	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1448
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1449	for (i = pairs = 0; i < size; i++)
				1450	if (s[i] >= 0x10000)
				1451	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1452	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1453	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1454	if (v == NULL)
				1455	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1456
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1457	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1458	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1459	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1460	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1461	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1462
				1463	if (byteorder == -1) {
				1464	/* force LE */
				1465	ihi = 1;
				1466	ilo = 0;
				1467	}
				1468	else if (byteorder == 1) {
				1469	/* force BE */
				1470	ihi = 0;
				1471	ilo = 1;
				1472	}
				1473
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1474	while (size-- > 0) {
				1475	Py_UNICODE ch = *s++;
				1476	Py_UNICODE ch2 = 0;
				1477	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1478	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1479	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1480	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1481	STORECHAR(ch);
				1482	if (ch2)
				1483	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1484	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1486	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1487	}
				1488
				1489	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1490	{
				1491	if (!PyUnicode_Check(unicode)) {
				1492	PyErr_BadArgument();
				1493	return NULL;
				1494	}
				1495	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1496	PyUnicode_GET_SIZE(unicode),
				1497	NULL,
				1498	0);
				1499	}
				1500
				1501	/* --- Unicode Escape Codec ----------------------------------------------- */
				1502
				1503	static
				1504	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1505	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1506	const char *errors,
				1507	const char *details)
				1508	{
				1509	if ((errors == NULL) \|\|
				1510	(strcmp(errors,"strict") == 0)) {
				1511	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1512	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1513	details);
				1514	return -1;
				1515	}
				1516	else if (strcmp(errors,"ignore") == 0) {
				1517	return 0;
				1518	}
				1519	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1520	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1521	return 0;
				1522	}
				1523	else {
				1524	PyErr_Format(PyExc_ValueError,
				1525	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1526	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1527	errors);
				1528	return -1;
				1529	}
				1530	}
				1531
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1532	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1533
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1534	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1535	int size,
				1536	const char *errors)
				1537	{
				1538	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1539	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1540	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1541	char* message;
				1542	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1543
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1544	/* Escaped strings will always be longer than the resulting
				1545	Unicode string, so we start with size here and then reduce the
				1546	length after conversion to the true value. */
				1547	v = _PyUnicode_New(size);
				1548	if (v == NULL)
				1549	goto onError;
				1550	if (size == 0)
				1551	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1552
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1553	p = buf = PyUnicode_AS_UNICODE(v);
				1554	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1555
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1556	while (s < end) {
				1557	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1558	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1559	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1560
				1561	/* Non-escape characters are interpreted as Unicode ordinals */
				1562	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1563	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1564	continue;
				1565	}
				1566
				1567	/* \ - Escapes */
				1568	s++;
				1569	switch (*s++) {
				1570
				1571	/* \x escapes */
				1572	case '\n': break;
				1573	case '\\': *p++ = '\\'; break;
				1574	case '\'': *p++ = '\''; break;
				1575	case '\"': *p++ = '\"'; break;
				1576	case 'b': *p++ = '\b'; break;
				1577	case 'f': p++ = '\014'; break; / FF */
				1578	case 't': *p++ = '\t'; break;
				1579	case 'n': *p++ = '\n'; break;
				1580	case 'r': *p++ = '\r'; break;
				1581	case 'v': p++ = '\013'; break; / VT */
				1582	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1583
				1584	/* \OOO (octal) escapes */
				1585	case '0': case '1': case '2': case '3':
				1586	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1587	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1588	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1589	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1591	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1592	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1593	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1594	break;
				1595
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1596	/* hex escapes */
				1597	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1598	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1599	digits = 2;
				1600	message = "truncated \\xXX escape";
				1601	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1602
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1603	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1604	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1605	digits = 4;
				1606	message = "truncated \\uXXXX escape";
				1607	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1608
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1609	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1610	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1611	digits = 8;
				1612	message = "truncated \\UXXXXXXXX escape";
				1613	hexescape:
				1614	chr = 0;
				1615	for (i = 0; i < digits; i++) {
				1616	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1617	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1618	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1619	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1620	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1621	i++;
				1622	break;
				1623	}
				1624	chr = (chr<<4) & ~0xF;
				1625	if (c >= '0' && c <= '9')
				1626	chr += c - '0';
				1627	else if (c >= 'a' && c <= 'f')
				1628	chr += 10 + c - 'a';
				1629	else
				1630	chr += 10 + c - 'A';
				1631	}
				1632	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1633	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1634	/* when we get here, chr is a 32-bit unicode character */
				1635	if (chr <= 0xffff)
				1636	/* UCS-2 character */
				1637	*p++ = (Py_UNICODE) chr;
				1638	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1639	/* UCS-4 character. Either store directly, or as
				1640	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1641	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1642	*p++ = chr;
				1643	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1644	chr -= 0x10000L;
				1645	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1646	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1647	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1648	} else {
				1649	if (unicodeescape_decoding_error(
				1650	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1651	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1652	)
				1653	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1654	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1655	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1656	break;
				1657
				1658	/* \N{name} */
				1659	case 'N':
				1660	message = "malformed \\N character escape";
				1661	if (ucnhash_CAPI == NULL) {
				1662	/* load the unicode data module */
				1663	PyObject m, v;
				1664	m = PyImport_ImportModule("unicodedata");
				1665	if (m == NULL)
				1666	goto ucnhashError;
				1667	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1668	Py_DECREF(m);
				1669	if (v == NULL)
				1670	goto ucnhashError;
				1671	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1672	Py_DECREF(v);
				1673	if (ucnhash_CAPI == NULL)
				1674	goto ucnhashError;
				1675	}
				1676	if (*s == '{') {
				1677	const char *start = s+1;
				1678	/* look for the closing brace */
				1679	while (*s != '}' && s < end)
				1680	s++;
				1681	if (s > start && s < end && *s == '}') {
				1682	/* found a name. look it up in the unicode database */
				1683	message = "unknown Unicode character name";
				1684	s++;
				1685	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1686	goto store;
				1687	}
				1688	}
				1689	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1690	goto onError;
				1691	*p++ = x;
				1692	break;
				1693
				1694	default:
				1695	*p++ = '\\';
				1696	*p++ = (unsigned char)s[-1];
				1697	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1698	}
				1699	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1700	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1701	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1702	return (PyObject *)v;
				1703
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1704	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1705	PyErr_SetString(
				1706	PyExc_UnicodeError,
				1707	"\\N escapes not supported (can't load unicodedata module)"
				1708	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1709	return NULL;
				1710
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1711	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1712	Py_XDECREF(v);
				1713	return NULL;
				1714	}
				1715
				1716	/* Return a Unicode-Escape string version of the Unicode object.
				1717
				1718	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1719	appropriate.
				1720
				1721	*/
				1722
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1723	static const Py_UNICODE findchar(const Py_UNICODE s,
				1724	int size,
				1725	Py_UNICODE ch);
				1726
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1727	static
				1728	PyObject unicodeescape_string(const Py_UNICODE s,
				1729	int size,
				1730	int quotes)
				1731	{
				1732	PyObject *repr;
				1733	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1734
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1735	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1736
				1737	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1738	if (repr == NULL)
				1739	return NULL;
				1740
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1741	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1742
				1743	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1744	*p++ = 'u';
				1745	*p++ = (findchar(s, size, '\'') &&
				1746	!findchar(s, size, '"')) ? '"' : '\'';
				1747	}
				1748	while (size-- > 0) {
				1749	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1750
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1751	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1752	if (quotes &&
				1753	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1754	*p++ = '\\';
				1755	*p++ = (char) ch;
Guido van Rossum	ad9744a	2001-09-21 15:38:17 +0000	[diff] [blame]	1756	continue;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1757	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1758
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1759	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1760	/* Map 21-bit characters to '\U00xxxxxx' */
				1761	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1762	int offset = p - PyString_AS_STRING(repr);
				1763
				1764	/* Resize the string if necessary */
				1765	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1766	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1767	goto onError;
				1768	p = PyString_AS_STRING(repr) + offset;
				1769	}
				1770
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1771	*p++ = '\\';
				1772	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1773	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1774	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1775	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1776	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1777	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1778	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1779	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1780	*p++ = hexdigit[ch & 0x0000000F];
				1781	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1782	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1783	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1784	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1785	else if (ch >= 0xD800 && ch < 0xDC00) {
				1786	Py_UNICODE ch2;
				1787	Py_UCS4 ucs;
				1788
				1789	ch2 = *s++;
				1790	size--;
				1791	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1792	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1793	*p++ = '\\';
				1794	*p++ = 'U';
				1795	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1796	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1797	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1798	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1799	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1800	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1801	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1802	*p++ = hexdigit[ucs & 0x0000000F];
				1803	continue;
				1804	}
				1805	/* Fall through: isolated surrogates are copied as-is */
				1806	s--;
				1807	size++;
				1808	}
				1809
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1810	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1811	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1812	*p++ = '\\';
				1813	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1814	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1815	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1816	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1817	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1818	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1819
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1820	/* Map special whitespace to '\t', \n', '\r' */
				1821	else if (ch == '\t') {
				1822	*p++ = '\\';
				1823	*p++ = 't';
				1824	}
				1825	else if (ch == '\n') {
				1826	*p++ = '\\';
				1827	*p++ = 'n';
				1828	}
				1829	else if (ch == '\r') {
				1830	*p++ = '\\';
				1831	*p++ = 'r';
				1832	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1833
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1834	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1835	else if (ch < ' ' \|\| ch >= 128) {
				1836	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1837	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1838	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1839	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1840	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1841
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1842	/* Copy everything else as-is */
				1843	else
				1844	*p++ = (char) ch;
				1845	}
				1846	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1847	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1848
				1849	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1850	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1851	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1852
				1853	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1854
				1855	onError:
				1856	Py_DECREF(repr);
				1857	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1858	}
				1859
				1860	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1861	int size)
				1862	{
				1863	return unicodeescape_string(s, size, 0);
				1864	}
				1865
				1866	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1867	{
				1868	if (!PyUnicode_Check(unicode)) {
				1869	PyErr_BadArgument();
				1870	return NULL;
				1871	}
				1872	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1873	PyUnicode_GET_SIZE(unicode));
				1874	}
				1875
				1876	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1877
				1878	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1879	int size,
				1880	const char *errors)
				1881	{
				1882	PyUnicodeObject *v;
				1883	Py_UNICODE p, buf;
				1884	const char *end;
				1885	const char *bs;
				1886
				1887	/* Escaped strings will always be longer than the resulting
				1888	Unicode string, so we start with size here and then reduce the
				1889	length after conversion to the true value. */
				1890	v = _PyUnicode_New(size);
				1891	if (v == NULL)
				1892	goto onError;
				1893	if (size == 0)
				1894	return (PyObject *)v;
				1895	p = buf = PyUnicode_AS_UNICODE(v);
				1896	end = s + size;
				1897	while (s < end) {
				1898	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1899	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1900	int i;
				1901
				1902	/* Non-escape characters are interpreted as Unicode ordinals */
				1903	if (*s != '\\') {
				1904	p++ = (unsigned char)s++;
				1905	continue;
				1906	}
				1907
				1908	/* \u-escapes are only interpreted iff the number of leading
				1909	backslashes if odd */
				1910	bs = s;
				1911	for (;s < end;) {
				1912	if (*s != '\\')
				1913	break;
				1914	p++ = (unsigned char)s++;
				1915	}
				1916	if (((s - bs) & 1) == 0 \|\|
				1917	s >= end \|\|
				1918	*s != 'u') {
				1919	continue;
				1920	}
				1921	p--;
				1922	s++;
				1923
				1924	/* \uXXXX with 4 hex digits */
				1925	for (x = 0, i = 0; i < 4; i++) {
				1926	c = (unsigned char)s[i];
				1927	if (!isxdigit(c)) {
				1928	if (unicodeescape_decoding_error(&s, &x, errors,
				1929	"truncated \\uXXXX"))
				1930	goto onError;
				1931	i++;
				1932	break;
				1933	}
				1934	x = (x<<4) & ~0xF;
				1935	if (c >= '0' && c <= '9')
				1936	x += c - '0';
				1937	else if (c >= 'a' && c <= 'f')
				1938	x += 10 + c - 'a';
				1939	else
				1940	x += 10 + c - 'A';
				1941	}
				1942	s += i;
				1943	*p++ = x;
				1944	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1945	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1946	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1947	return (PyObject *)v;
				1948
				1949	onError:
				1950	Py_XDECREF(v);
				1951	return NULL;
				1952	}
				1953
				1954	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1955	int size)
				1956	{
				1957	PyObject *repr;
				1958	char *p;
				1959	char *q;
				1960
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1961	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1962
				1963	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1964	if (repr == NULL)
				1965	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1966	if (size == 0)
				1967	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1968
				1969	p = q = PyString_AS_STRING(repr);
				1970	while (size-- > 0) {
				1971	Py_UNICODE ch = *s++;
				1972	/* Map 16-bit characters to '\uxxxx' */
				1973	if (ch >= 256) {
				1974	*p++ = '\\';
				1975	*p++ = 'u';
				1976	*p++ = hexdigit[(ch >> 12) & 0xf];
				1977	*p++ = hexdigit[(ch >> 8) & 0xf];
				1978	*p++ = hexdigit[(ch >> 4) & 0xf];
				1979	*p++ = hexdigit[ch & 15];
				1980	}
				1981	/* Copy everything else as-is */
				1982	else
				1983	*p++ = (char) ch;
				1984	}
				1985	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1986	if (_PyString_Resize(&repr, p - q))
				1987	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1988
				1989	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1990
				1991	onError:
				1992	Py_DECREF(repr);
				1993	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1994	}
				1995
				1996	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1997	{
				1998	if (!PyUnicode_Check(unicode)) {
				1999	PyErr_BadArgument();
				2000	return NULL;
				2001	}
				2002	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				2003	PyUnicode_GET_SIZE(unicode));
				2004	}
				2005
				2006	/* --- Latin-1 Codec ------------------------------------------------------ */
				2007
				2008	PyObject PyUnicode_DecodeLatin1(const char s,
				2009	int size,
				2010	const char *errors)
				2011	{
				2012	PyUnicodeObject *v;
				2013	Py_UNICODE *p;
				2014
				2015	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2016	if (size == 1 && (unsigned char)s < 256) {
				2017	Py_UNICODE r = (unsigned char)s;
				2018	return PyUnicode_FromUnicode(&r, 1);
				2019	}
				2020
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2021	v = _PyUnicode_New(size);
				2022	if (v == NULL)
				2023	goto onError;
				2024	if (size == 0)
				2025	return (PyObject *)v;
				2026	p = PyUnicode_AS_UNICODE(v);
				2027	while (size-- > 0)
				2028	p++ = (unsigned char)s++;
				2029	return (PyObject *)v;
				2030
				2031	onError:
				2032	Py_XDECREF(v);
				2033	return NULL;
				2034	}
				2035
				2036	static
				2037	int latin1_encoding_error(const Py_UNICODE **source,
				2038	char **dest,
				2039	const char *errors,
				2040	const char *details)
				2041	{
				2042	if ((errors == NULL) \|\|
				2043	(strcmp(errors,"strict") == 0)) {
				2044	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2045	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2046	details);
				2047	return -1;
				2048	}
				2049	else if (strcmp(errors,"ignore") == 0) {
				2050	return 0;
				2051	}
				2052	else if (strcmp(errors,"replace") == 0) {
				2053	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2054	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2055	return 0;
				2056	}
				2057	else {
				2058	PyErr_Format(PyExc_ValueError,
				2059	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2060	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2061	errors);
				2062	return -1;
				2063	}
				2064	}
				2065
				2066	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2067	int size,
				2068	const char *errors)
				2069	{
				2070	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2071	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2072
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2073	repr = PyString_FromStringAndSize(NULL, size);
				2074	if (repr == NULL)
				2075	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2076	if (size == 0)
				2077	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2078
				2079	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2080	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2081	while (size-- > 0) {
				2082	Py_UNICODE ch = *p++;
				2083	if (ch >= 256) {
				2084	if (latin1_encoding_error(&p, &s, errors,
				2085	"ordinal not in range(256)"))
				2086	goto onError;
				2087	}
				2088	else
				2089	*s++ = (char)ch;
				2090	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2091	/* Resize if error handling skipped some characters */
				2092	if (s - start < PyString_GET_SIZE(repr))
				2093	if (_PyString_Resize(&repr, s - start))
				2094	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2095	return repr;
				2096
				2097	onError:
				2098	Py_DECREF(repr);
				2099	return NULL;
				2100	}
				2101
				2102	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2103	{
				2104	if (!PyUnicode_Check(unicode)) {
				2105	PyErr_BadArgument();
				2106	return NULL;
				2107	}
				2108	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2109	PyUnicode_GET_SIZE(unicode),
				2110	NULL);
				2111	}
				2112
				2113	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2114
				2115	static
				2116	int ascii_decoding_error(const char **source,
				2117	Py_UNICODE **dest,
				2118	const char *errors,
				2119	const char *details)
				2120	{
				2121	if ((errors == NULL) \|\|
				2122	(strcmp(errors,"strict") == 0)) {
				2123	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2124	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2125	details);
				2126	return -1;
				2127	}
				2128	else if (strcmp(errors,"ignore") == 0) {
				2129	return 0;
				2130	}
				2131	else if (strcmp(errors,"replace") == 0) {
				2132	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2133	(*dest)++;
				2134	return 0;
				2135	}
				2136	else {
				2137	PyErr_Format(PyExc_ValueError,
				2138	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2139	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2140	errors);
				2141	return -1;
				2142	}
				2143	}
				2144
				2145	PyObject PyUnicode_DecodeASCII(const char s,
				2146	int size,
				2147	const char *errors)
				2148	{
				2149	PyUnicodeObject *v;
				2150	Py_UNICODE *p;
				2151
				2152	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2153	if (size == 1 && (unsigned char)s < 128) {
				2154	Py_UNICODE r = (unsigned char)s;
				2155	return PyUnicode_FromUnicode(&r, 1);
				2156	}
				2157
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2158	v = _PyUnicode_New(size);
				2159	if (v == NULL)
				2160	goto onError;
				2161	if (size == 0)
				2162	return (PyObject *)v;
				2163	p = PyUnicode_AS_UNICODE(v);
				2164	while (size-- > 0) {
				2165	register unsigned char c;
				2166
				2167	c = (unsigned char)*s++;
				2168	if (c < 128)
				2169	*p++ = c;
				2170	else if (ascii_decoding_error(&s, &p, errors,
				2171	"ordinal not in range(128)"))
				2172	goto onError;
				2173	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2174	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2175	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2176	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2177	return (PyObject *)v;
				2178
				2179	onError:
				2180	Py_XDECREF(v);
				2181	return NULL;
				2182	}
				2183
				2184	static
				2185	int ascii_encoding_error(const Py_UNICODE **source,
				2186	char **dest,
				2187	const char *errors,
				2188	const char *details)
				2189	{
				2190	if ((errors == NULL) \|\|
				2191	(strcmp(errors,"strict") == 0)) {
				2192	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2193	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2194	details);
				2195	return -1;
				2196	}
				2197	else if (strcmp(errors,"ignore") == 0) {
				2198	return 0;
				2199	}
				2200	else if (strcmp(errors,"replace") == 0) {
				2201	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2202	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2203	return 0;
				2204	}
				2205	else {
				2206	PyErr_Format(PyExc_ValueError,
				2207	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2208	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2209	errors);
				2210	return -1;
				2211	}
				2212	}
				2213
				2214	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2215	int size,
				2216	const char *errors)
				2217	{
				2218	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2219	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2220
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2221	repr = PyString_FromStringAndSize(NULL, size);
				2222	if (repr == NULL)
				2223	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2224	if (size == 0)
				2225	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2226
				2227	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2228	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2229	while (size-- > 0) {
				2230	Py_UNICODE ch = *p++;
				2231	if (ch >= 128) {
				2232	if (ascii_encoding_error(&p, &s, errors,
				2233	"ordinal not in range(128)"))
				2234	goto onError;
				2235	}
				2236	else
				2237	*s++ = (char)ch;
				2238	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2239	/* Resize if error handling skipped some characters */
				2240	if (s - start < PyString_GET_SIZE(repr))
				2241	if (_PyString_Resize(&repr, s - start))
				2242	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2243	return repr;
				2244
				2245	onError:
				2246	Py_DECREF(repr);
				2247	return NULL;
				2248	}
				2249
				2250	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2251	{
				2252	if (!PyUnicode_Check(unicode)) {
				2253	PyErr_BadArgument();
				2254	return NULL;
				2255	}
				2256	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2257	PyUnicode_GET_SIZE(unicode),
				2258	NULL);
				2259	}
				2260
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2261	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2262
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2263	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2264
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2265	PyObject PyUnicode_DecodeMBCS(const char s,
				2266	int size,
				2267	const char *errors)
				2268	{
				2269	PyUnicodeObject *v;
				2270	Py_UNICODE *p;
				2271
				2272	/* First get the size of the result */
				2273	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2274	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2275	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2276
				2277	v = _PyUnicode_New(usize);
				2278	if (v == NULL)
				2279	return NULL;
				2280	if (usize == 0)
				2281	return (PyObject *)v;
				2282	p = PyUnicode_AS_UNICODE(v);
				2283	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2284	Py_DECREF(v);
				2285	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2286	}
				2287
				2288	return (PyObject *)v;
				2289	}
				2290
				2291	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2292	int size,
				2293	const char *errors)
				2294	{
				2295	PyObject *repr;
				2296	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2297	DWORD mbcssize;
				2298
				2299	/* If there are no characters, bail now! */
				2300	if (size==0)
				2301	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2302
				2303	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2304	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2305	if (mbcssize==0)
				2306	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2307
				2308	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2309	if (repr == NULL)
				2310	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2311	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2312	return repr;
				2313
				2314	/* Do the conversion */
				2315	s = PyString_AS_STRING(repr);
				2316	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2317	Py_DECREF(repr);
				2318	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2319	}
				2320	return repr;
				2321	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2322
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2323	#endif /* MS_WIN32 */
				2324
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2325	/* --- Character Mapping Codec -------------------------------------------- */
				2326
				2327	static
				2328	int charmap_decoding_error(const char **source,
				2329	Py_UNICODE **dest,
				2330	const char *errors,
				2331	const char *details)
				2332	{
				2333	if ((errors == NULL) \|\|
				2334	(strcmp(errors,"strict") == 0)) {
				2335	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2336	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2337	details);
				2338	return -1;
				2339	}
				2340	else if (strcmp(errors,"ignore") == 0) {
				2341	return 0;
				2342	}
				2343	else if (strcmp(errors,"replace") == 0) {
				2344	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2345	(*dest)++;
				2346	return 0;
				2347	}
				2348	else {
				2349	PyErr_Format(PyExc_ValueError,
				2350	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2351	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2352	errors);
				2353	return -1;
				2354	}
				2355	}
				2356
				2357	PyObject PyUnicode_DecodeCharmap(const char s,
				2358	int size,
				2359	PyObject *mapping,
				2360	const char *errors)
				2361	{
				2362	PyUnicodeObject *v;
				2363	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2364	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2365
				2366	/* Default to Latin-1 */
				2367	if (mapping == NULL)
				2368	return PyUnicode_DecodeLatin1(s, size, errors);
				2369
				2370	v = _PyUnicode_New(size);
				2371	if (v == NULL)
				2372	goto onError;
				2373	if (size == 0)
				2374	return (PyObject *)v;
				2375	p = PyUnicode_AS_UNICODE(v);
				2376	while (size-- > 0) {
				2377	unsigned char ch = *s++;
				2378	PyObject w, x;
				2379
				2380	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2381	w = PyInt_FromLong((long)ch);
				2382	if (w == NULL)
				2383	goto onError;
				2384	x = PyObject_GetItem(mapping, w);
				2385	Py_DECREF(w);
				2386	if (x == NULL) {
				2387	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2388	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2389	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2390	x = Py_None;
				2391	Py_INCREF(x);
				2392	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2393	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2394	}
				2395
				2396	/* Apply mapping */
				2397	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2398	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2399	if (value < 0 \|\| value > 65535) {
				2400	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2401	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2402	Py_DECREF(x);
				2403	goto onError;
				2404	}
				2405	*p++ = (Py_UNICODE)value;
				2406	}
				2407	else if (x == Py_None) {
				2408	/* undefined mapping */
				2409	if (charmap_decoding_error(&s, &p, errors,
				2410	"character maps to <undefined>")) {
				2411	Py_DECREF(x);
				2412	goto onError;
				2413	}
				2414	}
				2415	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2416	int targetsize = PyUnicode_GET_SIZE(x);
				2417
				2418	if (targetsize == 1)
				2419	/* 1-1 mapping */
				2420	p++ = PyUnicode_AS_UNICODE(x);
				2421
				2422	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2423	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2424	if (targetsize > extrachars) {
				2425	/* resize first */
				2426	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2427	int needed = (targetsize - extrachars) + \
				2428	(targetsize << 2);
				2429	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2430	if (_PyUnicode_Resize(&v,
				2431	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2432	Py_DECREF(x);
				2433	goto onError;
				2434	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2435	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2436	}
				2437	Py_UNICODE_COPY(p,
				2438	PyUnicode_AS_UNICODE(x),
				2439	targetsize);
				2440	p += targetsize;
				2441	extrachars -= targetsize;
				2442	}
				2443	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2444	}
				2445	else {
				2446	/* wrong return value */
				2447	PyErr_SetString(PyExc_TypeError,
				2448	"character mapping must return integer, None or unicode");
				2449	Py_DECREF(x);
				2450	goto onError;
				2451	}
				2452	Py_DECREF(x);
				2453	}
				2454	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2455	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2456	goto onError;
				2457	return (PyObject *)v;
				2458
				2459	onError:
				2460	Py_XDECREF(v);
				2461	return NULL;
				2462	}
				2463
				2464	static
				2465	int charmap_encoding_error(const Py_UNICODE **source,
				2466	char **dest,
				2467	const char *errors,
				2468	const char *details)
				2469	{
				2470	if ((errors == NULL) \|\|
				2471	(strcmp(errors,"strict") == 0)) {
				2472	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2473	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2474	details);
				2475	return -1;
				2476	}
				2477	else if (strcmp(errors,"ignore") == 0) {
				2478	return 0;
				2479	}
				2480	else if (strcmp(errors,"replace") == 0) {
				2481	**dest = '?';
				2482	(*dest)++;
				2483	return 0;
				2484	}
				2485	else {
				2486	PyErr_Format(PyExc_ValueError,
				2487	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2488	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2489	errors);
				2490	return -1;
				2491	}
				2492	}
				2493
				2494	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2495	int size,
				2496	PyObject *mapping,
				2497	const char *errors)
				2498	{
				2499	PyObject *v;
				2500	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2501	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2502
				2503	/* Default to Latin-1 */
				2504	if (mapping == NULL)
				2505	return PyUnicode_EncodeLatin1(p, size, errors);
				2506
				2507	v = PyString_FromStringAndSize(NULL, size);
				2508	if (v == NULL)
				2509	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2510	if (size == 0)
				2511	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2512	s = PyString_AS_STRING(v);
				2513	while (size-- > 0) {
				2514	Py_UNICODE ch = *p++;
				2515	PyObject w, x;
				2516
				2517	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2518	w = PyInt_FromLong((long)ch);
				2519	if (w == NULL)
				2520	goto onError;
				2521	x = PyObject_GetItem(mapping, w);
				2522	Py_DECREF(w);
				2523	if (x == NULL) {
				2524	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2525	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2526	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2527	x = Py_None;
				2528	Py_INCREF(x);
				2529	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2530	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2531	}
				2532
				2533	/* Apply mapping */
				2534	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2535	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2536	if (value < 0 \|\| value > 255) {
				2537	PyErr_SetString(PyExc_TypeError,
				2538	"character mapping must be in range(256)");
				2539	Py_DECREF(x);
				2540	goto onError;
				2541	}
				2542	*s++ = (char)value;
				2543	}
				2544	else if (x == Py_None) {
				2545	/* undefined mapping */
				2546	if (charmap_encoding_error(&p, &s, errors,
				2547	"character maps to <undefined>")) {
				2548	Py_DECREF(x);
				2549	goto onError;
				2550	}
				2551	}
				2552	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2553	int targetsize = PyString_GET_SIZE(x);
				2554
				2555	if (targetsize == 1)
				2556	/* 1-1 mapping */
				2557	s++ = PyString_AS_STRING(x);
				2558
				2559	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2560	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2561	if (targetsize > extrachars) {
				2562	/* resize first */
				2563	int oldpos = (int)(s - PyString_AS_STRING(v));
				2564	int needed = (targetsize - extrachars) + \
				2565	(targetsize << 2);
				2566	extrachars += needed;
				2567	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2568	Py_DECREF(x);
				2569	goto onError;
				2570	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2571	s = PyString_AS_STRING(v) + oldpos;
				2572	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2573	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2574	s += targetsize;
				2575	extrachars -= targetsize;
				2576	}
				2577	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2578	}
				2579	else {
				2580	/* wrong return value */
				2581	PyErr_SetString(PyExc_TypeError,
				2582	"character mapping must return integer, None or unicode");
				2583	Py_DECREF(x);
				2584	goto onError;
				2585	}
				2586	Py_DECREF(x);
				2587	}
				2588	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2589	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2590	goto onError;
				2591	return v;
				2592
				2593	onError:
				2594	Py_DECREF(v);
				2595	return NULL;
				2596	}
				2597
				2598	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2599	PyObject *mapping)
				2600	{
				2601	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2602	PyErr_BadArgument();
				2603	return NULL;
				2604	}
				2605	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2606	PyUnicode_GET_SIZE(unicode),
				2607	mapping,
				2608	NULL);
				2609	}
				2610
				2611	static
				2612	int translate_error(const Py_UNICODE **source,
				2613	Py_UNICODE **dest,
				2614	const char *errors,
				2615	const char *details)
				2616	{
				2617	if ((errors == NULL) \|\|
				2618	(strcmp(errors,"strict") == 0)) {
				2619	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2620	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2621	details);
				2622	return -1;
				2623	}
				2624	else if (strcmp(errors,"ignore") == 0) {
				2625	return 0;
				2626	}
				2627	else if (strcmp(errors,"replace") == 0) {
				2628	**dest = '?';
				2629	(*dest)++;
				2630	return 0;
				2631	}
				2632	else {
				2633	PyErr_Format(PyExc_ValueError,
				2634	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2635	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2636	errors);
				2637	return -1;
				2638	}
				2639	}
				2640
				2641	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2642	int size,
				2643	PyObject *mapping,
				2644	const char *errors)
				2645	{
				2646	PyUnicodeObject *v;
				2647	Py_UNICODE *p;
				2648
				2649	if (mapping == NULL) {
				2650	PyErr_BadArgument();
				2651	return NULL;
				2652	}
				2653
				2654	/* Output will never be longer than input */
				2655	v = _PyUnicode_New(size);
				2656	if (v == NULL)
				2657	goto onError;
				2658	if (size == 0)
				2659	goto done;
				2660	p = PyUnicode_AS_UNICODE(v);
				2661	while (size-- > 0) {
				2662	Py_UNICODE ch = *s++;
				2663	PyObject w, x;
				2664
				2665	/* Get mapping */
				2666	w = PyInt_FromLong(ch);
				2667	if (w == NULL)
				2668	goto onError;
				2669	x = PyObject_GetItem(mapping, w);
				2670	Py_DECREF(w);
				2671	if (x == NULL) {
				2672	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2673	/* No mapping found: default to 1-1 mapping */
				2674	PyErr_Clear();
				2675	*p++ = ch;
				2676	continue;
				2677	}
				2678	goto onError;
				2679	}
				2680
				2681	/* Apply mapping */
				2682	if (PyInt_Check(x))
				2683	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2684	else if (x == Py_None) {
				2685	/* undefined mapping */
				2686	if (translate_error(&s, &p, errors,
				2687	"character maps to <undefined>")) {
				2688	Py_DECREF(x);
				2689	goto onError;
				2690	}
				2691	}
				2692	else if (PyUnicode_Check(x)) {
				2693	if (PyUnicode_GET_SIZE(x) != 1) {
				2694	/* 1-n mapping */
				2695	PyErr_SetString(PyExc_NotImplementedError,
				2696	"1-n mappings are currently not implemented");
				2697	Py_DECREF(x);
				2698	goto onError;
				2699	}
				2700	p++ = PyUnicode_AS_UNICODE(x);
				2701	}
				2702	else {
				2703	/* wrong return value */
				2704	PyErr_SetString(PyExc_TypeError,
				2705	"translate mapping must return integer, None or unicode");
				2706	Py_DECREF(x);
				2707	goto onError;
				2708	}
				2709	Py_DECREF(x);
				2710	}
				2711	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2712	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2713	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2714
				2715	done:
				2716	return (PyObject *)v;
				2717
				2718	onError:
				2719	Py_XDECREF(v);
				2720	return NULL;
				2721	}
				2722
				2723	PyObject PyUnicode_Translate(PyObject str,
				2724	PyObject *mapping,
				2725	const char *errors)
				2726	{
				2727	PyObject *result;
				2728
				2729	str = PyUnicode_FromObject(str);
				2730	if (str == NULL)
				2731	goto onError;
				2732	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2733	PyUnicode_GET_SIZE(str),
				2734	mapping,
				2735	errors);
				2736	Py_DECREF(str);
				2737	return result;
				2738
				2739	onError:
				2740	Py_XDECREF(str);
				2741	return NULL;
				2742	}
				2743
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2744	/* --- Decimal Encoder ---------------------------------------------------- */
				2745
				2746	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2747	int length,
				2748	char *output,
				2749	const char *errors)
				2750	{
				2751	Py_UNICODE p, end;
				2752
				2753	if (output == NULL) {
				2754	PyErr_BadArgument();
				2755	return -1;
				2756	}
				2757
				2758	p = s;
				2759	end = s + length;
				2760	while (p < end) {
				2761	register Py_UNICODE ch = *p++;
				2762	int decimal;
				2763
				2764	if (Py_UNICODE_ISSPACE(ch)) {
				2765	*output++ = ' ';
				2766	continue;
				2767	}
				2768	decimal = Py_UNICODE_TODECIMAL(ch);
				2769	if (decimal >= 0) {
				2770	*output++ = '0' + decimal;
				2771	continue;
				2772	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2773	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2774	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2775	continue;
				2776	}
				2777	/* All other characters are considered invalid */
				2778	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2779	PyErr_SetString(PyExc_ValueError,
				2780	"invalid decimal Unicode string");
				2781	goto onError;
				2782	}
				2783	else if (strcmp(errors, "ignore") == 0)
				2784	continue;
				2785	else if (strcmp(errors, "replace") == 0) {
				2786	*output++ = '?';
				2787	continue;
				2788	}
				2789	}
				2790	/* 0-terminate the output string */
				2791	*output++ = '\0';
				2792	return 0;
				2793
				2794	onError:
				2795	return -1;
				2796	}
				2797
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2798	/* --- Helpers ------------------------------------------------------------ */
				2799
				2800	static
				2801	int count(PyUnicodeObject *self,
				2802	int start,
				2803	int end,
				2804	PyUnicodeObject *substring)
				2805	{
				2806	int count = 0;
				2807
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2808	if (start < 0)
				2809	start += self->length;
				2810	if (start < 0)
				2811	start = 0;
				2812	if (end > self->length)
				2813	end = self->length;
				2814	if (end < 0)
				2815	end += self->length;
				2816	if (end < 0)
				2817	end = 0;
				2818
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2819	if (substring->length == 0)
				2820	return (end - start + 1);
				2821
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2822	end -= substring->length;
				2823
				2824	while (start <= end)
				2825	if (Py_UNICODE_MATCH(self, start, substring)) {
				2826	count++;
				2827	start += substring->length;
				2828	} else
				2829	start++;
				2830
				2831	return count;
				2832	}
				2833
				2834	int PyUnicode_Count(PyObject *str,
				2835	PyObject *substr,
				2836	int start,
				2837	int end)
				2838	{
				2839	int result;
				2840
				2841	str = PyUnicode_FromObject(str);
				2842	if (str == NULL)
				2843	return -1;
				2844	substr = PyUnicode_FromObject(substr);
				2845	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2846	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2847	return -1;
				2848	}
				2849
				2850	result = count((PyUnicodeObject *)str,
				2851	start, end,
				2852	(PyUnicodeObject *)substr);
				2853
				2854	Py_DECREF(str);
				2855	Py_DECREF(substr);
				2856	return result;
				2857	}
				2858
				2859	static
				2860	int findstring(PyUnicodeObject *self,
				2861	PyUnicodeObject *substring,
				2862	int start,
				2863	int end,
				2864	int direction)
				2865	{
				2866	if (start < 0)
				2867	start += self->length;
				2868	if (start < 0)
				2869	start = 0;
				2870
				2871	if (substring->length == 0)
				2872	return start;
				2873
				2874	if (end > self->length)
				2875	end = self->length;
				2876	if (end < 0)
				2877	end += self->length;
				2878	if (end < 0)
				2879	end = 0;
				2880
				2881	end -= substring->length;
				2882
				2883	if (direction < 0) {
				2884	for (; end >= start; end--)
				2885	if (Py_UNICODE_MATCH(self, end, substring))
				2886	return end;
				2887	} else {
				2888	for (; start <= end; start++)
				2889	if (Py_UNICODE_MATCH(self, start, substring))
				2890	return start;
				2891	}
				2892
				2893	return -1;
				2894	}
				2895
				2896	int PyUnicode_Find(PyObject *str,
				2897	PyObject *substr,
				2898	int start,
				2899	int end,
				2900	int direction)
				2901	{
				2902	int result;
				2903
				2904	str = PyUnicode_FromObject(str);
				2905	if (str == NULL)
				2906	return -1;
				2907	substr = PyUnicode_FromObject(substr);
				2908	if (substr == NULL) {
				2909	Py_DECREF(substr);
				2910	return -1;
				2911	}
				2912
				2913	result = findstring((PyUnicodeObject *)str,
				2914	(PyUnicodeObject *)substr,
				2915	start, end, direction);
				2916	Py_DECREF(str);
				2917	Py_DECREF(substr);
				2918	return result;
				2919	}
				2920
				2921	static
				2922	int tailmatch(PyUnicodeObject *self,
				2923	PyUnicodeObject *substring,
				2924	int start,
				2925	int end,
				2926	int direction)
				2927	{
				2928	if (start < 0)
				2929	start += self->length;
				2930	if (start < 0)
				2931	start = 0;
				2932
				2933	if (substring->length == 0)
				2934	return 1;
				2935
				2936	if (end > self->length)
				2937	end = self->length;
				2938	if (end < 0)
				2939	end += self->length;
				2940	if (end < 0)
				2941	end = 0;
				2942
				2943	end -= substring->length;
				2944	if (end < start)
				2945	return 0;
				2946
				2947	if (direction > 0) {
				2948	if (Py_UNICODE_MATCH(self, end, substring))
				2949	return 1;
				2950	} else {
				2951	if (Py_UNICODE_MATCH(self, start, substring))
				2952	return 1;
				2953	}
				2954
				2955	return 0;
				2956	}
				2957
				2958	int PyUnicode_Tailmatch(PyObject *str,
				2959	PyObject *substr,
				2960	int start,
				2961	int end,
				2962	int direction)
				2963	{
				2964	int result;
				2965
				2966	str = PyUnicode_FromObject(str);
				2967	if (str == NULL)
				2968	return -1;
				2969	substr = PyUnicode_FromObject(substr);
				2970	if (substr == NULL) {
				2971	Py_DECREF(substr);
				2972	return -1;
				2973	}
				2974
				2975	result = tailmatch((PyUnicodeObject *)str,
				2976	(PyUnicodeObject *)substr,
				2977	start, end, direction);
				2978	Py_DECREF(str);
				2979	Py_DECREF(substr);
				2980	return result;
				2981	}
				2982
				2983	static
				2984	const Py_UNICODE findchar(const Py_UNICODE s,
				2985	int size,
				2986	Py_UNICODE ch)
				2987	{
				2988	/* like wcschr, but doesn't stop at NULL characters */
				2989
				2990	while (size-- > 0) {
				2991	if (*s == ch)
				2992	return s;
				2993	s++;
				2994	}
				2995
				2996	return NULL;
				2997	}
				2998
				2999	/* Apply fixfct filter to the Unicode object self and return a
				3000	reference to the modified object */
				3001
				3002	static
				3003	PyObject fixup(PyUnicodeObject self,
				3004	int (fixfct)(PyUnicodeObject s))
				3005	{
				3006
				3007	PyUnicodeObject *u;
				3008
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3009	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3010	if (u == NULL)
				3011	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3012
				3013	Py_UNICODE_COPY(u->str, self->str, self->length);
				3014
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3015	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3016	/* fixfct should return TRUE if it modified the buffer. If
				3017	FALSE, return a reference to the original buffer instead
				3018	(to save space, not time) */
				3019	Py_INCREF(self);
				3020	Py_DECREF(u);
				3021	return (PyObject*) self;
				3022	}
				3023	return (PyObject*) u;
				3024	}
				3025
				3026	static
				3027	int fixupper(PyUnicodeObject *self)
				3028	{
				3029	int len = self->length;
				3030	Py_UNICODE *s = self->str;
				3031	int status = 0;
				3032
				3033	while (len-- > 0) {
				3034	register Py_UNICODE ch;
				3035
				3036	ch = Py_UNICODE_TOUPPER(*s);
				3037	if (ch != *s) {
				3038	status = 1;
				3039	*s = ch;
				3040	}
				3041	s++;
				3042	}
				3043
				3044	return status;
				3045	}
				3046
				3047	static
				3048	int fixlower(PyUnicodeObject *self)
				3049	{
				3050	int len = self->length;
				3051	Py_UNICODE *s = self->str;
				3052	int status = 0;
				3053
				3054	while (len-- > 0) {
				3055	register Py_UNICODE ch;
				3056
				3057	ch = Py_UNICODE_TOLOWER(*s);
				3058	if (ch != *s) {
				3059	status = 1;
				3060	*s = ch;
				3061	}
				3062	s++;
				3063	}
				3064
				3065	return status;
				3066	}
				3067
				3068	static
				3069	int fixswapcase(PyUnicodeObject *self)
				3070	{
				3071	int len = self->length;
				3072	Py_UNICODE *s = self->str;
				3073	int status = 0;
				3074
				3075	while (len-- > 0) {
				3076	if (Py_UNICODE_ISUPPER(*s)) {
				3077	s = Py_UNICODE_TOLOWER(s);
				3078	status = 1;
				3079	} else if (Py_UNICODE_ISLOWER(*s)) {
				3080	s = Py_UNICODE_TOUPPER(s);
				3081	status = 1;
				3082	}
				3083	s++;
				3084	}
				3085
				3086	return status;
				3087	}
				3088
				3089	static
				3090	int fixcapitalize(PyUnicodeObject *self)
				3091	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3092	int len = self->length;
				3093	Py_UNICODE *s = self->str;
				3094	int status = 0;
				3095
				3096	if (len == 0)
				3097	return 0;
				3098	if (Py_UNICODE_ISLOWER(*s)) {
				3099	s = Py_UNICODE_TOUPPER(s);
				3100	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3101	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3102	s++;
				3103	while (--len > 0) {
				3104	if (Py_UNICODE_ISUPPER(*s)) {
				3105	s = Py_UNICODE_TOLOWER(s);
				3106	status = 1;
				3107	}
				3108	s++;
				3109	}
				3110	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3111	}
				3112
				3113	static
				3114	int fixtitle(PyUnicodeObject *self)
				3115	{
				3116	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3117	register Py_UNICODE *e;
				3118	int previous_is_cased;
				3119
				3120	/* Shortcut for single character strings */
				3121	if (PyUnicode_GET_SIZE(self) == 1) {
				3122	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3123	if (*p != ch) {
				3124	*p = ch;
				3125	return 1;
				3126	}
				3127	else
				3128	return 0;
				3129	}
				3130
				3131	e = p + PyUnicode_GET_SIZE(self);
				3132	previous_is_cased = 0;
				3133	for (; p < e; p++) {
				3134	register const Py_UNICODE ch = *p;
				3135
				3136	if (previous_is_cased)
				3137	*p = Py_UNICODE_TOLOWER(ch);
				3138	else
				3139	*p = Py_UNICODE_TOTITLE(ch);
				3140
				3141	if (Py_UNICODE_ISLOWER(ch) \|\|
				3142	Py_UNICODE_ISUPPER(ch) \|\|
				3143	Py_UNICODE_ISTITLE(ch))
				3144	previous_is_cased = 1;
				3145	else
				3146	previous_is_cased = 0;
				3147	}
				3148	return 1;
				3149	}
				3150
				3151	PyObject PyUnicode_Join(PyObject separator,
				3152	PyObject *seq)
				3153	{
				3154	Py_UNICODE *sep;
				3155	int seplen;
				3156	PyUnicodeObject *res = NULL;
				3157	int reslen = 0;
				3158	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3159	int sz = 100;
				3160	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3161	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3162
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3163	it = PyObject_GetIter(seq);
				3164	if (it == NULL)
				3165	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3166
				3167	if (separator == NULL) {
				3168	Py_UNICODE blank = ' ';
				3169	sep = &blank;
				3170	seplen = 1;
				3171	}
				3172	else {
				3173	separator = PyUnicode_FromObject(separator);
				3174	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3175	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3176	sep = PyUnicode_AS_UNICODE(separator);
				3177	seplen = PyUnicode_GET_SIZE(separator);
				3178	}
				3179
				3180	res = _PyUnicode_New(sz);
				3181	if (res == NULL)
				3182	goto onError;
				3183	p = PyUnicode_AS_UNICODE(res);
				3184	reslen = 0;
				3185
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3186	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3187	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3188	PyObject *item = PyIter_Next(it);
				3189	if (item == NULL) {
				3190	if (PyErr_Occurred())
				3191	goto onError;
				3192	break;
				3193	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3194	if (!PyUnicode_Check(item)) {
				3195	PyObject *v;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3196	if (!PyString_Check(item)) {
				3197	PyErr_Format(PyExc_TypeError,
				3198	"sequence item %i: expected string or Unicode,"
				3199	" %.80s found",
				3200	i, item->ob_type->tp_name);
				3201	Py_DECREF(item);
				3202	goto onError;
				3203	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3204	v = PyUnicode_FromObject(item);
				3205	Py_DECREF(item);
				3206	item = v;
				3207	if (item == NULL)
				3208	goto onError;
				3209	}
				3210	itemlen = PyUnicode_GET_SIZE(item);
				3211	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3212	if (_PyUnicode_Resize(&res, sz*2)) {
				3213	Py_DECREF(item);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3214	goto onError;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3215	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3216	sz *= 2;
				3217	p = PyUnicode_AS_UNICODE(res) + reslen;
				3218	}
				3219	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3220	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3221	p += seplen;
				3222	reslen += seplen;
				3223	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3224	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3225	p += itemlen;
				3226	reslen += itemlen;
				3227	Py_DECREF(item);
				3228	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3229	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3230	goto onError;
				3231
				3232	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3233	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3234	return (PyObject *)res;
				3235
				3236	onError:
				3237	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3238	Py_XDECREF(res);
				3239	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3240	return NULL;
				3241	}
				3242
				3243	static
				3244	PyUnicodeObject *pad(PyUnicodeObject *self,
				3245	int left,
				3246	int right,
				3247	Py_UNICODE fill)
				3248	{
				3249	PyUnicodeObject *u;
				3250
				3251	if (left < 0)
				3252	left = 0;
				3253	if (right < 0)
				3254	right = 0;
				3255
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3256	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3257	Py_INCREF(self);
				3258	return self;
				3259	}
				3260
				3261	u = _PyUnicode_New(left + self->length + right);
				3262	if (u) {
				3263	if (left)
				3264	Py_UNICODE_FILL(u->str, fill, left);
				3265	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3266	if (right)
				3267	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3268	}
				3269
				3270	return u;
				3271	}
				3272
				3273	#define SPLIT_APPEND(data, left, right) \
				3274	str = PyUnicode_FromUnicode(data + left, right - left); \
				3275	if (!str) \
				3276	goto onError; \
				3277	if (PyList_Append(list, str)) { \
				3278	Py_DECREF(str); \
				3279	goto onError; \
				3280	} \
				3281	else \
				3282	Py_DECREF(str);
				3283
				3284	static
				3285	PyObject *split_whitespace(PyUnicodeObject *self,
				3286	PyObject *list,
				3287	int maxcount)
				3288	{
				3289	register int i;
				3290	register int j;
				3291	int len = self->length;
				3292	PyObject *str;
				3293
				3294	for (i = j = 0; i < len; ) {
				3295	/* find a token */
				3296	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3297	i++;
				3298	j = i;
				3299	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3300	i++;
				3301	if (j < i) {
				3302	if (maxcount-- <= 0)
				3303	break;
				3304	SPLIT_APPEND(self->str, j, i);
				3305	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3306	i++;
				3307	j = i;
				3308	}
				3309	}
				3310	if (j < len) {
				3311	SPLIT_APPEND(self->str, j, len);
				3312	}
				3313	return list;
				3314
				3315	onError:
				3316	Py_DECREF(list);
				3317	return NULL;
				3318	}
				3319
				3320	PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3321	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3322	{
				3323	register int i;
				3324	register int j;
				3325	int len;
				3326	PyObject *list;
				3327	PyObject *str;
				3328	Py_UNICODE *data;
				3329
				3330	string = PyUnicode_FromObject(string);
				3331	if (string == NULL)
				3332	return NULL;
				3333	data = PyUnicode_AS_UNICODE(string);
				3334	len = PyUnicode_GET_SIZE(string);
				3335
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3336	list = PyList_New(0);
				3337	if (!list)
				3338	goto onError;
				3339
				3340	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3341	int eol;
				3342
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3343	/* Find a line and append it */
				3344	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3345	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3346
				3347	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3348	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3349	if (i < len) {
				3350	if (data[i] == '\r' && i + 1 < len &&
				3351	data[i+1] == '\n')
				3352	i += 2;
				3353	else
				3354	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3355	if (keepends)
				3356	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3357	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3358	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3359	j = i;
				3360	}
				3361	if (j < len) {
				3362	SPLIT_APPEND(data, j, len);
				3363	}
				3364
				3365	Py_DECREF(string);
				3366	return list;
				3367
				3368	onError:
				3369	Py_DECREF(list);
				3370	Py_DECREF(string);
				3371	return NULL;
				3372	}
				3373
				3374	static
				3375	PyObject *split_char(PyUnicodeObject *self,
				3376	PyObject *list,
				3377	Py_UNICODE ch,
				3378	int maxcount)
				3379	{
				3380	register int i;
				3381	register int j;
				3382	int len = self->length;
				3383	PyObject *str;
				3384
				3385	for (i = j = 0; i < len; ) {
				3386	if (self->str[i] == ch) {
				3387	if (maxcount-- <= 0)
				3388	break;
				3389	SPLIT_APPEND(self->str, j, i);
				3390	i = j = i + 1;
				3391	} else
				3392	i++;
				3393	}
				3394	if (j <= len) {
				3395	SPLIT_APPEND(self->str, j, len);
				3396	}
				3397	return list;
				3398
				3399	onError:
				3400	Py_DECREF(list);
				3401	return NULL;
				3402	}
				3403
				3404	static
				3405	PyObject *split_substring(PyUnicodeObject *self,
				3406	PyObject *list,
				3407	PyUnicodeObject *substring,
				3408	int maxcount)
				3409	{
				3410	register int i;
				3411	register int j;
				3412	int len = self->length;
				3413	int sublen = substring->length;
				3414	PyObject *str;
				3415
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3416	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3417	if (Py_UNICODE_MATCH(self, i, substring)) {
				3418	if (maxcount-- <= 0)
				3419	break;
				3420	SPLIT_APPEND(self->str, j, i);
				3421	i = j = i + sublen;
				3422	} else
				3423	i++;
				3424	}
				3425	if (j <= len) {
				3426	SPLIT_APPEND(self->str, j, len);
				3427	}
				3428	return list;
				3429
				3430	onError:
				3431	Py_DECREF(list);
				3432	return NULL;
				3433	}
				3434
				3435	#undef SPLIT_APPEND
				3436
				3437	static
				3438	PyObject *split(PyUnicodeObject *self,
				3439	PyUnicodeObject *substring,
				3440	int maxcount)
				3441	{
				3442	PyObject *list;
				3443
				3444	if (maxcount < 0)
				3445	maxcount = INT_MAX;
				3446
				3447	list = PyList_New(0);
				3448	if (!list)
				3449	return NULL;
				3450
				3451	if (substring == NULL)
				3452	return split_whitespace(self,list,maxcount);
				3453
				3454	else if (substring->length == 1)
				3455	return split_char(self,list,substring->str[0],maxcount);
				3456
				3457	else if (substring->length == 0) {
				3458	Py_DECREF(list);
				3459	PyErr_SetString(PyExc_ValueError, "empty separator");
				3460	return NULL;
				3461	}
				3462	else
				3463	return split_substring(self,list,substring,maxcount);
				3464	}
				3465
				3466	static
				3467	PyObject *strip(PyUnicodeObject *self,
				3468	int left,
				3469	int right)
				3470	{
				3471	Py_UNICODE *p = self->str;
				3472	int start = 0;
				3473	int end = self->length;
				3474
				3475	if (left)
				3476	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3477	start++;
				3478
				3479	if (right)
				3480	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3481	end--;
				3482
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3483	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3484	/* couldn't strip anything off, return original string */
				3485	Py_INCREF(self);
				3486	return (PyObject*) self;
				3487	}
				3488
				3489	return (PyObject*) PyUnicode_FromUnicode(
				3490	self->str + start,
				3491	end - start
				3492	);
				3493	}
				3494
				3495	static
				3496	PyObject *replace(PyUnicodeObject *self,
				3497	PyUnicodeObject *str1,
				3498	PyUnicodeObject *str2,
				3499	int maxcount)
				3500	{
				3501	PyUnicodeObject *u;
				3502
				3503	if (maxcount < 0)
				3504	maxcount = INT_MAX;
				3505
				3506	if (str1->length == 1 && str2->length == 1) {
				3507	int i;
				3508
				3509	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3510	if (!findchar(self->str, self->length, str1->str[0]) &&
				3511	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3512	/* nothing to replace, return original string */
				3513	Py_INCREF(self);
				3514	u = self;
				3515	} else {
				3516	Py_UNICODE u1 = str1->str[0];
				3517	Py_UNICODE u2 = str2->str[0];
				3518
				3519	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3520	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3521	self->length
				3522	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3523	if (u != NULL) {
				3524	Py_UNICODE_COPY(u->str, self->str,
				3525	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3526	for (i = 0; i < u->length; i++)
				3527	if (u->str[i] == u1) {
				3528	if (--maxcount < 0)
				3529	break;
				3530	u->str[i] = u2;
				3531	}
				3532	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3533	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3534
				3535	} else {
				3536	int n, i;
				3537	Py_UNICODE *p;
				3538
				3539	/* replace strings */
				3540	n = count(self, 0, self->length, str1);
				3541	if (n > maxcount)
				3542	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3543	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3544	/* nothing to replace, return original string */
				3545	Py_INCREF(self);
				3546	u = self;
				3547	} else {
				3548	u = _PyUnicode_New(
				3549	self->length + n * (str2->length - str1->length));
				3550	if (u) {
				3551	i = 0;
				3552	p = u->str;
				3553	while (i <= self->length - str1->length)
				3554	if (Py_UNICODE_MATCH(self, i, str1)) {
				3555	/* replace string segment */
				3556	Py_UNICODE_COPY(p, str2->str, str2->length);
				3557	p += str2->length;
				3558	i += str1->length;
				3559	if (--n <= 0) {
				3560	/* copy remaining part */
				3561	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3562	break;
				3563	}
				3564	} else
				3565	*p++ = self->str[i++];
				3566	}
				3567	}
				3568	}
				3569
				3570	return (PyObject *) u;
				3571	}
				3572
				3573	/* --- Unicode Object Methods --------------------------------------------- */
				3574
				3575	static char title__doc__[] =
				3576	"S.title() -> unicode\n\
				3577	\n\
				3578	Return a titlecased version of S, i.e. words start with title case\n\
				3579	characters, all remaining cased characters have lower case.";
				3580
				3581	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3582	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3583	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3584	return fixup(self, fixtitle);
				3585	}
				3586
				3587	static char capitalize__doc__[] =
				3588	"S.capitalize() -> unicode\n\
				3589	\n\
				3590	Return a capitalized version of S, i.e. make the first character\n\
				3591	have upper case.";
				3592
				3593	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3594	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3595	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3596	return fixup(self, fixcapitalize);
				3597	}
				3598
				3599	#if 0
				3600	static char capwords__doc__[] =
				3601	"S.capwords() -> unicode\n\
				3602	\n\
				3603	Apply .capitalize() to all words in S and return the result with\n\
				3604	normalized whitespace (all whitespace strings are replaced by ' ').";
				3605
				3606	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3607	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3608	{
				3609	PyObject *list;
				3610	PyObject *item;
				3611	int i;
				3612
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3613	/* Split into words */
				3614	list = split(self, NULL, -1);
				3615	if (!list)
				3616	return NULL;
				3617
				3618	/* Capitalize each word */
				3619	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3620	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3621	fixcapitalize);
				3622	if (item == NULL)
				3623	goto onError;
				3624	Py_DECREF(PyList_GET_ITEM(list, i));
				3625	PyList_SET_ITEM(list, i, item);
				3626	}
				3627
				3628	/* Join the words to form a new string */
				3629	item = PyUnicode_Join(NULL, list);
				3630
				3631	onError:
				3632	Py_DECREF(list);
				3633	return (PyObject *)item;
				3634	}
				3635	#endif
				3636
				3637	static char center__doc__[] =
				3638	"S.center(width) -> unicode\n\
				3639	\n\
				3640	Return S centered in a Unicode string of length width. Padding is done\n\
				3641	using spaces.";
				3642
				3643	static PyObject *
				3644	unicode_center(PyUnicodeObject *self, PyObject *args)
				3645	{
				3646	int marg, left;
				3647	int width;
				3648
				3649	if (!PyArg_ParseTuple(args, "i:center", &width))
				3650	return NULL;
				3651
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3652	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3653	Py_INCREF(self);
				3654	return (PyObject*) self;
				3655	}
				3656
				3657	marg = width - self->length;
				3658	left = marg / 2 + (marg & width & 1);
				3659
				3660	return (PyObject*) pad(self, left, marg - left, ' ');
				3661	}
				3662
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3663	#if 0
				3664
				3665	/* This code should go into some future Unicode collation support
				3666	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3667	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3668
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3669	/* speedy UTF-16 code point order comparison */
				3670	/* gleaned from: */
				3671	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3672
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3673	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3674	{
				3675	0, 0, 0, 0, 0, 0, 0, 0,
				3676	0, 0, 0, 0, 0, 0, 0, 0,
				3677	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3678	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3679	};
				3680
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3681	static int
				3682	unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
				3683	{
				3684	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3685
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3686	Py_UNICODE *s1 = str1->str;
				3687	Py_UNICODE *s2 = str2->str;
				3688
				3689	len1 = str1->length;
				3690	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3691
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3692	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3693	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3694
				3695	c1 = *s1++;
				3696	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3697
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3698	if (c1 > (1<<11) * 26)
				3699	c1 += utf16Fixup[c1>>11];
				3700	if (c2 > (1<<11) * 26)
				3701	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3702	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3703
				3704	if (c1 != c2)
				3705	return (c1 < c2) ? -1 : 1;
				3706
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3707	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3708	}
				3709
				3710	return (len1 < len2) ? -1 : (len1 != len2);
				3711	}
				3712
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3713	#else
				3714
				3715	static int
				3716	unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
				3717	{
				3718	register int len1, len2;
				3719
				3720	Py_UNICODE *s1 = str1->str;
				3721	Py_UNICODE *s2 = str2->str;
				3722
				3723	len1 = str1->length;
				3724	len2 = str2->length;
				3725
				3726	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3727	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3728
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3729	c1 = *s1++;
				3730	c2 = *s2++;
				3731
				3732	if (c1 != c2)
				3733	return (c1 < c2) ? -1 : 1;
				3734
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3735	len1--; len2--;
				3736	}
				3737
				3738	return (len1 < len2) ? -1 : (len1 != len2);
				3739	}
				3740
				3741	#endif
				3742
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3743	int PyUnicode_Compare(PyObject *left,
				3744	PyObject *right)
				3745	{
				3746	PyUnicodeObject *u = NULL, *v = NULL;
				3747	int result;
				3748
				3749	/* Coerce the two arguments */
				3750	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3751	if (u == NULL)
				3752	goto onError;
				3753	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3754	if (v == NULL)
				3755	goto onError;
				3756
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3757	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3758	if (v == u) {
				3759	Py_DECREF(u);
				3760	Py_DECREF(v);
				3761	return 0;
				3762	}
				3763
				3764	result = unicode_compare(u, v);
				3765
				3766	Py_DECREF(u);
				3767	Py_DECREF(v);
				3768	return result;
				3769
				3770	onError:
				3771	Py_XDECREF(u);
				3772	Py_XDECREF(v);
				3773	return -1;
				3774	}
				3775
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3776	int PyUnicode_Contains(PyObject *container,
				3777	PyObject *element)
				3778	{
				3779	PyUnicodeObject *u = NULL, *v = NULL;
				3780	int result;
				3781	register const Py_UNICODE *p, *e;
				3782	register Py_UNICODE ch;
				3783
				3784	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3785	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3786	if (v == NULL) {
				3787	PyErr_SetString(PyExc_TypeError,
				3788	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3789	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3790	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3791	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3792	if (u == NULL) {
				3793	Py_DECREF(v);
				3794	goto onError;
				3795	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3796
				3797	/* Check v in u */
				3798	if (PyUnicode_GET_SIZE(v) != 1) {
				3799	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3800	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3801	goto onError;
				3802	}
				3803	ch = *PyUnicode_AS_UNICODE(v);
				3804	p = PyUnicode_AS_UNICODE(u);
				3805	e = p + PyUnicode_GET_SIZE(u);
				3806	result = 0;
				3807	while (p < e) {
				3808	if (*p++ == ch) {
				3809	result = 1;
				3810	break;
				3811	}
				3812	}
				3813
				3814	Py_DECREF(u);
				3815	Py_DECREF(v);
				3816	return result;
				3817
				3818	onError:
				3819	Py_XDECREF(u);
				3820	Py_XDECREF(v);
				3821	return -1;
				3822	}
				3823
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3824	/* Concat to string or Unicode object giving a new Unicode object. */
				3825
				3826	PyObject *PyUnicode_Concat(PyObject *left,
				3827	PyObject *right)
				3828	{
				3829	PyUnicodeObject *u = NULL, *v = NULL, *w;
				3830
				3831	/* Coerce the two arguments */
				3832	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3833	if (u == NULL)
				3834	goto onError;
				3835	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3836	if (v == NULL)
				3837	goto onError;
				3838
				3839	/* Shortcuts */
				3840	if (v == unicode_empty) {
				3841	Py_DECREF(v);
				3842	return (PyObject *)u;
				3843	}
				3844	if (u == unicode_empty) {
				3845	Py_DECREF(u);
				3846	return (PyObject *)v;
				3847	}
				3848
				3849	/* Concat the two Unicode strings */
				3850	w = _PyUnicode_New(u->length + v->length);
				3851	if (w == NULL)
				3852	goto onError;
				3853	Py_UNICODE_COPY(w->str, u->str, u->length);
				3854	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3855
				3856	Py_DECREF(u);
				3857	Py_DECREF(v);
				3858	return (PyObject *)w;
				3859
				3860	onError:
				3861	Py_XDECREF(u);
				3862	Py_XDECREF(v);
				3863	return NULL;
				3864	}
				3865
				3866	static char count__doc__[] =
				3867	"S.count(sub[, start[, end]]) -> int\n\
				3868	\n\
				3869	Return the number of occurrences of substring sub in Unicode string\n\
				3870	S[start:end]. Optional arguments start and end are\n\
				3871	interpreted as in slice notation.";
				3872
				3873	static PyObject *
				3874	unicode_count(PyUnicodeObject *self, PyObject *args)
				3875	{
				3876	PyUnicodeObject *substring;
				3877	int start = 0;
				3878	int end = INT_MAX;
				3879	PyObject *result;
				3880
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3881	if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
				3882	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3883	return NULL;
				3884
				3885	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3886	(PyObject *)substring);
				3887	if (substring == NULL)
				3888	return NULL;
				3889
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3890	if (start < 0)
				3891	start += self->length;
				3892	if (start < 0)
				3893	start = 0;
				3894	if (end > self->length)
				3895	end = self->length;
				3896	if (end < 0)
				3897	end += self->length;
				3898	if (end < 0)
				3899	end = 0;
				3900
				3901	result = PyInt_FromLong((long) count(self, start, end, substring));
				3902
				3903	Py_DECREF(substring);
				3904	return result;
				3905	}
				3906
				3907	static char encode__doc__[] =
				3908	"S.encode([encoding[,errors]]) -> string\n\
				3909	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3910	Return an encoded string version of S. Default encoding is the current\n\
				3911	default string encoding. errors may be given to set a different error\n\
				3912	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3913	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3914
				3915	static PyObject *
				3916	unicode_encode(PyUnicodeObject *self, PyObject *args)
				3917	{
				3918	char *encoding = NULL;
				3919	char *errors = NULL;
				3920	if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
				3921	return NULL;
				3922	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3923	}
				3924
				3925	static char expandtabs__doc__[] =
				3926	"S.expandtabs([tabsize]) -> unicode\n\
				3927	\n\
				3928	Return a copy of S where all tab characters are expanded using spaces.\n\
				3929	If tabsize is not given, a tab size of 8 characters is assumed.";
				3930
				3931	static PyObject*
				3932	unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
				3933	{
				3934	Py_UNICODE *e;
				3935	Py_UNICODE *p;
				3936	Py_UNICODE *q;
				3937	int i, j;
				3938	PyUnicodeObject *u;
				3939	int tabsize = 8;
				3940
				3941	if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
				3942	return NULL;
				3943
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3944	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3945	i = j = 0;
				3946	e = self->str + self->length;
				3947	for (p = self->str; p < e; p++)
				3948	if (*p == '\t') {
				3949	if (tabsize > 0)
				3950	j += tabsize - (j % tabsize);
				3951	}
				3952	else {
				3953	j++;
				3954	if (*p == '\n' || *p == '\r') {
				3955	i += j;
				3956	j = 0;
				3957	}
				3958	}
				3959
				3960	/* Second pass: create output string and fill it */
				3961	u = _PyUnicode_New(i + j);
				3962	if (!u)
				3963	return NULL;
				3964
				3965	j = 0;
				3966	q = u->str;
				3967
				3968	for (p = self->str; p < e; p++)
				3969	if (*p == '\t') {
				3970	if (tabsize > 0) {
				3971	i = tabsize - (j % tabsize);
				3972	j += i;
				3973	while (i--)
				3974	*q++ = ' ';
				3975	}
				3976	}
				3977	else {
				3978	j++;
				3979	*q++ = *p;
				3980	if (*p == '\n' || *p == '\r')
				3981	j = 0;
				3982	}
				3983
				3984	return (PyObject*) u;
				3985	}
				3986
				3987	static char find__doc__[] =
				3988	"S.find(sub [,start [,end]]) -> int\n\
				3989	\n\
				3990	Return the lowest index in S where substring sub is found,\n\
				3991	such that sub is contained within s[start,end]. Optional\n\
				3992	arguments start and end are interpreted as in slice notation.\n\
				3993	\n\
				3994	Return -1 on failure.";
				3995
				3996	static PyObject *
				3997	unicode_find(PyUnicodeObject *self, PyObject *args)
				3998	{
				3999	PyUnicodeObject *substring;
				4000	int start = 0;
				4001	int end = INT_MAX;
				4002	PyObject *result;
				4003
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4004	if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
				4005	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4006	return NULL;
				4007	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4008	(PyObject *)substring);
				4009	if (substring == NULL)
				4010	return NULL;
				4011
				4012	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				4013
				4014	Py_DECREF(substring);
				4015	return result;
				4016	}
				4017
				4018	static PyObject *
				4019	unicode_getitem(PyUnicodeObject *self, int index)
				4020	{
				4021	if (index < 0 || index >= self->length) {
				4022	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4023	return NULL;
				4024	}
				4025
				4026	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4027	}
				4028
				4029	static long
				4030	unicode_hash(PyUnicodeObject *self)
				4031	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4032	/* Since Unicode objects compare equal to their ASCII string
				4033	counterparts, they should use the individual character values
				4034	as basis for their hash value. This is needed to assure that
				4035	strings and Unicode objects behave in the same way as
				4036	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4037
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4038	register int len;
				4039	register Py_UNICODE *p;
				4040	register long x;
				4041
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4042	if (self->hash != -1)
				4043	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4044	len = PyUnicode_GET_SIZE(self);
				4045	p = PyUnicode_AS_UNICODE(self);
				4046	x = *p << 7;
				4047	while (--len >= 0)
				4048	x = (1000003*x) ^ *p++;
				4049	x ^= PyUnicode_GET_SIZE(self);
				4050	if (x == -1)
				4051	x = -2;
				4052	self->hash = x;
				4053	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4054	}
				4055
				4056	static char index__doc__[] =
				4057	"S.index(sub [,start [,end]]) -> int\n\
				4058	\n\
				4059	Like S.find() but raise ValueError when the substring is not found.";
				4060
				4061	static PyObject *
				4062	unicode_index(PyUnicodeObject *self, PyObject *args)
				4063	{
				4064	int result;
				4065	PyUnicodeObject *substring;
				4066	int start = 0;
				4067	int end = INT_MAX;
				4068
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4069	if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
				4070	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4071	return NULL;
				4072
				4073	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4074	(PyObject *)substring);
				4075	if (substring == NULL)
				4076	return NULL;
				4077
				4078	result = findstring(self, substring, start, end, 1);
				4079
				4080	Py_DECREF(substring);
				4081	if (result < 0) {
				4082	PyErr_SetString(PyExc_ValueError, "substring not found");
				4083	return NULL;
				4084	}
				4085	return PyInt_FromLong(result);
				4086	}
				4087
				4088	static char islower__doc__[] =
				4089	"S.islower() -> int\n\
				4090	\n\
				4091	Return 1 if all cased characters in S are lowercase and there is\n\
				4092	at least one cased character in S, 0 otherwise.";
				4093
				4094	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4095	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4096	{
				4097	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4098	register const Py_UNICODE *e;
				4099	int cased;
				4100
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4101	/* Shortcut for single character strings */
				4102	if (PyUnicode_GET_SIZE(self) == 1)
				4103	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				4104
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4105	/* Special case for empty strings */
				4106	if (PyString_GET_SIZE(self) == 0)
				4107	return PyInt_FromLong(0);
				4108
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4109	e = p + PyUnicode_GET_SIZE(self);
				4110	cased = 0;
				4111	for (; p < e; p++) {
				4112	register const Py_UNICODE ch = *p;
				4113
				4114	if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
				4115	return PyInt_FromLong(0);
				4116	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4117	cased = 1;
				4118	}
				4119	return PyInt_FromLong(cased);
				4120	}
				4121
				4122	static char isupper__doc__[] =
				4123	"S.isupper() -> int\n\
				4124	\n\
				4125	Return 1 if all cased characters in S are uppercase and there is\n\
				4126	at least one cased character in S, 0 otherwise.";
				4127
				4128	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4129	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4130	{
				4131	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4132	register const Py_UNICODE *e;
				4133	int cased;
				4134
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4135	/* Shortcut for single character strings */
				4136	if (PyUnicode_GET_SIZE(self) == 1)
				4137	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				4138
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4139	/* Special case for empty strings */
				4140	if (PyString_GET_SIZE(self) == 0)
				4141	return PyInt_FromLong(0);
				4142
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4143	e = p + PyUnicode_GET_SIZE(self);
				4144	cased = 0;
				4145	for (; p < e; p++) {
				4146	register const Py_UNICODE ch = *p;
				4147
				4148	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4149	return PyInt_FromLong(0);
				4150	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4151	cased = 1;
				4152	}
				4153	return PyInt_FromLong(cased);
				4154	}
				4155
				4156	static char istitle__doc__[] =
				4157	"S.istitle() -> int\n\
				4158	\n\
				4159	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				4160	may only follow uncased characters and lowercase characters only cased\n\
				4161	ones. Return 0 otherwise.";
				4162
				4163	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4164	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4165	{
				4166	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4167	register const Py_UNICODE *e;
				4168	int cased, previous_is_cased;
				4169
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4170	/* Shortcut for single character strings */
				4171	if (PyUnicode_GET_SIZE(self) == 1)
				4172	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4173	(Py_UNICODE_ISUPPER(*p) != 0));
				4174
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4175	/* Special case for empty strings */
				4176	if (PyString_GET_SIZE(self) == 0)
				4177	return PyInt_FromLong(0);
				4178
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4179	e = p + PyUnicode_GET_SIZE(self);
				4180	cased = 0;
				4181	previous_is_cased = 0;
				4182	for (; p < e; p++) {
				4183	register const Py_UNICODE ch = *p;
				4184
				4185	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4186	if (previous_is_cased)
				4187	return PyInt_FromLong(0);
				4188	previous_is_cased = 1;
				4189	cased = 1;
				4190	}
				4191	else if (Py_UNICODE_ISLOWER(ch)) {
				4192	if (!previous_is_cased)
				4193	return PyInt_FromLong(0);
				4194	previous_is_cased = 1;
				4195	cased = 1;
				4196	}
				4197	else
				4198	previous_is_cased = 0;
				4199	}
				4200	return PyInt_FromLong(cased);
				4201	}
				4202
				4203	static char isspace__doc__[] =
				4204	"S.isspace() -> int\n\
				4205	\n\
				4206	Return 1 if there are only whitespace characters in S,\n\
				4207	0 otherwise.";
				4208
				4209	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4210	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4211	{
				4212	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4213	register const Py_UNICODE *e;
				4214
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4215	/* Shortcut for single character strings */
				4216	if (PyUnicode_GET_SIZE(self) == 1 &&
				4217	Py_UNICODE_ISSPACE(*p))
				4218	return PyInt_FromLong(1);
				4219
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4220	/* Special case for empty strings */
				4221	if (PyString_GET_SIZE(self) == 0)
				4222	return PyInt_FromLong(0);
				4223
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4224	e = p + PyUnicode_GET_SIZE(self);
				4225	for (; p < e; p++) {
				4226	if (!Py_UNICODE_ISSPACE(*p))
				4227	return PyInt_FromLong(0);
				4228	}
				4229	return PyInt_FromLong(1);
				4230	}
				4231
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4232	static char isalpha__doc__[] =
				4233	"S.isalpha() -> int\n\
				4234	\n\
				4235	Return 1 if all characters in S are alphabetic\n\
				4236	and there is at least one character in S, 0 otherwise.";
				4237
				4238	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4239	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4240	{
				4241	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4242	register const Py_UNICODE *e;
				4243
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4244	/* Shortcut for single character strings */
				4245	if (PyUnicode_GET_SIZE(self) == 1 &&
				4246	Py_UNICODE_ISALPHA(*p))
				4247	return PyInt_FromLong(1);
				4248
				4249	/* Special case for empty strings */
				4250	if (PyString_GET_SIZE(self) == 0)
				4251	return PyInt_FromLong(0);
				4252
				4253	e = p + PyUnicode_GET_SIZE(self);
				4254	for (; p < e; p++) {
				4255	if (!Py_UNICODE_ISALPHA(*p))
				4256	return PyInt_FromLong(0);
				4257	}
				4258	return PyInt_FromLong(1);
				4259	}
				4260
				4261	static char isalnum__doc__[] =
				4262	"S.isalnum() -> int\n\
				4263	\n\
				4264	Return 1 if all characters in S are alphanumeric\n\
				4265	and there is at least one character in S, 0 otherwise.";
				4266
				4267	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4268	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4269	{
				4270	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4271	register const Py_UNICODE *e;
				4272
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4273	/* Shortcut for single character strings */
				4274	if (PyUnicode_GET_SIZE(self) == 1 &&
				4275	Py_UNICODE_ISALNUM(*p))
				4276	return PyInt_FromLong(1);
				4277
				4278	/* Special case for empty strings */
				4279	if (PyString_GET_SIZE(self) == 0)
				4280	return PyInt_FromLong(0);
				4281
				4282	e = p + PyUnicode_GET_SIZE(self);
				4283	for (; p < e; p++) {
				4284	if (!Py_UNICODE_ISALNUM(*p))
				4285	return PyInt_FromLong(0);
				4286	}
				4287	return PyInt_FromLong(1);
				4288	}
				4289
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4290	static char isdecimal__doc__[] =
				4291	"S.isdecimal() -> int\n\
				4292	\n\
				4293	Return 1 if there are only decimal characters in S,\n\
				4294	0 otherwise.";
				4295
				4296	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4297	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4298	{
				4299	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4300	register const Py_UNICODE *e;
				4301
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4302	/* Shortcut for single character strings */
				4303	if (PyUnicode_GET_SIZE(self) == 1 &&
				4304	Py_UNICODE_ISDECIMAL(*p))
				4305	return PyInt_FromLong(1);
				4306
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4307	/* Special case for empty strings */
				4308	if (PyString_GET_SIZE(self) == 0)
				4309	return PyInt_FromLong(0);
				4310
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4311	e = p + PyUnicode_GET_SIZE(self);
				4312	for (; p < e; p++) {
				4313	if (!Py_UNICODE_ISDECIMAL(*p))
				4314	return PyInt_FromLong(0);
				4315	}
				4316	return PyInt_FromLong(1);
				4317	}
				4318
				4319	static char isdigit__doc__[] =
				4320	"S.isdigit() -> int\n\
				4321	\n\
				4322	Return 1 if there are only digit characters in S,\n\
				4323	0 otherwise.";
				4324
				4325	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4326	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4327	{
				4328	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4329	register const Py_UNICODE *e;
				4330
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4331	/* Shortcut for single character strings */
				4332	if (PyUnicode_GET_SIZE(self) == 1 &&
				4333	Py_UNICODE_ISDIGIT(*p))
				4334	return PyInt_FromLong(1);
				4335
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4336	/* Special case for empty strings */
				4337	if (PyString_GET_SIZE(self) == 0)
				4338	return PyInt_FromLong(0);
				4339
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4340	e = p + PyUnicode_GET_SIZE(self);
				4341	for (; p < e; p++) {
				4342	if (!Py_UNICODE_ISDIGIT(*p))
				4343	return PyInt_FromLong(0);
				4344	}
				4345	return PyInt_FromLong(1);
				4346	}
				4347
				4348	static char isnumeric__doc__[] =
				4349	"S.isnumeric() -> int\n\
				4350	\n\
				4351	Return 1 if there are only numeric characters in S,\n\
				4352	0 otherwise.";
				4353
				4354	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4355	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4356	{
				4357	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4358	register const Py_UNICODE *e;
				4359
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4360	/* Shortcut for single character strings */
				4361	if (PyUnicode_GET_SIZE(self) == 1 &&
				4362	Py_UNICODE_ISNUMERIC(*p))
				4363	return PyInt_FromLong(1);
				4364
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4365	/* Special case for empty strings */
				4366	if (PyString_GET_SIZE(self) == 0)
				4367	return PyInt_FromLong(0);
				4368
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4369	e = p + PyUnicode_GET_SIZE(self);
				4370	for (; p < e; p++) {
				4371	if (!Py_UNICODE_ISNUMERIC(*p))
				4372	return PyInt_FromLong(0);
				4373	}
				4374	return PyInt_FromLong(1);
				4375	}
				4376
				4377	static char join__doc__[] =
				4378	"S.join(sequence) -> unicode\n\
				4379	\n\
				4380	Return a string which is the concatenation of the strings in the\n\
				4381	sequence. The separator between elements is S.";
				4382
				4383	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4384	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4385	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4386	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4387	}
				4388
				4389	static int
				4390	unicode_length(PyUnicodeObject *self)
				4391	{
				4392	return self->length;
				4393	}
				4394
				4395	static char ljust__doc__[] =
				4396	"S.ljust(width) -> unicode\n\
				4397	\n\
				4398	Return S left justified in a Unicode string of length width. Padding is\n\
				4399	done using spaces.";
				4400
				4401	static PyObject *
				4402	unicode_ljust(PyUnicodeObject self, PyObject args)
				4403	{
				4404	int width;
				4405	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4406	return NULL;
				4407
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4408	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4409	Py_INCREF(self);
				4410	return (PyObject*) self;
				4411	}
				4412
				4413	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4414	}
				4415
				4416	static char lower__doc__[] =
				4417	"S.lower() -> unicode\n\
				4418	\n\
				4419	Return a copy of the string S converted to lowercase.";
				4420
				4421	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4422	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4423	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4424	return fixup(self, fixlower);
				4425	}
				4426
				4427	static char lstrip__doc__[] =
				4428	"S.lstrip() -> unicode\n\
				4429	\n\
				4430	Return a copy of the string S with leading whitespace removed.";
				4431
				4432	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4433	unicode_lstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4434	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4435	return strip(self, 1, 0);
				4436	}
				4437
				4438	static PyObject*
				4439	unicode_repeat(PyUnicodeObject *str, int len)
				4440	{
				4441	PyUnicodeObject *u;
				4442	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4443	int nchars;
				4444	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4445
				4446	if (len < 0)
				4447	len = 0;
				4448
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4449	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4450	/* no repeat, return original string */
				4451	Py_INCREF(str);
				4452	return (PyObject*) str;
				4453	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4454
				4455	/* ensure # of chars needed doesn't overflow int and # of bytes
				4456	* needed doesn't overflow size_t
				4457	*/
				4458	nchars = len * str->length;
				4459	if (len && nchars / len != str->length) {
				4460	PyErr_SetString(PyExc_OverflowError,
				4461	"repeated string is too long");
				4462	return NULL;
				4463	}
				4464	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4465	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4466	PyErr_SetString(PyExc_OverflowError,
				4467	"repeated string is too long");
				4468	return NULL;
				4469	}
				4470	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4471	if (!u)
				4472	return NULL;
				4473
				4474	p = u->str;
				4475
				4476	while (len-- > 0) {
				4477	Py_UNICODE_COPY(p, str->str, str->length);
				4478	p += str->length;
				4479	}
				4480
				4481	return (PyObject*) u;
				4482	}
				4483
				4484	PyObject PyUnicode_Replace(PyObject obj,
				4485	PyObject *subobj,
				4486	PyObject *replobj,
				4487	int maxcount)
				4488	{
				4489	PyObject *self;
				4490	PyObject *str1;
				4491	PyObject *str2;
				4492	PyObject *result;
				4493
				4494	self = PyUnicode_FromObject(obj);
				4495	if (self == NULL)
				4496	return NULL;
				4497	str1 = PyUnicode_FromObject(subobj);
				4498	if (str1 == NULL) {
				4499	Py_DECREF(self);
				4500	return NULL;
				4501	}
				4502	str2 = PyUnicode_FromObject(replobj);
				4503	if (str2 == NULL) {
				4504	Py_DECREF(self);
				4505	Py_DECREF(str1);
				4506	return NULL;
				4507	}
				4508	result = replace((PyUnicodeObject *)self,
				4509	(PyUnicodeObject *)str1,
				4510	(PyUnicodeObject *)str2,
				4511	maxcount);
				4512	Py_DECREF(self);
				4513	Py_DECREF(str1);
				4514	Py_DECREF(str2);
				4515	return result;
				4516	}
				4517
				4518	static char replace__doc__[] =
				4519	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4520	\n\
				4521	Return a copy of S with all occurrences of substring\n\
				4522	old replaced by new. If the optional argument maxsplit is\n\
				4523	given, only the first maxsplit occurrences are replaced.";
				4524
				4525	static PyObject*
				4526	unicode_replace(PyUnicodeObject self, PyObject args)
				4527	{
				4528	PyUnicodeObject *str1;
				4529	PyUnicodeObject *str2;
				4530	int maxcount = -1;
				4531	PyObject *result;
				4532
				4533	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4534	return NULL;
				4535	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4536	if (str1 == NULL)
				4537	return NULL;
				4538	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4539	if (str2 == NULL)
				4540	return NULL;
				4541
				4542	result = replace(self, str1, str2, maxcount);
				4543
				4544	Py_DECREF(str1);
				4545	Py_DECREF(str2);
				4546	return result;
				4547	}
				4548
				4549	static
				4550	PyObject unicode_repr(PyObject unicode)
				4551	{
				4552	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4553	PyUnicode_GET_SIZE(unicode),
				4554	1);
				4555	}
				4556
				4557	static char rfind__doc__[] =
				4558	"S.rfind(sub [,start [,end]]) -> int\n\
				4559	\n\
				4560	Return the highest index in S where substring sub is found,\n\
				4561	such that sub is contained within s[start,end]. Optional\n\
				4562	arguments start and end are interpreted as in slice notation.\n\
				4563	\n\
				4564	Return -1 on failure.";
				4565
				4566	static PyObject *
				4567	unicode_rfind(PyUnicodeObject self, PyObject args)
				4568	{
				4569	PyUnicodeObject *substring;
				4570	int start = 0;
				4571	int end = INT_MAX;
				4572	PyObject *result;
				4573
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4574	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4575	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4576	return NULL;
				4577	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4578	(PyObject *)substring);
				4579	if (substring == NULL)
				4580	return NULL;
				4581
				4582	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4583
				4584	Py_DECREF(substring);
				4585	return result;
				4586	}
				4587
				4588	static char rindex__doc__[] =
				4589	"S.rindex(sub [,start [,end]]) -> int\n\
				4590	\n\
				4591	Like S.rfind() but raise ValueError when the substring is not found.";
				4592
				4593	static PyObject *
				4594	unicode_rindex(PyUnicodeObject self, PyObject args)
				4595	{
				4596	int result;
				4597	PyUnicodeObject *substring;
				4598	int start = 0;
				4599	int end = INT_MAX;
				4600
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4601	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4602	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4603	return NULL;
				4604	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4605	(PyObject *)substring);
				4606	if (substring == NULL)
				4607	return NULL;
				4608
				4609	result = findstring(self, substring, start, end, -1);
				4610
				4611	Py_DECREF(substring);
				4612	if (result < 0) {
				4613	PyErr_SetString(PyExc_ValueError, "substring not found");
				4614	return NULL;
				4615	}
				4616	return PyInt_FromLong(result);
				4617	}
				4618
				4619	static char rjust__doc__[] =
				4620	"S.rjust(width) -> unicode\n\
				4621	\n\
				4622	Return S right justified in a Unicode string of length width. Padding is\n\
				4623	done using spaces.";
				4624
				4625	static PyObject *
				4626	unicode_rjust(PyUnicodeObject self, PyObject args)
				4627	{
				4628	int width;
				4629	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4630	return NULL;
				4631
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4632	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4633	Py_INCREF(self);
				4634	return (PyObject*) self;
				4635	}
				4636
				4637	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4638	}
				4639
				4640	static char rstrip__doc__[] =
				4641	"S.rstrip() -> unicode\n\
				4642	\n\
				4643	Return a copy of the string S with trailing whitespace removed.";
				4644
				4645	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4646	unicode_rstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4647	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4648	return strip(self, 0, 1);
				4649	}
				4650
				4651	static PyObject*
				4652	unicode_slice(PyUnicodeObject *self, int start, int end)
				4653	{
				4654	/* standard clamping */
				4655	if (start < 0)
				4656	start = 0;
				4657	if (end < 0)
				4658	end = 0;
				4659	if (end > self->length)
				4660	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4661	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4662	/* full slice, return original string */
				4663	Py_INCREF(self);
				4664	return (PyObject*) self;
				4665	}
				4666	if (start > end)
				4667	start = end;
				4668	/* copy slice */
				4669	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4670	end - start);
				4671	}
				4672
				4673	PyObject PyUnicode_Split(PyObject s,
				4674	PyObject *sep,
				4675	int maxsplit)
				4676	{
				4677	PyObject *result;
				4678
				4679	s = PyUnicode_FromObject(s);
				4680	if (s == NULL)
				4681	return NULL;
				4682	if (sep != NULL) {
				4683	sep = PyUnicode_FromObject(sep);
				4684	if (sep == NULL) {
				4685	Py_DECREF(s);
				4686	return NULL;
				4687	}
				4688	}
				4689
				4690	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4691
				4692	Py_DECREF(s);
				4693	Py_XDECREF(sep);
				4694	return result;
				4695	}
				4696
				4697	static char split__doc__[] =
				4698	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4699	\n\
				4700	Return a list of the words in S, using sep as the\n\
				4701	delimiter string. If maxsplit is given, at most maxsplit\n\
				4702	splits are done. If sep is not specified, any whitespace string\n\
				4703	is a separator.";
				4704
				4705	static PyObject*
				4706	unicode_split(PyUnicodeObject self, PyObject args)
				4707	{
				4708	PyObject *substring = Py_None;
				4709	int maxcount = -1;
				4710
				4711	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4712	return NULL;
				4713
				4714	if (substring == Py_None)
				4715	return split(self, NULL, maxcount);
				4716	else if (PyUnicode_Check(substring))
				4717	return split(self, (PyUnicodeObject *)substring, maxcount);
				4718	else
				4719	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4720	}
				4721
				4722	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4723	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4724	\n\
				4725	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4726	Line breaks are not included in the resulting list unless keepends\n\
				4727	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4728
				4729	static PyObject*
				4730	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4731	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4732	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4734	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4735	return NULL;
				4736
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4737	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4738	}
				4739
				4740	static
				4741	PyObject unicode_str(PyUnicodeObject self)
				4742	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4743	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4744	}
				4745
				4746	static char strip__doc__[] =
				4747	"S.strip() -> unicode\n\
				4748	\n\
				4749	Return a copy of S with leading and trailing whitespace removed.";
				4750
				4751	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4752	unicode_strip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4753	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4754	return strip(self, 1, 1);
				4755	}
				4756
				4757	static char swapcase__doc__[] =
				4758	"S.swapcase() -> unicode\n\
				4759	\n\
				4760	Return a copy of S with uppercase characters converted to lowercase\n\
				4761	and vice versa.";
				4762
				4763	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4764	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4765	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4766	return fixup(self, fixswapcase);
				4767	}
				4768
				4769	static char translate__doc__[] =
				4770	"S.translate(table) -> unicode\n\
				4771	\n\
				4772	Return a copy of the string S, where all characters have been mapped\n\
				4773	through the given translation table, which must be a mapping of\n\
				4774	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4775	are left untouched. Characters mapped to None are deleted.";
				4776
				4777	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4778	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4779	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4780	return PyUnicode_TranslateCharmap(self->str,
				4781	self->length,
				4782	table,
				4783	"ignore");
				4784	}
				4785
				4786	static char upper__doc__[] =
				4787	"S.upper() -> unicode\n\
				4788	\n\
				4789	Return a copy of S converted to uppercase.";
				4790
				4791	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4792	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4793	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4794	return fixup(self, fixupper);
				4795	}
				4796
				4797	#if 0
				4798	static char zfill__doc__[] =
				4799	"S.zfill(width) -> unicode\n\
				4800	\n\
				4801	Pad a numeric string x with zeros on the left, to fill a field\n\
				4802	of the specified width. The string x is never truncated.";
				4803
				4804	static PyObject *
				4805	unicode_zfill(PyUnicodeObject self, PyObject args)
				4806	{
				4807	int fill;
				4808	PyUnicodeObject *u;
				4809
				4810	int width;
				4811	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4812	return NULL;
				4813
				4814	if (self->length >= width) {
				4815	Py_INCREF(self);
				4816	return (PyObject*) self;
				4817	}
				4818
				4819	fill = width - self->length;
				4820
				4821	u = pad(self, fill, 0, '0');
				4822
				4823	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4824	/* move sign to beginning of string */
				4825	u->str[0] = u->str[fill];
				4826	u->str[fill] = '0';
				4827	}
				4828
				4829	return (PyObject*) u;
				4830	}
				4831	#endif
				4832
				4833	#if 0
				4834	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4835	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4836	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4837	return PyInt_FromLong(unicode_freelist_size);
				4838	}
				4839	#endif
				4840
				4841	static char startswith__doc__[] =
				4842	"S.startswith(prefix[, start[, end]]) -> int\n\
				4843	\n\
				4844	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4845	optional start, test S beginning at that position. With optional end, stop\n\
				4846	comparing S at that position.";
				4847
				4848	static PyObject *
				4849	unicode_startswith(PyUnicodeObject *self,
				4850	PyObject *args)
				4851	{
				4852	PyUnicodeObject *substring;
				4853	int start = 0;
				4854	int end = INT_MAX;
				4855	PyObject *result;
				4856
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4857	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4858	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4859	return NULL;
				4860	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4861	(PyObject *)substring);
				4862	if (substring == NULL)
				4863	return NULL;
				4864
				4865	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4866
				4867	Py_DECREF(substring);
				4868	return result;
				4869	}
				4870
				4871
				4872	static char endswith__doc__[] =
				4873	"S.endswith(suffix[, start[, end]]) -> int\n\
				4874	\n\
				4875	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4876	optional start, test S beginning at that position. With optional end, stop\n\
				4877	comparing S at that position.";
				4878
				4879	static PyObject *
				4880	unicode_endswith(PyUnicodeObject *self,
				4881	PyObject *args)
				4882	{
				4883	PyUnicodeObject *substring;
				4884	int start = 0;
				4885	int end = INT_MAX;
				4886	PyObject *result;
				4887
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4888	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4889	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4890	return NULL;
				4891	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4892	(PyObject *)substring);
				4893	if (substring == NULL)
				4894	return NULL;
				4895
				4896	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4897
				4898	Py_DECREF(substring);
				4899	return result;
				4900	}
				4901
				4902
				4903	static PyMethodDef unicode_methods[] = {
				4904
				4905	/* Order is according to common usage: often used methods should
				4906	appear first, since lookup is done sequentially. */
				4907
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4908	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				4909	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				4910	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				4911	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				4912	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				4913	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				4914	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				4915	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				4916	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				4917	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				4918	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				4919	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				4920	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
				4921	{"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
				4922	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				4923	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				4924	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				4925	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
				4926	{"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
				4927	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
				4928	{"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
				4929	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				4930	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				4931	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				4932	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				4933	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				4934	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				4935	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				4936	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				4937	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				4938	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				4939	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				4940	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				4941	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				4942	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4943	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4944	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
				4945	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4946	#endif
				4947
				4948	#if 0
				4949	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4950	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4951	#endif
				4952
				4953	{NULL, NULL}
				4954	};
				4955
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4956	static PySequenceMethods unicode_as_sequence = {
				4957	(inquiry) unicode_length, /* sq_length */
				4958	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4959	(intargfunc) unicode_repeat, /* sq_repeat */
				4960	(intargfunc) unicode_getitem, /* sq_item */
				4961	(intintargfunc) unicode_slice, /* sq_slice */
				4962	0, /* sq_ass_item */
				4963	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4964	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4965	};
				4966
				4967	static int
				4968	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4969	int index,
				4970	const void **ptr)
				4971	{
				4972	if (index != 0) {
				4973	PyErr_SetString(PyExc_SystemError,
				4974	"accessing non-existent unicode segment");
				4975	return -1;
				4976	}
				4977	ptr = (void ) self->str;
				4978	return PyUnicode_GET_DATA_SIZE(self);
				4979	}
				4980
				4981	static int
				4982	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4983	const void **ptr)
				4984	{
				4985	PyErr_SetString(PyExc_TypeError,
				4986	"cannot use unicode as modifyable buffer");
				4987	return -1;
				4988	}
				4989
				4990	static int
				4991	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4992	int *lenp)
				4993	{
				4994	if (lenp)
				4995	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4996	return 1;
				4997	}
				4998
				4999	static int
				5000	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				5001	int index,
				5002	const void **ptr)
				5003	{
				5004	PyObject *str;
				5005
				5006	if (index != 0) {
				5007	PyErr_SetString(PyExc_SystemError,
				5008	"accessing non-existent unicode segment");
				5009	return -1;
				5010	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5011	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5012	if (str == NULL)
				5013	return -1;
				5014	ptr = (void ) PyString_AS_STRING(str);
				5015	return PyString_GET_SIZE(str);
				5016	}
				5017
				5018	/* Helpers for PyUnicode_Format() */
				5019
				5020	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5021	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5022	{
				5023	int argidx = *p_argidx;
				5024	if (argidx < arglen) {
				5025	(*p_argidx)++;
				5026	if (arglen < 0)
				5027	return args;
				5028	else
				5029	return PyTuple_GetItem(args, argidx);
				5030	}
				5031	PyErr_SetString(PyExc_TypeError,
				5032	"not enough arguments for format string");
				5033	return NULL;
				5034	}
				5035
				5036	#define F_LJUST (1<<0)
				5037	#define F_SIGN (1<<1)
				5038	#define F_BLANK (1<<2)
				5039	#define F_ALT (1<<3)
				5040	#define F_ZERO (1<<4)
				5041
				5042	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5043	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5044	{
				5045	register int i;
				5046	int len;
				5047	va_list va;
				5048	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5049	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5050
				5051	/* First, format the string as char array, then expand to Py_UNICODE
				5052	array. */
				5053	charbuffer = (char *)buffer;
				5054	len = vsprintf(charbuffer, format, va);
				5055	for (i = len - 1; i >= 0; i--)
				5056	buffer[i] = (Py_UNICODE) charbuffer[i];
				5057
				5058	va_end(va);
				5059	return len;
				5060	}
				5061
				5062	static int
				5063	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5064	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5065	int flags,
				5066	int prec,
				5067	int type,
				5068	PyObject *v)
				5069	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5070	/* fmt = '%#.' + `prec` + `type`
				5071	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5072	char fmt[20];
				5073	double x;
				5074
				5075	x = PyFloat_AsDouble(v);
				5076	if (x == -1.0 && PyErr_Occurred())
				5077	return -1;
				5078	if (prec < 0)
				5079	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5080	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5081	type = 'g';
				5082	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5083	/* worst case length calc to ensure no buffer overrun:
				5084	fmt = %#.<prec>g
				5085	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5086	for any double rep.)
				5087	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5088	If prec=0 the effective precision is 1 (the leading digit is
				5089	always given), therefore increase by one to 10+prec. */
				5090	if (buflen <= (size_t)10 + (size_t)prec) {
				5091	PyErr_SetString(PyExc_OverflowError,
				5092	"formatted float is too long (precision too long?)");
				5093	return -1;
				5094	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5095	return usprintf(buf, fmt, x);
				5096	}
				5097
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5098	static PyObject*
				5099	formatlong(PyObject *val, int flags, int prec, int type)
				5100	{
				5101	char *buf;
				5102	int i, len;
				5103	PyObject str; / temporary string object. */
				5104	PyUnicodeObject *result;
				5105
				5106	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5107	if (!str)
				5108	return NULL;
				5109	result = _PyUnicode_New(len);
				5110	for (i = 0; i < len; i++)
				5111	result->str[i] = buf[i];
				5112	result->str[len] = 0;
				5113	Py_DECREF(str);
				5114	return (PyObject*)result;
				5115	}
				5116
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5117	static int
				5118	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5119	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5120	int flags,
				5121	int prec,
				5122	int type,
				5123	PyObject *v)
				5124	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5125	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5126	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5127	+ 1 + 1 = 24*/
				5128	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5129	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5130	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5131
				5132	x = PyInt_AsLong(v);
				5133	if (x == -1 && PyErr_Occurred())
				5134	return -1;
				5135	if (prec < 0)
				5136	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5137	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				5138	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				5139	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				5140	PyErr_SetString(PyExc_OverflowError,
				5141	"formatted integer is too long (precision too long?)");
				5142	return -1;
				5143	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5144	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				5145	* but we want it (for consistency with other %#x conversions, and
				5146	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5147	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				5148	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				5149	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5150	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5151	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				5152	/* Only way to know what the platform does is to try it. */
				5153	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				5154	if (fmt[1] != (char)type) {
				5155	/* Supply our own leading 0x/0X -- needed under std C */
				5156	use_native_c_format = 0;
				5157	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				5158	}
				5159	}
				5160	if (use_native_c_format)
				5161	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5162	return usprintf(buf, fmt, x);
				5163	}
				5164
				5165	static int
				5166	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5167	size_t buflen,
				5168	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5169	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5170	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5171	if (PyUnicode_Check(v)) {
				5172	if (PyUnicode_GET_SIZE(v) != 1)
				5173	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5174	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5175	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5176
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5177	else if (PyString_Check(v)) {
				5178	if (PyString_GET_SIZE(v) != 1)
				5179	goto onError;
				5180	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5181	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5182
				5183	else {
				5184	/* Integer input truncated to a character */
				5185	long x;
				5186	x = PyInt_AsLong(v);
				5187	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5188	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5189	buf[0] = (char) x;
				5190	}
				5191	buf[1] = '\0';
				5192	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5193
				5194	onError:
				5195	PyErr_SetString(PyExc_TypeError,
				5196	"%c requires int or char");
				5197	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5198	}
				5199
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5200	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				5201
				5202	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				5203	chars are formatted. XXX This is a magic number. Each formatting
				5204	routine does bounds checking to ensure no overflow, but a better
				5205	solution may be to malloc a buffer of appropriate size for each
				5206	format. For now, the current solution is sufficient.
				5207	*/
				5208	#define FORMATBUFLEN (size_t)120
				5209
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5210	PyObject PyUnicode_Format(PyObject format,
				5211	PyObject *args)
				5212	{
				5213	Py_UNICODE fmt, res;
				5214	int fmtcnt, rescnt, reslen, arglen, argidx;
				5215	int args_owned = 0;
				5216	PyUnicodeObject *result = NULL;
				5217	PyObject *dict = NULL;
				5218	PyObject *uformat;
				5219
				5220	if (format == NULL \|\| args == NULL) {
				5221	PyErr_BadInternalCall();
				5222	return NULL;
				5223	}
				5224	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5225	if (uformat == NULL)
				5226	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5227	fmt = PyUnicode_AS_UNICODE(uformat);
				5228	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5229
				5230	reslen = rescnt = fmtcnt + 100;
				5231	result = _PyUnicode_New(reslen);
				5232	if (result == NULL)
				5233	goto onError;
				5234	res = PyUnicode_AS_UNICODE(result);
				5235
				5236	if (PyTuple_Check(args)) {
				5237	arglen = PyTuple_Size(args);
				5238	argidx = 0;
				5239	}
				5240	else {
				5241	arglen = -1;
				5242	argidx = -2;
				5243	}
				5244	if (args->ob_type->tp_as_mapping)
				5245	dict = args;
				5246
				5247	while (--fmtcnt >= 0) {
				5248	if (*fmt != '%') {
				5249	if (--rescnt < 0) {
				5250	rescnt = fmtcnt + 100;
				5251	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5252	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5253	return NULL;
				5254	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5255	--rescnt;
				5256	}
				5257	res++ = fmt++;
				5258	}
				5259	else {
				5260	/* Got a format specifier */
				5261	int flags = 0;
				5262	int width = -1;
				5263	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5264	Py_UNICODE c = '\0';
				5265	Py_UNICODE fill;
				5266	PyObject *v = NULL;
				5267	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5268	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5269	Py_UNICODE sign;
				5270	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5271	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5272
				5273	fmt++;
				5274	if (*fmt == '(') {
				5275	Py_UNICODE *keystart;
				5276	int keylen;
				5277	PyObject *key;
				5278	int pcount = 1;
				5279
				5280	if (dict == NULL) {
				5281	PyErr_SetString(PyExc_TypeError,
				5282	"format requires a mapping");
				5283	goto onError;
				5284	}
				5285	++fmt;
				5286	--fmtcnt;
				5287	keystart = fmt;
				5288	/* Skip over balanced parentheses */
				5289	while (pcount > 0 && --fmtcnt >= 0) {
				5290	if (*fmt == ')')
				5291	--pcount;
				5292	else if (*fmt == '(')
				5293	++pcount;
				5294	fmt++;
				5295	}
				5296	keylen = fmt - keystart - 1;
				5297	if (fmtcnt < 0 \|\| pcount > 0) {
				5298	PyErr_SetString(PyExc_ValueError,
				5299	"incomplete format key");
				5300	goto onError;
				5301	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5302	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5303	then looked up since Python uses strings to hold
				5304	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5305	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5306	key = PyUnicode_EncodeUTF8(keystart,
				5307	keylen,
				5308	NULL);
				5309	if (key == NULL)
				5310	goto onError;
				5311	if (args_owned) {
				5312	Py_DECREF(args);
				5313	args_owned = 0;
				5314	}
				5315	args = PyObject_GetItem(dict, key);
				5316	Py_DECREF(key);
				5317	if (args == NULL) {
				5318	goto onError;
				5319	}
				5320	args_owned = 1;
				5321	arglen = -1;
				5322	argidx = -2;
				5323	}
				5324	while (--fmtcnt >= 0) {
				5325	switch (c = *fmt++) {
				5326	case '-': flags \|= F_LJUST; continue;
				5327	case '+': flags \|= F_SIGN; continue;
				5328	case ' ': flags \|= F_BLANK; continue;
				5329	case '#': flags \|= F_ALT; continue;
				5330	case '0': flags \|= F_ZERO; continue;
				5331	}
				5332	break;
				5333	}
				5334	if (c == '*') {
				5335	v = getnextarg(args, arglen, &argidx);
				5336	if (v == NULL)
				5337	goto onError;
				5338	if (!PyInt_Check(v)) {
				5339	PyErr_SetString(PyExc_TypeError,
				5340	"* wants int");
				5341	goto onError;
				5342	}
				5343	width = PyInt_AsLong(v);
				5344	if (width < 0) {
				5345	flags \|= F_LJUST;
				5346	width = -width;
				5347	}
				5348	if (--fmtcnt >= 0)
				5349	c = *fmt++;
				5350	}
				5351	else if (c >= '0' && c <= '9') {
				5352	width = c - '0';
				5353	while (--fmtcnt >= 0) {
				5354	c = *fmt++;
				5355	if (c < '0' \|\| c > '9')
				5356	break;
				5357	if ((width*10) / 10 != width) {
				5358	PyErr_SetString(PyExc_ValueError,
				5359	"width too big");
				5360	goto onError;
				5361	}
				5362	width = width*10 + (c - '0');
				5363	}
				5364	}
				5365	if (c == '.') {
				5366	prec = 0;
				5367	if (--fmtcnt >= 0)
				5368	c = *fmt++;
				5369	if (c == '*') {
				5370	v = getnextarg(args, arglen, &argidx);
				5371	if (v == NULL)
				5372	goto onError;
				5373	if (!PyInt_Check(v)) {
				5374	PyErr_SetString(PyExc_TypeError,
				5375	"* wants int");
				5376	goto onError;
				5377	}
				5378	prec = PyInt_AsLong(v);
				5379	if (prec < 0)
				5380	prec = 0;
				5381	if (--fmtcnt >= 0)
				5382	c = *fmt++;
				5383	}
				5384	else if (c >= '0' && c <= '9') {
				5385	prec = c - '0';
				5386	while (--fmtcnt >= 0) {
				5387	c = Py_CHARMASK(*fmt++);
				5388	if (c < '0' \|\| c > '9')
				5389	break;
				5390	if ((prec*10) / 10 != prec) {
				5391	PyErr_SetString(PyExc_ValueError,
				5392	"prec too big");
				5393	goto onError;
				5394	}
				5395	prec = prec*10 + (c - '0');
				5396	}
				5397	}
				5398	} /* prec */
				5399	if (fmtcnt >= 0) {
				5400	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5401	if (--fmtcnt >= 0)
				5402	c = *fmt++;
				5403	}
				5404	}
				5405	if (fmtcnt < 0) {
				5406	PyErr_SetString(PyExc_ValueError,
				5407	"incomplete format");
				5408	goto onError;
				5409	}
				5410	if (c != '%') {
				5411	v = getnextarg(args, arglen, &argidx);
				5412	if (v == NULL)
				5413	goto onError;
				5414	}
				5415	sign = 0;
				5416	fill = ' ';
				5417	switch (c) {
				5418
				5419	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5420	pbuf = formatbuf;
				5421	/* presume that buffer length is at least 1 */
				5422	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5423	len = 1;
				5424	break;
				5425
				5426	case 's':
				5427	case 'r':
				5428	if (PyUnicode_Check(v) && c == 's') {
				5429	temp = v;
				5430	Py_INCREF(temp);
				5431	}
				5432	else {
				5433	PyObject *unicode;
				5434	if (c == 's')
				5435	temp = PyObject_Str(v);
				5436	else
				5437	temp = PyObject_Repr(v);
				5438	if (temp == NULL)
				5439	goto onError;
				5440	if (!PyString_Check(temp)) {
				5441	/* XXX Note: this should never happen, since
				5442	PyObject_Repr() and PyObject_Str() assure
				5443	this */
				5444	Py_DECREF(temp);
				5445	PyErr_SetString(PyExc_TypeError,
				5446	"%s argument has non-string str()");
				5447	goto onError;
				5448	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5449	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5450	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5451	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5452	"strict");
				5453	Py_DECREF(temp);
				5454	temp = unicode;
				5455	if (temp == NULL)
				5456	goto onError;
				5457	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5458	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5459	len = PyUnicode_GET_SIZE(temp);
				5460	if (prec >= 0 && len > prec)
				5461	len = prec;
				5462	break;
				5463
				5464	case 'i':
				5465	case 'd':
				5466	case 'u':
				5467	case 'o':
				5468	case 'x':
				5469	case 'X':
				5470	if (c == 'i')
				5471	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5472	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5473	temp = formatlong(v, flags, prec, c);
				5474	if (!temp)
				5475	goto onError;
				5476	pbuf = PyUnicode_AS_UNICODE(temp);
				5477	len = PyUnicode_GET_SIZE(temp);
				5478	/* unbounded ints can always produce
				5479	a sign character! */
				5480	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5481	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5482	else {
				5483	pbuf = formatbuf;
				5484	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5485	flags, prec, c, v);
				5486	if (len < 0)
				5487	goto onError;
				5488	/* only d conversion is signed */
				5489	sign = c == 'd';
				5490	}
				5491	if (flags & F_ZERO)
				5492	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5493	break;
				5494
				5495	case 'e':
				5496	case 'E':
				5497	case 'f':
				5498	case 'g':
				5499	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5500	pbuf = formatbuf;
				5501	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5502	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5503	if (len < 0)
				5504	goto onError;
				5505	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5506	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5507	fill = '0';
				5508	break;
				5509
				5510	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5511	pbuf = formatbuf;
				5512	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5513	if (len < 0)
				5514	goto onError;
				5515	break;
				5516
				5517	default:
				5518	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5519	"unsupported format character '%c' (0x%x) "
				5520	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5521	(31<=c && c<=126) ? c : '?',
				5522	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5523	goto onError;
				5524	}
				5525	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5526	if (pbuf == '-' \|\| pbuf == '+') {
				5527	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5528	len--;
				5529	}
				5530	else if (flags & F_SIGN)
				5531	sign = '+';
				5532	else if (flags & F_BLANK)
				5533	sign = ' ';
				5534	else
				5535	sign = 0;
				5536	}
				5537	if (width < len)
				5538	width = len;
				5539	if (rescnt < width + (sign != 0)) {
				5540	reslen -= rescnt;
				5541	rescnt = width + fmtcnt + 100;
				5542	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5543	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5544	return NULL;
				5545	res = PyUnicode_AS_UNICODE(result)
				5546	+ reslen - rescnt;
				5547	}
				5548	if (sign) {
				5549	if (fill != ' ')
				5550	*res++ = sign;
				5551	rescnt--;
				5552	if (width > len)
				5553	width--;
				5554	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5555	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5556	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5557	assert(pbuf[1] == c);
				5558	if (fill != ' ') {
				5559	res++ = pbuf++;
				5560	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5561	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5562	rescnt -= 2;
				5563	width -= 2;
				5564	if (width < 0)
				5565	width = 0;
				5566	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5567	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5568	if (width > len && !(flags & F_LJUST)) {
				5569	do {
				5570	--rescnt;
				5571	*res++ = fill;
				5572	} while (--width > len);
				5573	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5574	if (fill == ' ') {
				5575	if (sign)
				5576	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5577	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5578	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5579	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5580	res++ = pbuf++;
				5581	res++ = pbuf++;
				5582	}
				5583	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5584	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5585	res += len;
				5586	rescnt -= len;
				5587	while (--width >= len) {
				5588	--rescnt;
				5589	*res++ = ' ';
				5590	}
				5591	if (dict && (argidx < arglen) && c != '%') {
				5592	PyErr_SetString(PyExc_TypeError,
				5593	"not all arguments converted");
				5594	goto onError;
				5595	}
				5596	Py_XDECREF(temp);
				5597	} /* '%' */
				5598	} /* until end */
				5599	if (argidx < arglen && !dict) {
				5600	PyErr_SetString(PyExc_TypeError,
				5601	"not all arguments converted");
				5602	goto onError;
				5603	}
				5604
				5605	if (args_owned) {
				5606	Py_DECREF(args);
				5607	}
				5608	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5609	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5610	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5611	return (PyObject *)result;
				5612
				5613	onError:
				5614	Py_XDECREF(result);
				5615	Py_DECREF(uformat);
				5616	if (args_owned) {
				5617	Py_DECREF(args);
				5618	}
				5619	return NULL;
				5620	}
				5621
				5622	static PyBufferProcs unicode_as_buffer = {
				5623	(getreadbufferproc) unicode_buffer_getreadbuf,
				5624	(getwritebufferproc) unicode_buffer_getwritebuf,
				5625	(getsegcountproc) unicode_buffer_getsegcount,
				5626	(getcharbufferproc) unicode_buffer_getcharbuf,
				5627	};
				5628
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5629	staticforward PyObject *
				5630	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5631
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5632	static PyObject *
				5633	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5634	{
				5635	PyObject *x = NULL;
				5636	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5637	char *encoding = NULL;
				5638	char *errors = NULL;
				5639
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5640	if (type != &PyUnicode_Type)
				5641	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5642	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5643	kwlist, &x, &encoding, &errors))
				5644	return NULL;
				5645	if (x == NULL)
				5646	return (PyObject *)_PyUnicode_New(0);
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame^]	5647	if (encoding == NULL && errors == NULL)
				5648	return PyObject_Unicode(x);
				5649	else
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5650	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5651	}
				5652
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5653	static PyObject *
				5654	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5655	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5656	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5657	int n;
				5658
				5659	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5660	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5661	if (tmp == NULL)
				5662	return NULL;
				5663	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5664	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5665	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5666	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5667	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5668	if (pnew->str == NULL) {
				5669	_Py_ForgetReference((PyObject *)pnew);
				5670	PyObject_DEL(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5671	return NULL;
				5672	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5673	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5674	pnew->length = n;
				5675	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5676	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5677	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5678	}
				5679
/* Documentation string of the unicode type (used as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
				5686
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5687	PyTypeObject PyUnicode_Type = {
				5688	PyObject_HEAD_INIT(&PyType_Type)
				5689	0, /* ob_size */
				5690	"unicode", /* tp_name */
				5691	sizeof(PyUnicodeObject), /* tp_size */
				5692	0, /* tp_itemsize */
				5693	/* Slots */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5694	(destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5695	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5696	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5697	0, /* tp_setattr */
				5698	(cmpfunc) unicode_compare, /* tp_compare */
				5699	(reprfunc) unicode_repr, /* tp_repr */
				5700	0, /* tp_as_number */
				5701	&unicode_as_sequence, /* tp_as_sequence */
				5702	0, /* tp_as_mapping */
				5703	(hashfunc) unicode_hash, /* tp_hash*/
				5704	0, /* tp_call*/
				5705	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5706	PyObject_GenericGetAttr, /* tp_getattro */
				5707	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5708	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5709	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5710	unicode_doc, /* tp_doc */
				5711	0, /* tp_traverse */
				5712	0, /* tp_clear */
				5713	0, /* tp_richcompare */
				5714	0, /* tp_weaklistoffset */
				5715	0, /* tp_iter */
				5716	0, /* tp_iternext */
				5717	unicode_methods, /* tp_methods */
				5718	0, /* tp_members */
				5719	0, /* tp_getset */
				5720	0, /* tp_base */
				5721	0, /* tp_dict */
				5722	0, /* tp_descr_get */
				5723	0, /* tp_descr_set */
				5724	0, /* tp_dictoffset */
				5725	0, /* tp_init */
				5726	0, /* tp_alloc */
				5727	unicode_new, /* tp_new */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5728	_PyObject_Del, /* tp_free */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5729	};
				5730
				5731	/* Initialize the Unicode implementation */
				5732
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5733	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5734	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5735	int i;
				5736
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5737	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5738	unicode_freelist = NULL;
				5739	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5740	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5741	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5742	for (i = 0; i < 256; i++)
				5743	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5744	}
				5745
				5746	/* Finalize the Unicode implementation */
				5747
				5748	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5749	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5750	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5751	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5752	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5753
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5754	Py_XDECREF(unicode_empty);
				5755	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5756
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5757	for (i = 0; i < 256; i++) {
				5758	if (unicode_latin1[i]) {
				5759	Py_DECREF(unicode_latin1[i]);
				5760	unicode_latin1[i] = NULL;
				5761	}
				5762	}
				5763
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5764	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5765	PyUnicodeObject *v = u;
				5766	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5767	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5768	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5769	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5770	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5771	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5772	unicode_freelist = NULL;
				5773	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5774	}