Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: df8592d55e89b9ab43a0ee615ada345cfb2e4253 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Limit for the Unicode object free list: _PyUnicode_Free() only puts
   an object on the free list while unicode_freelist_size is below
   this value. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   The limit is compared against the object's length field, i.e. it
   is counted in Py_UNICODE elements.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects: unicode_freelist is the head of the
   list of recycled objects, unicode_freelist_size the number of
   objects currently on it. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's ordinal;
   an entry stays NULL until that character is first requested. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	401	const char *s = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	405	int reclevel;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	406
				407	if (obj == NULL) {
				408	PyErr_BadInternalCall();
				409	return NULL;
				410	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	411
				412	/* Coerce object */
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	413	for (reclevel = 0; reclevel < 2; reclevel++) {
				414
				415	if (PyUnicode_Check(obj)) {
				416	if (encoding) {
				417	PyErr_SetString(PyExc_TypeError,
				418	"decoding Unicode is not supported");
				419	goto onError;
				420	}
				421	if (PyUnicode_CheckExact(obj)) {
				422	Py_INCREF(obj);
				423	v = obj;
				424	}
				425	else {
				426	/* For a subclass of unicode, return a true unicode object
				427	with the same string value. */
				428	v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				429	PyUnicode_GET_SIZE(obj));
				430	}
				431	goto done;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	432	}
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	433	else if (PyString_Check(obj)) {
				434	s = PyString_AS_STRING(obj);
				435	len = PyString_GET_SIZE(obj);
				436	break;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	437	}
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	438	else {
				439	PyObject *w;
				440
				441	/* Try char buffer interface */
				442	if (PyObject_AsCharBuffer(obj, &s, &len))
				443	PyErr_Clear();
				444	else
				445	break;
				446
				447	/* Mimic the behaviour of str(object) if everything else
				448	fails (see PyObject_Str()); this also covers instances
				449	which implement __str__. */
				450	if (obj->ob_type->tp_str == NULL)
				451	w = PyObject_Repr(obj);
				452	else
				453	w = (*obj->ob_type->tp_str)(obj);
				454	if (w == NULL)
				455	goto onError;
				456	if (owned) {
				457	Py_DECREF(obj);
				458	}
				459	obj = w;
				460	owned = 1;
Tim Peters	78e0fc7	2001-09-11 03:07:38 +0000	[diff] [blame]	461	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	462	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	463
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	464	if (s == NULL) {
				465	PyErr_Format(PyExc_TypeError,
				466	"coercing to Unicode: __str__ recursion limit exceeded "
				467	"(last type: %.80s)",
				468	obj->ob_type->tp_name);
				469	goto onError;
				470	}
				471
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	472	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	473	if (len == 0) {
				474	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	475	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	476	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	477	else
				478	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	479
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	480	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	481	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	482	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	483	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	484	return v;
				485
				486	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	487	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	488	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	489	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	490	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	491	}
				492
				493	PyObject PyUnicode_Decode(const char s,
				494	int size,
				495	const char *encoding,
				496	const char *errors)
				497	{
				498	PyObject buffer = NULL, unicode;
				499
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	500	if (encoding == NULL)
				501	encoding = PyUnicode_GetDefaultEncoding();
				502
				503	/* Shortcuts for common default encodings */
				504	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	505	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	506	else if (strcmp(encoding, "latin-1") == 0)
				507	return PyUnicode_DecodeLatin1(s, size, errors);
				508	else if (strcmp(encoding, "ascii") == 0)
				509	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	510
				511	/* Decode via the codec registry */
				512	buffer = PyBuffer_FromMemory((void *)s, size);
				513	if (buffer == NULL)
				514	goto onError;
				515	unicode = PyCodec_Decode(buffer, encoding, errors);
				516	if (unicode == NULL)
				517	goto onError;
				518	if (!PyUnicode_Check(unicode)) {
				519	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	520	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	521	unicode->ob_type->tp_name);
				522	Py_DECREF(unicode);
				523	goto onError;
				524	}
				525	Py_DECREF(buffer);
				526	return unicode;
				527
				528	onError:
				529	Py_XDECREF(buffer);
				530	return NULL;
				531	}
				532
				533	PyObject PyUnicode_Encode(const Py_UNICODE s,
				534	int size,
				535	const char *encoding,
				536	const char *errors)
				537	{
				538	PyObject v, unicode;
				539
				540	unicode = PyUnicode_FromUnicode(s, size);
				541	if (unicode == NULL)
				542	return NULL;
				543	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				544	Py_DECREF(unicode);
				545	return v;
				546	}
				547
				548	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				549	const char *encoding,
				550	const char *errors)
				551	{
				552	PyObject *v;
				553
				554	if (!PyUnicode_Check(unicode)) {
				555	PyErr_BadArgument();
				556	goto onError;
				557	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	558
				559	if (encoding == NULL)
				560	encoding = PyUnicode_GetDefaultEncoding();
				561
				562	/* Shortcuts for common default encodings */
				563	if (errors == NULL) {
				564	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	565	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	566	else if (strcmp(encoding, "latin-1") == 0)
				567	return PyUnicode_AsLatin1String(unicode);
				568	else if (strcmp(encoding, "ascii") == 0)
				569	return PyUnicode_AsASCIIString(unicode);
				570	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	571
				572	/* Encode via the codec registry */
				573	v = PyCodec_Encode(unicode, encoding, errors);
				574	if (v == NULL)
				575	goto onError;
				576	/* XXX Should we really enforce this ? */
				577	if (!PyString_Check(v)) {
				578	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	579	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	580	v->ob_type->tp_name);
				581	Py_DECREF(v);
				582	goto onError;
				583	}
				584	return v;
				585
				586	onError:
				587	return NULL;
				588	}
				589
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	590	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				591	const char *errors)
				592	{
				593	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				594
				595	if (v)
				596	return v;
				597	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				598	if (v && errors == NULL)
				599	((PyUnicodeObject *)unicode)->defenc = v;
				600	return v;
				601	}
				602
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	603	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				604	{
				605	if (!PyUnicode_Check(unicode)) {
				606	PyErr_BadArgument();
				607	goto onError;
				608	}
				609	return PyUnicode_AS_UNICODE(unicode);
				610
				611	onError:
				612	return NULL;
				613	}
				614
				615	int PyUnicode_GetSize(PyObject *unicode)
				616	{
				617	if (!PyUnicode_Check(unicode)) {
				618	PyErr_BadArgument();
				619	goto onError;
				620	}
				621	return PyUnicode_GET_SIZE(unicode);
				622
				623	onError:
				624	return -1;
				625	}
				626
/* Return the current default encoding name.  The returned pointer
   refers to the file-level unicode_default_encoding buffer itself,
   not to a copy. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
				631
				632	int PyUnicode_SetDefaultEncoding(const char *encoding)
				633	{
				634	PyObject *v;
				635
				636	/* Make sure the encoding is valid. As side effect, this also
				637	loads the encoding into the codec registry cache. */
				638	v = _PyCodec_Lookup(encoding);
				639	if (v == NULL)
				640	goto onError;
				641	Py_DECREF(v);
				642	strncpy(unicode_default_encoding,
				643	encoding,
				644	sizeof(unicode_default_encoding));
				645	return 0;
				646
				647	onError:
				648	return -1;
				649	}
				650
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Classification table for the 128 ASCII code points, indexed by
   character value; see the legend inside the initializer. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True if character c is special: c is above 127 or is class 1 in
   utf7_special, or is class 2 (whitespace) and encodeWS is set, or is
   class 3 (Set O) and encodeO is set.  c is evaluated several
   times. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to the corresponding base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* True if c is a base64 alphabet character (alphanumeric, '+' or '/'). */
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
/* Map a base64 character back to its 6-bit value: '+' -> 62,
   '/' -> 63, 'a'..'z' -> 26..51, 'A'..'Z' -> 0..25 and, for anything
   below 'A', c + 4 (which gives 52..61 for '0'..'9').  The argument is
   not validated; use B64CHAR() first. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* While at least 6 bits are pending in ch, write one base64 character
   through out (advancing it) for the topmost pending 6 bits and
   reduce bits by 6.  The macro expands to a bare while statement. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* While at least 16 bits are pending in ch, extract the topmost
   pending 16-bit unit and reduce bits by 16.  The unit is written
   through out unless it is skipped because surrogate was set or it
   falls in 0xDC00..0xDFFF; in the latter case errmsg is set and
   control jumps to utf7Error, so both an `errmsg` variable and a
   `utf7Error` label must exist at the point of use.
   NOTE(review): 0xDC00..0xDFFF is the low-surrogate range while the
   comments below talk about the high surrogate -- confirm which range
   is intended. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    } \

				709	static
				710	int utf7_decoding_error(Py_UNICODE **dest,
				711	const char *errors,
				712	const char *details)
				713	{
				714	if ((errors == NULL) \|\|
				715	(strcmp(errors,"strict") == 0)) {
				716	PyErr_Format(PyExc_UnicodeError,
				717	"UTF-7 decoding error: %.400s",
				718	details);
				719	return -1;
				720	}
				721	else if (strcmp(errors,"ignore") == 0) {
				722	return 0;
				723	}
				724	else if (strcmp(errors,"replace") == 0) {
				725	if (dest != NULL) {
				726	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				727	(*dest)++;
				728	}
				729	return 0;
				730	}
				731	else {
				732	PyErr_Format(PyExc_ValueError,
				733	"UTF-7 decoding error; unknown error handling code: %.400s",
				734	errors);
				735	return -1;
				736	}
				737	}
				738
				739	PyObject PyUnicode_DecodeUTF7(const char s,
				740	int size,
				741	const char *errors)
				742	{
				743	const char *e;
				744	PyUnicodeObject *unicode;
				745	Py_UNICODE *p;
				746	const char *errmsg = "";
				747	int inShift = 0;
				748	unsigned int bitsleft = 0;
				749	unsigned long charsleft = 0;
				750	int surrogate = 0;
				751
				752	unicode = _PyUnicode_New(size);
				753	if (!unicode)
				754	return NULL;
				755	if (size == 0)
				756	return (PyObject *)unicode;
				757
				758	p = unicode->str;
				759	e = s + size;
				760
				761	while (s < e) {
				762	Py_UNICODE ch = *s;
				763
				764	if (inShift) {
				765	if ((ch == '-') \|\| !B64CHAR(ch)) {
				766	inShift = 0;
				767	s++;
				768
				769	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				770	if (bitsleft >= 6) {
				771	/* The shift sequence has a partial character in it. If
				772	bitsleft < 6 then we could just classify it as padding
				773	but that is not the case here */
				774
				775	errmsg = "partial character in shift sequence";
				776	goto utf7Error;
				777	}
				778	/* According to RFC2152 the remaining bits should be zero. We
				779	choose to signal an error/insert a replacement character
				780	here so indicate the potential of a misencoded character. */
				781
				782	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				783	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				784	errmsg = "non-zero padding bits in shift sequence";
				785	goto utf7Error;
				786	}
				787
				788	if (ch == '-') {
				789	if ((s < e) && (*(s) == '-')) {
				790	*p++ = '-';
				791	inShift = 1;
				792	}
				793	} else if (SPECIAL(ch,0,0)) {
				794	errmsg = "unexpected special character";
				795	goto utf7Error;
				796	} else {
				797	*p++ = ch;
				798	}
				799	} else {
				800	charsleft = (charsleft << 6) \| UB64(ch);
				801	bitsleft += 6;
				802	s++;
				803	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				804	}
				805	}
				806	else if ( ch == '+' ) {
				807	s++;
				808	if (s < e && *s == '-') {
				809	s++;
				810	*p++ = '+';
				811	} else
				812	{
				813	inShift = 1;
				814	bitsleft = 0;
				815	}
				816	}
				817	else if (SPECIAL(ch,0,0)) {
				818	errmsg = "unexpected special character";
				819	s++;
				820	goto utf7Error;
				821	}
				822	else {
				823	*p++ = ch;
				824	s++;
				825	}
				826	continue;
				827	utf7Error:
				828	if (utf7_decoding_error(&p, errors, errmsg))
				829	goto onError;
				830	}
				831
				832	if (inShift) {
				833	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				834	goto onError;
				835	}
				836
				837	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				838	goto onError;
				839
				840	return (PyObject *)unicode;
				841
				842	onError:
				843	Py_DECREF(unicode);
				844	return NULL;
				845	}
				846
				847
				848	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				849	int size,
				850	int encodeSetO,
				851	int encodeWhiteSpace,
				852	const char *errors)
				853	{
				854	PyObject *v;
				855	/* It might be possible to tighten this worst case */
				856	unsigned int cbAllocated = 5 * size;
				857	int inShift = 0;
				858	int i = 0;
				859	unsigned int bitsleft = 0;
				860	unsigned long charsleft = 0;
				861	char * out;
				862	char * start;
				863
				864	if (size == 0)
				865	return PyString_FromStringAndSize(NULL, 0);
				866
				867	v = PyString_FromStringAndSize(NULL, cbAllocated);
				868	if (v == NULL)
				869	return NULL;
				870
				871	start = out = PyString_AS_STRING(v);
				872	for (;i < size; ++i) {
				873	Py_UNICODE ch = s[i];
				874
				875	if (!inShift) {
				876	if (ch == '+') {
				877	*out++ = '+';
				878	*out++ = '-';
				879	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				880	charsleft = ch;
				881	bitsleft = 16;
				882	*out++ = '+';
				883	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				884	inShift = bitsleft > 0;
				885	} else {
				886	*out++ = (char) ch;
				887	}
				888	} else {
				889	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				890	*out++ = B64(charsleft << (6-bitsleft));
				891	charsleft = 0;
				892	bitsleft = 0;
				893	/* Characters not in the BASE64 set implicitly unshift the sequence
				894	so no '-' is required, except if the character is itself a '-' */
				895	if (B64CHAR(ch) \|\| ch == '-') {
				896	*out++ = '-';
				897	}
				898	inShift = 0;
				899	*out++ = (char) ch;
				900	} else {
				901	bitsleft += 16;
				902	charsleft = (charsleft << 16) \| ch;
				903	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				904
				905	/* If the next character is special then we dont' need to terminate
				906	the shift sequence. If the next character is not a BASE64 character
				907	or '-' then the shift sequence will be terminated implicitly and we
				908	don't have to insert a '-'. */
				909
				910	if (bitsleft == 0) {
				911	if (i + 1 < size) {
				912	Py_UNICODE ch2 = s[i+1];
				913
				914	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				915
				916	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				917	*out++ = '-';
				918	inShift = 0;
				919	} else {
				920	inShift = 0;
				921	}
				922
				923	}
				924	else {
				925	*out++ = '-';
				926	inShift = 0;
				927	}
				928	}
				929	}
				930	}
				931	}
				932	if (bitsleft) {
				933	*out++= B64(charsleft << (6-bitsleft) );
				934	*out++ = '-';
				935	}
				936
				937	if (_PyString_Resize(&v, out - start)) {
				938	Py_DECREF(v);
				939	return NULL;
				940	}
				941	return v;
				942	}
				943
				944	#undef SPECIAL
				945	#undef B64
				946	#undef B64CHAR
				947	#undef UB64
				948	#undef ENCODE
				949	#undef DECODE
				950
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951	/* --- UTF-8 Codec -------------------------------------------------------- */
				952
				953	static
				954	char utf8_code_length[256] = {
				955	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				956	illegal prefix. see RFC 2279 for details */
				957	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				958	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				959	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				960	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				961	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				962	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				963	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				964	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				965	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				966	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				967	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				968	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				969	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				970	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				971	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				972	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				973	};
				974
				975	static
				976	int utf8_decoding_error(const char **source,
				977	Py_UNICODE **dest,
				978	const char *errors,
				979	const char *details)
				980	{
				981	if ((errors == NULL) \|\|
				982	(strcmp(errors,"strict") == 0)) {
				983	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	984	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	985	details);
				986	return -1;
				987	}
				988	else if (strcmp(errors,"ignore") == 0) {
				989	(*source)++;
				990	return 0;
				991	}
				992	else if (strcmp(errors,"replace") == 0) {
				993	(*source)++;
				994	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				995	(*dest)++;
				996	return 0;
				997	}
				998	else {
				999	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1000	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1001	errors);
				1002	return -1;
				1003	}
				1004	}
				1005
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1006	PyObject PyUnicode_DecodeUTF8(const char s,
				1007	int size,
				1008	const char *errors)
				1009	{
				1010	int n;
				1011	const char *e;
				1012	PyUnicodeObject *unicode;
				1013	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1014	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1015
				1016	/* Note: size will always be longer than the resulting Unicode
				1017	character count */
				1018	unicode = _PyUnicode_New(size);
				1019	if (!unicode)
				1020	return NULL;
				1021	if (size == 0)
				1022	return (PyObject *)unicode;
				1023
				1024	/* Unpack UTF-8 encoded data */
				1025	p = unicode->str;
				1026	e = s + size;
				1027
				1028	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1029	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1030
				1031	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1032	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1033	s++;
				1034	continue;
				1035	}
				1036
				1037	n = utf8_code_length[ch];
				1038
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1039	if (s + n > e) {
				1040	errmsg = "unexpected end of data";
				1041	goto utf8Error;
				1042	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043
				1044	switch (n) {
				1045
				1046	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1047	errmsg = "unexpected code byte";
				1048	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1049
				1050	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1051	errmsg = "internal error";
				1052	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053
				1054	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1055	if ((s[1] & 0xc0) != 0x80) {
				1056	errmsg = "invalid data";
				1057	goto utf8Error;
				1058	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1059	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1060	if (ch < 0x80) {
				1061	errmsg = "illegal encoding";
				1062	goto utf8Error;
				1063	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1064	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1065	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1066	break;
				1067
				1068	case 3:
				1069	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1070	(s[2] & 0xc0) != 0x80) {
				1071	errmsg = "invalid data";
				1072	goto utf8Error;
				1073	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1074	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1075	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				1076	errmsg = "illegal encoding";
				1077	goto utf8Error;
				1078	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1080	*p++ = (Py_UNICODE)ch;
				1081	break;
				1082
				1083	case 4:
				1084	if ((s[1] & 0xc0) != 0x80 \|\|
				1085	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1086	(s[3] & 0xc0) != 0x80) {
				1087	errmsg = "invalid data";
				1088	goto utf8Error;
				1089	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1090	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1091	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1092	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1093	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1094	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1095	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1096	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1097	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1098	errmsg = "illegal encoding";
				1099	goto utf8Error;
				1100	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1101	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1102	*p++ = (Py_UNICODE)ch;
				1103	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1104	/* compute and append the two surrogates: */
				1105
				1106	/* translate from 10000..10FFFF to 0..FFFF */
				1107	ch -= 0x10000;
				1108
				1109	/* high surrogate = top 10 bits added to D800 */
				1110	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1111
				1112	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1113	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1114	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	break;
				1116
				1117	default:
				1118	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1119	errmsg = "unsupported Unicode code range";
				1120	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1121	}
				1122	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1123	continue;
				1124
				1125	utf8Error:
				1126	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1127	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	}
				1129
				1130	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1131	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	goto onError;
				1133
				1134	return (PyObject *)unicode;
				1135
				1136	onError:
				1137	Py_DECREF(unicode);
				1138	return NULL;
				1139	}
				1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding problem (disabled code).

   source  -- not touched by this helper.
   dest    -- address of the output cursor; "replace" stores '?' there
              and advances it.
   errors  -- NULL or "strict": raise UnicodeError built from `details`.
              "ignore": do nothing.
              anything else: raise ValueError naming the unknown policy.

   Returns 0 when encoding may continue, -1 when an exception was set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1174
				1175	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				1176	int size,
				1177	const char *errors)
				1178	{
				1179	PyObject *v;
				1180	char *p;
				1181	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1182	Py_UCS4 ch2;
				1183	unsigned int cbAllocated = 3 * size;
				1184	unsigned int cbWritten = 0;
				1185	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1186
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1187	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1188	if (v == NULL)
				1189	return NULL;
				1190	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1191	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1192
				1193	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1194	while (i < size) {
				1195	Py_UCS4 ch = s[i++];
				1196	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1197	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1198	cbWritten++;
				1199	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1200	else if (ch < 0x0800) {
				1201	*p++ = 0xc0 \| (ch >> 6);
				1202	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1203	cbWritten += 2;
				1204	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1205	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1206	/* Check for high surrogate */
				1207	if (0xD800 <= ch && ch <= 0xDBFF) {
				1208	if (i != size) {
				1209	ch2 = s[i];
				1210	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				1211
				1212	if (cbWritten >= (cbAllocated - 4)) {
				1213	/* Provide enough room for some more
				1214	surrogates */
				1215	cbAllocated += 4*10;
				1216	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1217	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1218	}
				1219
				1220	/* combine the two values */
				1221	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				1222
				1223	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1224	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1225	i++;
				1226	cbWritten += 4;
				1227	}
				1228	}
				1229	}
				1230	else {
				1231	*p++ = (char)(0xe0 \| (ch >> 12));
				1232	cbWritten += 3;
				1233	}
				1234	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1235	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1236	} else {
				1237	*p++ = 0xf0 \| (ch>>18);
				1238	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				1239	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				1240	*p++ = 0x80 \| (ch & 0x3f);
				1241	cbWritten += 4;
				1242	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1243	}
				1244	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1245	if (_PyString_Resize(&v, p - q))
				1246	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1247	return v;
				1248
				1249	onError:
				1250	Py_DECREF(v);
				1251	return NULL;
				1252	}
				1253
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1254	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1255	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1256	if (!PyUnicode_Check(unicode)) {
				1257	PyErr_BadArgument();
				1258	return NULL;
				1259	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1260	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1261	PyUnicode_GET_SIZE(unicode),
				1262	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1263	}
				1264
				1265	/* --- UTF-16 Codec ------------------------------------------------------- */
				1266
				1267	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1268	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1269	const char *errors,
				1270	const char *details)
				1271	{
				1272	if ((errors == NULL) \|\|
				1273	(strcmp(errors,"strict") == 0)) {
				1274	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1275	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1276	details);
				1277	return -1;
				1278	}
				1279	else if (strcmp(errors,"ignore") == 0) {
				1280	return 0;
				1281	}
				1282	else if (strcmp(errors,"replace") == 0) {
				1283	if (dest) {
				1284	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1285	(*dest)++;
				1286	}
				1287	return 0;
				1288	}
				1289	else {
				1290	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1291	"UTF-16 decoding error; "
				1292	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1293	errors);
				1294	return -1;
				1295	}
				1296	}
				1297
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1298	PyObject *
				1299	PyUnicode_DecodeUTF16(const char *s,
				1300	int size,
				1301	const char *errors,
				1302	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1303	{
				1304	PyUnicodeObject *unicode;
				1305	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1306	const unsigned char q, e;
				1307	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1308	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1309	/* Offsets from q for retrieving byte pairs in the right order. */
				1310	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1311	int ihi = 1, ilo = 0;
				1312	#else
				1313	int ihi = 0, ilo = 1;
				1314	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1315
				1316	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1317	if (size & 1) {
				1318	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1319	return NULL;
				1320	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1321	}
				1322
				1323	/* Note: size will always be longer than the resulting Unicode
				1324	character count */
				1325	unicode = _PyUnicode_New(size);
				1326	if (!unicode)
				1327	return NULL;
				1328	if (size == 0)
				1329	return (PyObject *)unicode;
				1330
				1331	/* Unpack UTF-16 encoded data */
				1332	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1333	q = (unsigned char *)s;
				1334	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1335
				1336	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1337	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1338
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1339	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1340	byte order setting accordingly. In native mode, the leading BOM
				1341	mark is skipped, in all other modes, it is copied to the output
				1342	stream as-is (giving a ZWNBSP character). */
				1343	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1344	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1345	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1346	if (bom == 0xFEFF) {
				1347	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1348	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1349	}
				1350	else if (bom == 0xFFFE) {
				1351	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1352	bo = 1;
				1353	}
				1354	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1355	if (bom == 0xFEFF) {
				1356	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1357	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1358	}
				1359	else if (bom == 0xFFFE) {
				1360	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1361	bo = -1;
				1362	}
				1363	#endif
				1364	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1365
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1366	if (bo == -1) {
				1367	/* force LE */
				1368	ihi = 1;
				1369	ilo = 0;
				1370	}
				1371	else if (bo == 1) {
				1372	/* force BE */
				1373	ihi = 0;
				1374	ilo = 1;
				1375	}
				1376
				1377	while (q < e) {
				1378	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1379	q += 2;
				1380
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1381	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1382	*p++ = ch;
				1383	continue;
				1384	}
				1385
				1386	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1387	if (q >= e) {
				1388	errmsg = "unexpected end of data";
				1389	goto utf16Error;
				1390	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1391	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1392	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1393	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1394	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1395	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1396	*p++ = ch;
				1397	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1398	#else
				1399	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1400	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1401	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1402	}
				1403	else {
				1404	errmsg = "illegal UTF-16 surrogate";
				1405	goto utf16Error;
				1406	}
				1407
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1408	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1409	errmsg = "illegal encoding";
				1410	/* Fall through to report the error */
				1411
				1412	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1413	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1414	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	}
				1416
				1417	if (byteorder)
				1418	*byteorder = bo;
				1419
				1420	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1421	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1422	goto onError;
				1423
				1424	return (PyObject *)unicode;
				1425
				1426	onError:
				1427	Py_DECREF(unicode);
				1428	return NULL;
				1429	}
				1430
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1431	PyObject *
				1432	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1433	int size,
				1434	const char *errors,
				1435	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1436	{
				1437	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1438	unsigned char *p;
				1439	int i, pairs;
				1440	/* Offsets from p for storing byte pairs in the right order. */
				1441	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1442	int ihi = 1, ilo = 0;
				1443	#else
				1444	int ihi = 0, ilo = 1;
				1445	#endif
				1446
				1447	#define STORECHAR(CH) \
				1448	do { \
				1449	p[ihi] = ((CH) >> 8) & 0xff; \
				1450	p[ilo] = (CH) & 0xff; \
				1451	p += 2; \
				1452	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1453
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1454	for (i = pairs = 0; i < size; i++)
				1455	if (s[i] >= 0x10000)
				1456	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1457	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1458	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1459	if (v == NULL)
				1460	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1461
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1462	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1463	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1464	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1465	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1466	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1467
				1468	if (byteorder == -1) {
				1469	/* force LE */
				1470	ihi = 1;
				1471	ilo = 0;
				1472	}
				1473	else if (byteorder == 1) {
				1474	/* force BE */
				1475	ihi = 0;
				1476	ilo = 1;
				1477	}
				1478
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1479	while (size-- > 0) {
				1480	Py_UNICODE ch = *s++;
				1481	Py_UNICODE ch2 = 0;
				1482	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1483	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1484	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1486	STORECHAR(ch);
				1487	if (ch2)
				1488	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1489	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1490	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1491	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1492	}
				1493
				1494	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1495	{
				1496	if (!PyUnicode_Check(unicode)) {
				1497	PyErr_BadArgument();
				1498	return NULL;
				1499	}
				1500	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1501	PyUnicode_GET_SIZE(unicode),
				1502	NULL,
				1503	0);
				1504	}
				1505
				1506	/* --- Unicode Escape Codec ----------------------------------------------- */
				1507
				1508	static
				1509	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1510	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1511	const char *errors,
				1512	const char *details)
				1513	{
				1514	if ((errors == NULL) \|\|
				1515	(strcmp(errors,"strict") == 0)) {
				1516	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1517	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1518	details);
				1519	return -1;
				1520	}
				1521	else if (strcmp(errors,"ignore") == 0) {
				1522	return 0;
				1523	}
				1524	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1525	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1526	return 0;
				1527	}
				1528	else {
				1529	PyErr_Format(PyExc_ValueError,
				1530	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1531	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1532	errors);
				1533	return -1;
				1534	}
				1535	}
				1536
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1537	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1538
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1539	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1540	int size,
				1541	const char *errors)
				1542	{
				1543	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1544	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1545	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1546	char* message;
				1547	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1548
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1549	/* Escaped strings will always be longer than the resulting
				1550	Unicode string, so we start with size here and then reduce the
				1551	length after conversion to the true value. */
				1552	v = _PyUnicode_New(size);
				1553	if (v == NULL)
				1554	goto onError;
				1555	if (size == 0)
				1556	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1557
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1558	p = buf = PyUnicode_AS_UNICODE(v);
				1559	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1560
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1561	while (s < end) {
				1562	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1563	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1564	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1565
				1566	/* Non-escape characters are interpreted as Unicode ordinals */
				1567	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1568	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1569	continue;
				1570	}
				1571
				1572	/* \ - Escapes */
				1573	s++;
				1574	switch (*s++) {
				1575
				1576	/* \x escapes */
				1577	case '\n': break;
				1578	case '\\': *p++ = '\\'; break;
				1579	case '\'': *p++ = '\''; break;
				1580	case '\"': *p++ = '\"'; break;
				1581	case 'b': *p++ = '\b'; break;
				1582	case 'f': p++ = '\014'; break; / FF */
				1583	case 't': *p++ = '\t'; break;
				1584	case 'n': *p++ = '\n'; break;
				1585	case 'r': *p++ = '\r'; break;
				1586	case 'v': p++ = '\013'; break; / VT */
				1587	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1588
				1589	/* \OOO (octal) escapes */
				1590	case '0': case '1': case '2': case '3':
				1591	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1592	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1593	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1594	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1595	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1596	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1597	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1598	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1599	break;
				1600
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1601	/* hex escapes */
				1602	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1603	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1604	digits = 2;
				1605	message = "truncated \\xXX escape";
				1606	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1607
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1608	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1609	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1610	digits = 4;
				1611	message = "truncated \\uXXXX escape";
				1612	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1613
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1614	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1615	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1616	digits = 8;
				1617	message = "truncated \\UXXXXXXXX escape";
				1618	hexescape:
				1619	chr = 0;
				1620	for (i = 0; i < digits; i++) {
				1621	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1622	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1623	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1624	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1625	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1626	i++;
				1627	break;
				1628	}
				1629	chr = (chr<<4) & ~0xF;
				1630	if (c >= '0' && c <= '9')
				1631	chr += c - '0';
				1632	else if (c >= 'a' && c <= 'f')
				1633	chr += 10 + c - 'a';
				1634	else
				1635	chr += 10 + c - 'A';
				1636	}
				1637	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1638	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1639	/* when we get here, chr is a 32-bit unicode character */
				1640	if (chr <= 0xffff)
				1641	/* UCS-2 character */
				1642	*p++ = (Py_UNICODE) chr;
				1643	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1644	/* UCS-4 character. Either store directly, or as
				1645	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1646	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1647	*p++ = chr;
				1648	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1649	chr -= 0x10000L;
				1650	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1651	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1652	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1653	} else {
				1654	if (unicodeescape_decoding_error(
				1655	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1656	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1657	)
				1658	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1659	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1660	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1661	break;
				1662
				1663	/* \N{name} */
				1664	case 'N':
				1665	message = "malformed \\N character escape";
				1666	if (ucnhash_CAPI == NULL) {
				1667	/* load the unicode data module */
				1668	PyObject m, v;
				1669	m = PyImport_ImportModule("unicodedata");
				1670	if (m == NULL)
				1671	goto ucnhashError;
				1672	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1673	Py_DECREF(m);
				1674	if (v == NULL)
				1675	goto ucnhashError;
				1676	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1677	Py_DECREF(v);
				1678	if (ucnhash_CAPI == NULL)
				1679	goto ucnhashError;
				1680	}
				1681	if (*s == '{') {
				1682	const char *start = s+1;
				1683	/* look for the closing brace */
				1684	while (*s != '}' && s < end)
				1685	s++;
				1686	if (s > start && s < end && *s == '}') {
				1687	/* found a name. look it up in the unicode database */
				1688	message = "unknown Unicode character name";
				1689	s++;
				1690	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1691	goto store;
				1692	}
				1693	}
				1694	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1695	goto onError;
				1696	*p++ = x;
				1697	break;
				1698
				1699	default:
				1700	*p++ = '\\';
				1701	*p++ = (unsigned char)s[-1];
				1702	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1703	}
				1704	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1705	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1706	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1707	return (PyObject *)v;
				1708
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1709	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1710	PyErr_SetString(
				1711	PyExc_UnicodeError,
				1712	"\\N escapes not supported (can't load unicodedata module)"
				1713	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1714	return NULL;
				1715
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1716	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1717	Py_XDECREF(v);
				1718	return NULL;
				1719	}
				1720
				1721	/* Return a Unicode-Escape string version of the Unicode object.
				1722
				1723	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1724	appropriate.
				1725
				1726	*/
				1727
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1728	static const Py_UNICODE findchar(const Py_UNICODE s,
				1729	int size,
				1730	Py_UNICODE ch);
				1731
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1732	static
				1733	PyObject unicodeescape_string(const Py_UNICODE s,
				1734	int size,
				1735	int quotes)
				1736	{
				1737	PyObject *repr;
				1738	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1739
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1740	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1741
				1742	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1743	if (repr == NULL)
				1744	return NULL;
				1745
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1746	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1747
				1748	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1749	*p++ = 'u';
				1750	*p++ = (findchar(s, size, '\'') &&
				1751	!findchar(s, size, '"')) ? '"' : '\'';
				1752	}
				1753	while (size-- > 0) {
				1754	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1755
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1756	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1757	if (quotes &&
				1758	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1759	*p++ = '\\';
				1760	*p++ = (char) ch;
Guido van Rossum	ad9744a	2001-09-21 15:38:17 +0000	[diff] [blame^]	1761	continue;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1762	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1763
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1764	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1765	/* Map 21-bit characters to '\U00xxxxxx' */
				1766	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1767	int offset = p - PyString_AS_STRING(repr);
				1768
				1769	/* Resize the string if necessary */
				1770	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1771	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1772	goto onError;
				1773	p = PyString_AS_STRING(repr) + offset;
				1774	}
				1775
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1776	*p++ = '\\';
				1777	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1778	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1779	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1780	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1781	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1782	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1783	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1784	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1785	*p++ = hexdigit[ch & 0x0000000F];
				1786	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1787	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1788	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1789	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1790	else if (ch >= 0xD800 && ch < 0xDC00) {
				1791	Py_UNICODE ch2;
				1792	Py_UCS4 ucs;
				1793
				1794	ch2 = *s++;
				1795	size--;
				1796	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1797	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1798	*p++ = '\\';
				1799	*p++ = 'U';
				1800	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1801	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1802	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1803	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1804	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1805	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1806	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1807	*p++ = hexdigit[ucs & 0x0000000F];
				1808	continue;
				1809	}
				1810	/* Fall through: isolated surrogates are copied as-is */
				1811	s--;
				1812	size++;
				1813	}
				1814
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1815	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1816	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1817	*p++ = '\\';
				1818	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1819	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1820	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1821	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1822	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1823	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1824
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1825	/* Map special whitespace to '\t', \n', '\r' */
				1826	else if (ch == '\t') {
				1827	*p++ = '\\';
				1828	*p++ = 't';
				1829	}
				1830	else if (ch == '\n') {
				1831	*p++ = '\\';
				1832	*p++ = 'n';
				1833	}
				1834	else if (ch == '\r') {
				1835	*p++ = '\\';
				1836	*p++ = 'r';
				1837	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1838
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1839	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1840	else if (ch < ' ' \|\| ch >= 128) {
				1841	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1842	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1843	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1844	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1845	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1846
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1847	/* Copy everything else as-is */
				1848	else
				1849	*p++ = (char) ch;
				1850	}
				1851	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1852	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1853
				1854	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1855	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1856	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1857
				1858	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1859
				1860	onError:
				1861	Py_DECREF(repr);
				1862	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1863	}
				1864
				1865	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1866	int size)
				1867	{
				1868	return unicodeescape_string(s, size, 0);
				1869	}
				1870
				1871	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1872	{
				1873	if (!PyUnicode_Check(unicode)) {
				1874	PyErr_BadArgument();
				1875	return NULL;
				1876	}
				1877	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1878	PyUnicode_GET_SIZE(unicode));
				1879	}
				1880
				1881	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1882
				1883	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1884	int size,
				1885	const char *errors)
				1886	{
				1887	PyUnicodeObject *v;
				1888	Py_UNICODE p, buf;
				1889	const char *end;
				1890	const char *bs;
				1891
				1892	/* Escaped strings will always be longer than the resulting
				1893	Unicode string, so we start with size here and then reduce the
				1894	length after conversion to the true value. */
				1895	v = _PyUnicode_New(size);
				1896	if (v == NULL)
				1897	goto onError;
				1898	if (size == 0)
				1899	return (PyObject *)v;
				1900	p = buf = PyUnicode_AS_UNICODE(v);
				1901	end = s + size;
				1902	while (s < end) {
				1903	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1904	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1905	int i;
				1906
				1907	/* Non-escape characters are interpreted as Unicode ordinals */
				1908	if (*s != '\\') {
				1909	p++ = (unsigned char)s++;
				1910	continue;
				1911	}
				1912
				1913	/* \u-escapes are only interpreted iff the number of leading
				1914	backslashes if odd */
				1915	bs = s;
				1916	for (;s < end;) {
				1917	if (*s != '\\')
				1918	break;
				1919	p++ = (unsigned char)s++;
				1920	}
				1921	if (((s - bs) & 1) == 0 \|\|
				1922	s >= end \|\|
				1923	*s != 'u') {
				1924	continue;
				1925	}
				1926	p--;
				1927	s++;
				1928
				1929	/* \uXXXX with 4 hex digits */
				1930	for (x = 0, i = 0; i < 4; i++) {
				1931	c = (unsigned char)s[i];
				1932	if (!isxdigit(c)) {
				1933	if (unicodeescape_decoding_error(&s, &x, errors,
				1934	"truncated \\uXXXX"))
				1935	goto onError;
				1936	i++;
				1937	break;
				1938	}
				1939	x = (x<<4) & ~0xF;
				1940	if (c >= '0' && c <= '9')
				1941	x += c - '0';
				1942	else if (c >= 'a' && c <= 'f')
				1943	x += 10 + c - 'a';
				1944	else
				1945	x += 10 + c - 'A';
				1946	}
				1947	s += i;
				1948	*p++ = x;
				1949	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1950	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1951	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1952	return (PyObject *)v;
				1953
				1954	onError:
				1955	Py_XDECREF(v);
				1956	return NULL;
				1957	}
				1958
				1959	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1960	int size)
				1961	{
				1962	PyObject *repr;
				1963	char *p;
				1964	char *q;
				1965
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1966	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1967
				1968	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1969	if (repr == NULL)
				1970	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1971	if (size == 0)
				1972	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1973
				1974	p = q = PyString_AS_STRING(repr);
				1975	while (size-- > 0) {
				1976	Py_UNICODE ch = *s++;
				1977	/* Map 16-bit characters to '\uxxxx' */
				1978	if (ch >= 256) {
				1979	*p++ = '\\';
				1980	*p++ = 'u';
				1981	*p++ = hexdigit[(ch >> 12) & 0xf];
				1982	*p++ = hexdigit[(ch >> 8) & 0xf];
				1983	*p++ = hexdigit[(ch >> 4) & 0xf];
				1984	*p++ = hexdigit[ch & 15];
				1985	}
				1986	/* Copy everything else as-is */
				1987	else
				1988	*p++ = (char) ch;
				1989	}
				1990	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1991	if (_PyString_Resize(&repr, p - q))
				1992	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1993
				1994	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1995
				1996	onError:
				1997	Py_DECREF(repr);
				1998	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1999	}
				2000
				2001	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				2002	{
				2003	if (!PyUnicode_Check(unicode)) {
				2004	PyErr_BadArgument();
				2005	return NULL;
				2006	}
				2007	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				2008	PyUnicode_GET_SIZE(unicode));
				2009	}
				2010
				2011	/* --- Latin-1 Codec ------------------------------------------------------ */
				2012
				2013	PyObject PyUnicode_DecodeLatin1(const char s,
				2014	int size,
				2015	const char *errors)
				2016	{
				2017	PyUnicodeObject *v;
				2018	Py_UNICODE *p;
				2019
				2020	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2021	if (size == 1 && (unsigned char)s < 256) {
				2022	Py_UNICODE r = (unsigned char)s;
				2023	return PyUnicode_FromUnicode(&r, 1);
				2024	}
				2025
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2026	v = _PyUnicode_New(size);
				2027	if (v == NULL)
				2028	goto onError;
				2029	if (size == 0)
				2030	return (PyObject *)v;
				2031	p = PyUnicode_AS_UNICODE(v);
				2032	while (size-- > 0)
				2033	p++ = (unsigned char)s++;
				2034	return (PyObject *)v;
				2035
				2036	onError:
				2037	Py_XDECREF(v);
				2038	return NULL;
				2039	}
				2040
				2041	static
				2042	int latin1_encoding_error(const Py_UNICODE **source,
				2043	char **dest,
				2044	const char *errors,
				2045	const char *details)
				2046	{
				2047	if ((errors == NULL) \|\|
				2048	(strcmp(errors,"strict") == 0)) {
				2049	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2050	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2051	details);
				2052	return -1;
				2053	}
				2054	else if (strcmp(errors,"ignore") == 0) {
				2055	return 0;
				2056	}
				2057	else if (strcmp(errors,"replace") == 0) {
				2058	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2059	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2060	return 0;
				2061	}
				2062	else {
				2063	PyErr_Format(PyExc_ValueError,
				2064	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2065	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2066	errors);
				2067	return -1;
				2068	}
				2069	}
				2070
				2071	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2072	int size,
				2073	const char *errors)
				2074	{
				2075	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2076	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2077
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2078	repr = PyString_FromStringAndSize(NULL, size);
				2079	if (repr == NULL)
				2080	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2081	if (size == 0)
				2082	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2083
				2084	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2085	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2086	while (size-- > 0) {
				2087	Py_UNICODE ch = *p++;
				2088	if (ch >= 256) {
				2089	if (latin1_encoding_error(&p, &s, errors,
				2090	"ordinal not in range(256)"))
				2091	goto onError;
				2092	}
				2093	else
				2094	*s++ = (char)ch;
				2095	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2096	/* Resize if error handling skipped some characters */
				2097	if (s - start < PyString_GET_SIZE(repr))
				2098	if (_PyString_Resize(&repr, s - start))
				2099	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2100	return repr;
				2101
				2102	onError:
				2103	Py_DECREF(repr);
				2104	return NULL;
				2105	}
				2106
				2107	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2108	{
				2109	if (!PyUnicode_Check(unicode)) {
				2110	PyErr_BadArgument();
				2111	return NULL;
				2112	}
				2113	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2114	PyUnicode_GET_SIZE(unicode),
				2115	NULL);
				2116	}
				2117
				2118	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2119
				2120	static
				2121	int ascii_decoding_error(const char **source,
				2122	Py_UNICODE **dest,
				2123	const char *errors,
				2124	const char *details)
				2125	{
				2126	if ((errors == NULL) \|\|
				2127	(strcmp(errors,"strict") == 0)) {
				2128	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2129	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2130	details);
				2131	return -1;
				2132	}
				2133	else if (strcmp(errors,"ignore") == 0) {
				2134	return 0;
				2135	}
				2136	else if (strcmp(errors,"replace") == 0) {
				2137	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2138	(*dest)++;
				2139	return 0;
				2140	}
				2141	else {
				2142	PyErr_Format(PyExc_ValueError,
				2143	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2144	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2145	errors);
				2146	return -1;
				2147	}
				2148	}
				2149
				2150	PyObject PyUnicode_DecodeASCII(const char s,
				2151	int size,
				2152	const char *errors)
				2153	{
				2154	PyUnicodeObject *v;
				2155	Py_UNICODE *p;
				2156
				2157	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2158	if (size == 1 && (unsigned char)s < 128) {
				2159	Py_UNICODE r = (unsigned char)s;
				2160	return PyUnicode_FromUnicode(&r, 1);
				2161	}
				2162
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2163	v = _PyUnicode_New(size);
				2164	if (v == NULL)
				2165	goto onError;
				2166	if (size == 0)
				2167	return (PyObject *)v;
				2168	p = PyUnicode_AS_UNICODE(v);
				2169	while (size-- > 0) {
				2170	register unsigned char c;
				2171
				2172	c = (unsigned char)*s++;
				2173	if (c < 128)
				2174	*p++ = c;
				2175	else if (ascii_decoding_error(&s, &p, errors,
				2176	"ordinal not in range(128)"))
				2177	goto onError;
				2178	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2179	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2180	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2181	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2182	return (PyObject *)v;
				2183
				2184	onError:
				2185	Py_XDECREF(v);
				2186	return NULL;
				2187	}
				2188
				2189	static
				2190	int ascii_encoding_error(const Py_UNICODE **source,
				2191	char **dest,
				2192	const char *errors,
				2193	const char *details)
				2194	{
				2195	if ((errors == NULL) \|\|
				2196	(strcmp(errors,"strict") == 0)) {
				2197	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2198	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2199	details);
				2200	return -1;
				2201	}
				2202	else if (strcmp(errors,"ignore") == 0) {
				2203	return 0;
				2204	}
				2205	else if (strcmp(errors,"replace") == 0) {
				2206	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2207	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2208	return 0;
				2209	}
				2210	else {
				2211	PyErr_Format(PyExc_ValueError,
				2212	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2213	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2214	errors);
				2215	return -1;
				2216	}
				2217	}
				2218
				2219	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2220	int size,
				2221	const char *errors)
				2222	{
				2223	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2224	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2225
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2226	repr = PyString_FromStringAndSize(NULL, size);
				2227	if (repr == NULL)
				2228	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2229	if (size == 0)
				2230	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2231
				2232	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2233	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2234	while (size-- > 0) {
				2235	Py_UNICODE ch = *p++;
				2236	if (ch >= 128) {
				2237	if (ascii_encoding_error(&p, &s, errors,
				2238	"ordinal not in range(128)"))
				2239	goto onError;
				2240	}
				2241	else
				2242	*s++ = (char)ch;
				2243	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2244	/* Resize if error handling skipped some characters */
				2245	if (s - start < PyString_GET_SIZE(repr))
				2246	if (_PyString_Resize(&repr, s - start))
				2247	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2248	return repr;
				2249
				2250	onError:
				2251	Py_DECREF(repr);
				2252	return NULL;
				2253	}
				2254
				2255	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2256	{
				2257	if (!PyUnicode_Check(unicode)) {
				2258	PyErr_BadArgument();
				2259	return NULL;
				2260	}
				2261	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2262	PyUnicode_GET_SIZE(unicode),
				2263	NULL);
				2264	}
				2265
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2266	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2267
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2268	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2269
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2270	PyObject PyUnicode_DecodeMBCS(const char s,
				2271	int size,
				2272	const char *errors)
				2273	{
				2274	PyUnicodeObject *v;
				2275	Py_UNICODE *p;
				2276
				2277	/* First get the size of the result */
				2278	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2279	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2280	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2281
				2282	v = _PyUnicode_New(usize);
				2283	if (v == NULL)
				2284	return NULL;
				2285	if (usize == 0)
				2286	return (PyObject *)v;
				2287	p = PyUnicode_AS_UNICODE(v);
				2288	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2289	Py_DECREF(v);
				2290	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2291	}
				2292
				2293	return (PyObject *)v;
				2294	}
				2295
				2296	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2297	int size,
				2298	const char *errors)
				2299	{
				2300	PyObject *repr;
				2301	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2302	DWORD mbcssize;
				2303
				2304	/* If there are no characters, bail now! */
				2305	if (size==0)
				2306	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2307
				2308	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2309	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2310	if (mbcssize==0)
				2311	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2312
				2313	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2314	if (repr == NULL)
				2315	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2316	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2317	return repr;
				2318
				2319	/* Do the conversion */
				2320	s = PyString_AS_STRING(repr);
				2321	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2322	Py_DECREF(repr);
				2323	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2324	}
				2325	return repr;
				2326	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2327
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2328	#endif /* MS_WIN32 */
				2329
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2330	/* --- Character Mapping Codec -------------------------------------------- */
				2331
				2332	static
				2333	int charmap_decoding_error(const char **source,
				2334	Py_UNICODE **dest,
				2335	const char *errors,
				2336	const char *details)
				2337	{
				2338	if ((errors == NULL) \|\|
				2339	(strcmp(errors,"strict") == 0)) {
				2340	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2341	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2342	details);
				2343	return -1;
				2344	}
				2345	else if (strcmp(errors,"ignore") == 0) {
				2346	return 0;
				2347	}
				2348	else if (strcmp(errors,"replace") == 0) {
				2349	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2350	(*dest)++;
				2351	return 0;
				2352	}
				2353	else {
				2354	PyErr_Format(PyExc_ValueError,
				2355	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2356	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2357	errors);
				2358	return -1;
				2359	}
				2360	}
				2361
				2362	PyObject PyUnicode_DecodeCharmap(const char s,
				2363	int size,
				2364	PyObject *mapping,
				2365	const char *errors)
				2366	{
				2367	PyUnicodeObject *v;
				2368	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2369	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2370
				2371	/* Default to Latin-1 */
				2372	if (mapping == NULL)
				2373	return PyUnicode_DecodeLatin1(s, size, errors);
				2374
				2375	v = _PyUnicode_New(size);
				2376	if (v == NULL)
				2377	goto onError;
				2378	if (size == 0)
				2379	return (PyObject *)v;
				2380	p = PyUnicode_AS_UNICODE(v);
				2381	while (size-- > 0) {
				2382	unsigned char ch = *s++;
				2383	PyObject w, x;
				2384
				2385	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2386	w = PyInt_FromLong((long)ch);
				2387	if (w == NULL)
				2388	goto onError;
				2389	x = PyObject_GetItem(mapping, w);
				2390	Py_DECREF(w);
				2391	if (x == NULL) {
				2392	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2393	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2394	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2395	x = Py_None;
				2396	Py_INCREF(x);
				2397	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2398	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2399	}
				2400
				2401	/* Apply mapping */
				2402	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2403	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2404	if (value < 0 \|\| value > 65535) {
				2405	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2406	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2407	Py_DECREF(x);
				2408	goto onError;
				2409	}
				2410	*p++ = (Py_UNICODE)value;
				2411	}
				2412	else if (x == Py_None) {
				2413	/* undefined mapping */
				2414	if (charmap_decoding_error(&s, &p, errors,
				2415	"character maps to <undefined>")) {
				2416	Py_DECREF(x);
				2417	goto onError;
				2418	}
				2419	}
				2420	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2421	int targetsize = PyUnicode_GET_SIZE(x);
				2422
				2423	if (targetsize == 1)
				2424	/* 1-1 mapping */
				2425	p++ = PyUnicode_AS_UNICODE(x);
				2426
				2427	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2428	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2429	if (targetsize > extrachars) {
				2430	/* resize first */
				2431	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2432	int needed = (targetsize - extrachars) + \
				2433	(targetsize << 2);
				2434	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2435	if (_PyUnicode_Resize(&v,
				2436	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2437	Py_DECREF(x);
				2438	goto onError;
				2439	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2440	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2441	}
				2442	Py_UNICODE_COPY(p,
				2443	PyUnicode_AS_UNICODE(x),
				2444	targetsize);
				2445	p += targetsize;
				2446	extrachars -= targetsize;
				2447	}
				2448	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2449	}
				2450	else {
				2451	/* wrong return value */
				2452	PyErr_SetString(PyExc_TypeError,
				2453	"character mapping must return integer, None or unicode");
				2454	Py_DECREF(x);
				2455	goto onError;
				2456	}
				2457	Py_DECREF(x);
				2458	}
				2459	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2460	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2461	goto onError;
				2462	return (PyObject *)v;
				2463
				2464	onError:
				2465	Py_XDECREF(v);
				2466	return NULL;
				2467	}
				2468
				2469	static
				2470	int charmap_encoding_error(const Py_UNICODE **source,
				2471	char **dest,
				2472	const char *errors,
				2473	const char *details)
				2474	{
				2475	if ((errors == NULL) \|\|
				2476	(strcmp(errors,"strict") == 0)) {
				2477	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2478	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2479	details);
				2480	return -1;
				2481	}
				2482	else if (strcmp(errors,"ignore") == 0) {
				2483	return 0;
				2484	}
				2485	else if (strcmp(errors,"replace") == 0) {
				2486	**dest = '?';
				2487	(*dest)++;
				2488	return 0;
				2489	}
				2490	else {
				2491	PyErr_Format(PyExc_ValueError,
				2492	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2493	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2494	errors);
				2495	return -1;
				2496	}
				2497	}
				2498
				2499	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2500	int size,
				2501	PyObject *mapping,
				2502	const char *errors)
				2503	{
				2504	PyObject *v;
				2505	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2506	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2507
				2508	/* Default to Latin-1 */
				2509	if (mapping == NULL)
				2510	return PyUnicode_EncodeLatin1(p, size, errors);
				2511
				2512	v = PyString_FromStringAndSize(NULL, size);
				2513	if (v == NULL)
				2514	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2515	if (size == 0)
				2516	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2517	s = PyString_AS_STRING(v);
				2518	while (size-- > 0) {
				2519	Py_UNICODE ch = *p++;
				2520	PyObject w, x;
				2521
				2522	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2523	w = PyInt_FromLong((long)ch);
				2524	if (w == NULL)
				2525	goto onError;
				2526	x = PyObject_GetItem(mapping, w);
				2527	Py_DECREF(w);
				2528	if (x == NULL) {
				2529	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2530	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2531	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2532	x = Py_None;
				2533	Py_INCREF(x);
				2534	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2535	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2536	}
				2537
				2538	/* Apply mapping */
				2539	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2540	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2541	if (value < 0 \|\| value > 255) {
				2542	PyErr_SetString(PyExc_TypeError,
				2543	"character mapping must be in range(256)");
				2544	Py_DECREF(x);
				2545	goto onError;
				2546	}
				2547	*s++ = (char)value;
				2548	}
				2549	else if (x == Py_None) {
				2550	/* undefined mapping */
				2551	if (charmap_encoding_error(&p, &s, errors,
				2552	"character maps to <undefined>")) {
				2553	Py_DECREF(x);
				2554	goto onError;
				2555	}
				2556	}
				2557	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2558	int targetsize = PyString_GET_SIZE(x);
				2559
				2560	if (targetsize == 1)
				2561	/* 1-1 mapping */
				2562	s++ = PyString_AS_STRING(x);
				2563
				2564	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2565	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2566	if (targetsize > extrachars) {
				2567	/* resize first */
				2568	int oldpos = (int)(s - PyString_AS_STRING(v));
				2569	int needed = (targetsize - extrachars) + \
				2570	(targetsize << 2);
				2571	extrachars += needed;
				2572	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2573	Py_DECREF(x);
				2574	goto onError;
				2575	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2576	s = PyString_AS_STRING(v) + oldpos;
				2577	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2578	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2579	s += targetsize;
				2580	extrachars -= targetsize;
				2581	}
				2582	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2583	}
				2584	else {
				2585	/* wrong return value */
				2586	PyErr_SetString(PyExc_TypeError,
				2587	"character mapping must return integer, None or unicode");
				2588	Py_DECREF(x);
				2589	goto onError;
				2590	}
				2591	Py_DECREF(x);
				2592	}
				2593	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2594	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2595	goto onError;
				2596	return v;
				2597
				2598	onError:
				2599	Py_DECREF(v);
				2600	return NULL;
				2601	}
				2602
				2603	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2604	PyObject *mapping)
				2605	{
				2606	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2607	PyErr_BadArgument();
				2608	return NULL;
				2609	}
				2610	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2611	PyUnicode_GET_SIZE(unicode),
				2612	mapping,
				2613	NULL);
				2614	}
				2615
				2616	static
				2617	int translate_error(const Py_UNICODE **source,
				2618	Py_UNICODE **dest,
				2619	const char *errors,
				2620	const char *details)
				2621	{
				2622	if ((errors == NULL) \|\|
				2623	(strcmp(errors,"strict") == 0)) {
				2624	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2625	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2626	details);
				2627	return -1;
				2628	}
				2629	else if (strcmp(errors,"ignore") == 0) {
				2630	return 0;
				2631	}
				2632	else if (strcmp(errors,"replace") == 0) {
				2633	**dest = '?';
				2634	(*dest)++;
				2635	return 0;
				2636	}
				2637	else {
				2638	PyErr_Format(PyExc_ValueError,
				2639	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2640	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2641	errors);
				2642	return -1;
				2643	}
				2644	}
				2645
				2646	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2647	int size,
				2648	PyObject *mapping,
				2649	const char *errors)
				2650	{
				2651	PyUnicodeObject *v;
				2652	Py_UNICODE *p;
				2653
				2654	if (mapping == NULL) {
				2655	PyErr_BadArgument();
				2656	return NULL;
				2657	}
				2658
				2659	/* Output will never be longer than input */
				2660	v = _PyUnicode_New(size);
				2661	if (v == NULL)
				2662	goto onError;
				2663	if (size == 0)
				2664	goto done;
				2665	p = PyUnicode_AS_UNICODE(v);
				2666	while (size-- > 0) {
				2667	Py_UNICODE ch = *s++;
				2668	PyObject w, x;
				2669
				2670	/* Get mapping */
				2671	w = PyInt_FromLong(ch);
				2672	if (w == NULL)
				2673	goto onError;
				2674	x = PyObject_GetItem(mapping, w);
				2675	Py_DECREF(w);
				2676	if (x == NULL) {
				2677	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2678	/* No mapping found: default to 1-1 mapping */
				2679	PyErr_Clear();
				2680	*p++ = ch;
				2681	continue;
				2682	}
				2683	goto onError;
				2684	}
				2685
				2686	/* Apply mapping */
				2687	if (PyInt_Check(x))
				2688	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2689	else if (x == Py_None) {
				2690	/* undefined mapping */
				2691	if (translate_error(&s, &p, errors,
				2692	"character maps to <undefined>")) {
				2693	Py_DECREF(x);
				2694	goto onError;
				2695	}
				2696	}
				2697	else if (PyUnicode_Check(x)) {
				2698	if (PyUnicode_GET_SIZE(x) != 1) {
				2699	/* 1-n mapping */
				2700	PyErr_SetString(PyExc_NotImplementedError,
				2701	"1-n mappings are currently not implemented");
				2702	Py_DECREF(x);
				2703	goto onError;
				2704	}
				2705	p++ = PyUnicode_AS_UNICODE(x);
				2706	}
				2707	else {
				2708	/* wrong return value */
				2709	PyErr_SetString(PyExc_TypeError,
				2710	"translate mapping must return integer, None or unicode");
				2711	Py_DECREF(x);
				2712	goto onError;
				2713	}
				2714	Py_DECREF(x);
				2715	}
				2716	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2717	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2718	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2719
				2720	done:
				2721	return (PyObject *)v;
				2722
				2723	onError:
				2724	Py_XDECREF(v);
				2725	return NULL;
				2726	}
				2727
				2728	PyObject PyUnicode_Translate(PyObject str,
				2729	PyObject *mapping,
				2730	const char *errors)
				2731	{
				2732	PyObject *result;
				2733
				2734	str = PyUnicode_FromObject(str);
				2735	if (str == NULL)
				2736	goto onError;
				2737	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2738	PyUnicode_GET_SIZE(str),
				2739	mapping,
				2740	errors);
				2741	Py_DECREF(str);
				2742	return result;
				2743
				2744	onError:
				2745	Py_XDECREF(str);
				2746	return NULL;
				2747	}
				2748
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2749	/* --- Decimal Encoder ---------------------------------------------------- */
				2750
				2751	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2752	int length,
				2753	char *output,
				2754	const char *errors)
				2755	{
				2756	Py_UNICODE p, end;
				2757
				2758	if (output == NULL) {
				2759	PyErr_BadArgument();
				2760	return -1;
				2761	}
				2762
				2763	p = s;
				2764	end = s + length;
				2765	while (p < end) {
				2766	register Py_UNICODE ch = *p++;
				2767	int decimal;
				2768
				2769	if (Py_UNICODE_ISSPACE(ch)) {
				2770	*output++ = ' ';
				2771	continue;
				2772	}
				2773	decimal = Py_UNICODE_TODECIMAL(ch);
				2774	if (decimal >= 0) {
				2775	*output++ = '0' + decimal;
				2776	continue;
				2777	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2778	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2779	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2780	continue;
				2781	}
				2782	/* All other characters are considered invalid */
				2783	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2784	PyErr_SetString(PyExc_ValueError,
				2785	"invalid decimal Unicode string");
				2786	goto onError;
				2787	}
				2788	else if (strcmp(errors, "ignore") == 0)
				2789	continue;
				2790	else if (strcmp(errors, "replace") == 0) {
				2791	*output++ = '?';
				2792	continue;
				2793	}
				2794	}
				2795	/* 0-terminate the output string */
				2796	*output++ = '\0';
				2797	return 0;
				2798
				2799	onError:
				2800	return -1;
				2801	}
				2802
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2803	/* --- Helpers ------------------------------------------------------------ */
				2804
				2805	static
				2806	int count(PyUnicodeObject *self,
				2807	int start,
				2808	int end,
				2809	PyUnicodeObject *substring)
				2810	{
				2811	int count = 0;
				2812
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2813	if (start < 0)
				2814	start += self->length;
				2815	if (start < 0)
				2816	start = 0;
				2817	if (end > self->length)
				2818	end = self->length;
				2819	if (end < 0)
				2820	end += self->length;
				2821	if (end < 0)
				2822	end = 0;
				2823
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2824	if (substring->length == 0)
				2825	return (end - start + 1);
				2826
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2827	end -= substring->length;
				2828
				2829	while (start <= end)
				2830	if (Py_UNICODE_MATCH(self, start, substring)) {
				2831	count++;
				2832	start += substring->length;
				2833	} else
				2834	start++;
				2835
				2836	return count;
				2837	}
				2838
				2839	int PyUnicode_Count(PyObject *str,
				2840	PyObject *substr,
				2841	int start,
				2842	int end)
				2843	{
				2844	int result;
				2845
				2846	str = PyUnicode_FromObject(str);
				2847	if (str == NULL)
				2848	return -1;
				2849	substr = PyUnicode_FromObject(substr);
				2850	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2851	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2852	return -1;
				2853	}
				2854
				2855	result = count((PyUnicodeObject *)str,
				2856	start, end,
				2857	(PyUnicodeObject *)substr);
				2858
				2859	Py_DECREF(str);
				2860	Py_DECREF(substr);
				2861	return result;
				2862	}
				2863
				2864	static
				2865	int findstring(PyUnicodeObject *self,
				2866	PyUnicodeObject *substring,
				2867	int start,
				2868	int end,
				2869	int direction)
				2870	{
				2871	if (start < 0)
				2872	start += self->length;
				2873	if (start < 0)
				2874	start = 0;
				2875
				2876	if (substring->length == 0)
				2877	return start;
				2878
				2879	if (end > self->length)
				2880	end = self->length;
				2881	if (end < 0)
				2882	end += self->length;
				2883	if (end < 0)
				2884	end = 0;
				2885
				2886	end -= substring->length;
				2887
				2888	if (direction < 0) {
				2889	for (; end >= start; end--)
				2890	if (Py_UNICODE_MATCH(self, end, substring))
				2891	return end;
				2892	} else {
				2893	for (; start <= end; start++)
				2894	if (Py_UNICODE_MATCH(self, start, substring))
				2895	return start;
				2896	}
				2897
				2898	return -1;
				2899	}
				2900
				2901	int PyUnicode_Find(PyObject *str,
				2902	PyObject *substr,
				2903	int start,
				2904	int end,
				2905	int direction)
				2906	{
				2907	int result;
				2908
				2909	str = PyUnicode_FromObject(str);
				2910	if (str == NULL)
				2911	return -1;
				2912	substr = PyUnicode_FromObject(substr);
				2913	if (substr == NULL) {
				2914	Py_DECREF(substr);
				2915	return -1;
				2916	}
				2917
				2918	result = findstring((PyUnicodeObject *)str,
				2919	(PyUnicodeObject *)substr,
				2920	start, end, direction);
				2921	Py_DECREF(str);
				2922	Py_DECREF(substr);
				2923	return result;
				2924	}
				2925
				2926	static
				2927	int tailmatch(PyUnicodeObject *self,
				2928	PyUnicodeObject *substring,
				2929	int start,
				2930	int end,
				2931	int direction)
				2932	{
				2933	if (start < 0)
				2934	start += self->length;
				2935	if (start < 0)
				2936	start = 0;
				2937
				2938	if (substring->length == 0)
				2939	return 1;
				2940
				2941	if (end > self->length)
				2942	end = self->length;
				2943	if (end < 0)
				2944	end += self->length;
				2945	if (end < 0)
				2946	end = 0;
				2947
				2948	end -= substring->length;
				2949	if (end < start)
				2950	return 0;
				2951
				2952	if (direction > 0) {
				2953	if (Py_UNICODE_MATCH(self, end, substring))
				2954	return 1;
				2955	} else {
				2956	if (Py_UNICODE_MATCH(self, start, substring))
				2957	return 1;
				2958	}
				2959
				2960	return 0;
				2961	}
				2962
				2963	int PyUnicode_Tailmatch(PyObject *str,
				2964	PyObject *substr,
				2965	int start,
				2966	int end,
				2967	int direction)
				2968	{
				2969	int result;
				2970
				2971	str = PyUnicode_FromObject(str);
				2972	if (str == NULL)
				2973	return -1;
				2974	substr = PyUnicode_FromObject(substr);
				2975	if (substr == NULL) {
				2976	Py_DECREF(substr);
				2977	return -1;
				2978	}
				2979
				2980	result = tailmatch((PyUnicodeObject *)str,
				2981	(PyUnicodeObject *)substr,
				2982	start, end, direction);
				2983	Py_DECREF(str);
				2984	Py_DECREF(substr);
				2985	return result;
				2986	}
				2987
				2988	static
				2989	const Py_UNICODE findchar(const Py_UNICODE s,
				2990	int size,
				2991	Py_UNICODE ch)
				2992	{
				2993	/* like wcschr, but doesn't stop at NULL characters */
				2994
				2995	while (size-- > 0) {
				2996	if (*s == ch)
				2997	return s;
				2998	s++;
				2999	}
				3000
				3001	return NULL;
				3002	}
				3003
				3004	/* Apply fixfct filter to the Unicode object self and return a
				3005	reference to the modified object */
				3006
				3007	static
				3008	PyObject fixup(PyUnicodeObject self,
				3009	int (fixfct)(PyUnicodeObject s))
				3010	{
				3011
				3012	PyUnicodeObject *u;
				3013
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3014	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3015	if (u == NULL)
				3016	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3017
				3018	Py_UNICODE_COPY(u->str, self->str, self->length);
				3019
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3020	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3021	/* fixfct should return TRUE if it modified the buffer. If
				3022	FALSE, return a reference to the original buffer instead
				3023	(to save space, not time) */
				3024	Py_INCREF(self);
				3025	Py_DECREF(u);
				3026	return (PyObject*) self;
				3027	}
				3028	return (PyObject*) u;
				3029	}
				3030
				3031	static
				3032	int fixupper(PyUnicodeObject *self)
				3033	{
				3034	int len = self->length;
				3035	Py_UNICODE *s = self->str;
				3036	int status = 0;
				3037
				3038	while (len-- > 0) {
				3039	register Py_UNICODE ch;
				3040
				3041	ch = Py_UNICODE_TOUPPER(*s);
				3042	if (ch != *s) {
				3043	status = 1;
				3044	*s = ch;
				3045	}
				3046	s++;
				3047	}
				3048
				3049	return status;
				3050	}
				3051
				3052	static
				3053	int fixlower(PyUnicodeObject *self)
				3054	{
				3055	int len = self->length;
				3056	Py_UNICODE *s = self->str;
				3057	int status = 0;
				3058
				3059	while (len-- > 0) {
				3060	register Py_UNICODE ch;
				3061
				3062	ch = Py_UNICODE_TOLOWER(*s);
				3063	if (ch != *s) {
				3064	status = 1;
				3065	*s = ch;
				3066	}
				3067	s++;
				3068	}
				3069
				3070	return status;
				3071	}
				3072
				3073	static
				3074	int fixswapcase(PyUnicodeObject *self)
				3075	{
				3076	int len = self->length;
				3077	Py_UNICODE *s = self->str;
				3078	int status = 0;
				3079
				3080	while (len-- > 0) {
				3081	if (Py_UNICODE_ISUPPER(*s)) {
				3082	s = Py_UNICODE_TOLOWER(s);
				3083	status = 1;
				3084	} else if (Py_UNICODE_ISLOWER(*s)) {
				3085	s = Py_UNICODE_TOUPPER(s);
				3086	status = 1;
				3087	}
				3088	s++;
				3089	}
				3090
				3091	return status;
				3092	}
				3093
				3094	static
				3095	int fixcapitalize(PyUnicodeObject *self)
				3096	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3097	int len = self->length;
				3098	Py_UNICODE *s = self->str;
				3099	int status = 0;
				3100
				3101	if (len == 0)
				3102	return 0;
				3103	if (Py_UNICODE_ISLOWER(*s)) {
				3104	s = Py_UNICODE_TOUPPER(s);
				3105	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3106	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3107	s++;
				3108	while (--len > 0) {
				3109	if (Py_UNICODE_ISUPPER(*s)) {
				3110	s = Py_UNICODE_TOLOWER(s);
				3111	status = 1;
				3112	}
				3113	s++;
				3114	}
				3115	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3116	}
				3117
				3118	static
				3119	int fixtitle(PyUnicodeObject *self)
				3120	{
				3121	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3122	register Py_UNICODE *e;
				3123	int previous_is_cased;
				3124
				3125	/* Shortcut for single character strings */
				3126	if (PyUnicode_GET_SIZE(self) == 1) {
				3127	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3128	if (*p != ch) {
				3129	*p = ch;
				3130	return 1;
				3131	}
				3132	else
				3133	return 0;
				3134	}
				3135
				3136	e = p + PyUnicode_GET_SIZE(self);
				3137	previous_is_cased = 0;
				3138	for (; p < e; p++) {
				3139	register const Py_UNICODE ch = *p;
				3140
				3141	if (previous_is_cased)
				3142	*p = Py_UNICODE_TOLOWER(ch);
				3143	else
				3144	*p = Py_UNICODE_TOTITLE(ch);
				3145
				3146	if (Py_UNICODE_ISLOWER(ch) \|\|
				3147	Py_UNICODE_ISUPPER(ch) \|\|
				3148	Py_UNICODE_ISTITLE(ch))
				3149	previous_is_cased = 1;
				3150	else
				3151	previous_is_cased = 0;
				3152	}
				3153	return 1;
				3154	}
				3155
				3156	PyObject PyUnicode_Join(PyObject separator,
				3157	PyObject *seq)
				3158	{
				3159	Py_UNICODE *sep;
				3160	int seplen;
				3161	PyUnicodeObject *res = NULL;
				3162	int reslen = 0;
				3163	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3164	int sz = 100;
				3165	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3166	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3167
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3168	it = PyObject_GetIter(seq);
				3169	if (it == NULL)
				3170	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3171
				3172	if (separator == NULL) {
				3173	Py_UNICODE blank = ' ';
				3174	sep = &blank;
				3175	seplen = 1;
				3176	}
				3177	else {
				3178	separator = PyUnicode_FromObject(separator);
				3179	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3180	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3181	sep = PyUnicode_AS_UNICODE(separator);
				3182	seplen = PyUnicode_GET_SIZE(separator);
				3183	}
				3184
				3185	res = _PyUnicode_New(sz);
				3186	if (res == NULL)
				3187	goto onError;
				3188	p = PyUnicode_AS_UNICODE(res);
				3189	reslen = 0;
				3190
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3191	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3192	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3193	PyObject *item = PyIter_Next(it);
				3194	if (item == NULL) {
				3195	if (PyErr_Occurred())
				3196	goto onError;
				3197	break;
				3198	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3199	if (!PyUnicode_Check(item)) {
				3200	PyObject *v;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3201	if (!PyString_Check(item)) {
				3202	PyErr_Format(PyExc_TypeError,
				3203	"sequence item %i: expected string or Unicode,"
				3204	" %.80s found",
				3205	i, item->ob_type->tp_name);
				3206	Py_DECREF(item);
				3207	goto onError;
				3208	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3209	v = PyUnicode_FromObject(item);
				3210	Py_DECREF(item);
				3211	item = v;
				3212	if (item == NULL)
				3213	goto onError;
				3214	}
				3215	itemlen = PyUnicode_GET_SIZE(item);
				3216	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3217	if (_PyUnicode_Resize(&res, sz*2)) {
				3218	Py_DECREF(item);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3219	goto onError;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3220	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3221	sz *= 2;
				3222	p = PyUnicode_AS_UNICODE(res) + reslen;
				3223	}
				3224	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3225	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3226	p += seplen;
				3227	reslen += seplen;
				3228	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3229	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3230	p += itemlen;
				3231	reslen += itemlen;
				3232	Py_DECREF(item);
				3233	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3234	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3235	goto onError;
				3236
				3237	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3238	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3239	return (PyObject *)res;
				3240
				3241	onError:
				3242	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3243	Py_XDECREF(res);
				3244	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3245	return NULL;
				3246	}
				3247
				3248	static
				3249	PyUnicodeObject pad(PyUnicodeObject self,
				3250	int left,
				3251	int right,
				3252	Py_UNICODE fill)
				3253	{
				3254	PyUnicodeObject *u;
				3255
				3256	if (left < 0)
				3257	left = 0;
				3258	if (right < 0)
				3259	right = 0;
				3260
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3261	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3262	Py_INCREF(self);
				3263	return self;
				3264	}
				3265
				3266	u = _PyUnicode_New(left + self->length + right);
				3267	if (u) {
				3268	if (left)
				3269	Py_UNICODE_FILL(u->str, fill, left);
				3270	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3271	if (right)
				3272	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3273	}
				3274
				3275	return u;
				3276	}
				3277
				3278	#define SPLIT_APPEND(data, left, right) \
				3279	str = PyUnicode_FromUnicode(data + left, right - left); \
				3280	if (!str) \
				3281	goto onError; \
				3282	if (PyList_Append(list, str)) { \
				3283	Py_DECREF(str); \
				3284	goto onError; \
				3285	} \
				3286	else \
				3287	Py_DECREF(str);
				3288
				3289	static
				3290	PyObject split_whitespace(PyUnicodeObject self,
				3291	PyObject *list,
				3292	int maxcount)
				3293	{
				3294	register int i;
				3295	register int j;
				3296	int len = self->length;
				3297	PyObject *str;
				3298
				3299	for (i = j = 0; i < len; ) {
				3300	/* find a token */
				3301	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3302	i++;
				3303	j = i;
				3304	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3305	i++;
				3306	if (j < i) {
				3307	if (maxcount-- <= 0)
				3308	break;
				3309	SPLIT_APPEND(self->str, j, i);
				3310	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3311	i++;
				3312	j = i;
				3313	}
				3314	}
				3315	if (j < len) {
				3316	SPLIT_APPEND(self->str, j, len);
				3317	}
				3318	return list;
				3319
				3320	onError:
				3321	Py_DECREF(list);
				3322	return NULL;
				3323	}
				3324
				3325	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3326	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3327	{
				3328	register int i;
				3329	register int j;
				3330	int len;
				3331	PyObject *list;
				3332	PyObject *str;
				3333	Py_UNICODE *data;
				3334
				3335	string = PyUnicode_FromObject(string);
				3336	if (string == NULL)
				3337	return NULL;
				3338	data = PyUnicode_AS_UNICODE(string);
				3339	len = PyUnicode_GET_SIZE(string);
				3340
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3341	list = PyList_New(0);
				3342	if (!list)
				3343	goto onError;
				3344
				3345	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3346	int eol;
				3347
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3348	/* Find a line and append it */
				3349	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3350	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3351
				3352	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3353	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3354	if (i < len) {
				3355	if (data[i] == '\r' && i + 1 < len &&
				3356	data[i+1] == '\n')
				3357	i += 2;
				3358	else
				3359	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3360	if (keepends)
				3361	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3362	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3363	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3364	j = i;
				3365	}
				3366	if (j < len) {
				3367	SPLIT_APPEND(data, j, len);
				3368	}
				3369
				3370	Py_DECREF(string);
				3371	return list;
				3372
				3373	onError:
				3374	Py_DECREF(list);
				3375	Py_DECREF(string);
				3376	return NULL;
				3377	}
				3378
				3379	static
				3380	PyObject split_char(PyUnicodeObject self,
				3381	PyObject *list,
				3382	Py_UNICODE ch,
				3383	int maxcount)
				3384	{
				3385	register int i;
				3386	register int j;
				3387	int len = self->length;
				3388	PyObject *str;
				3389
				3390	for (i = j = 0; i < len; ) {
				3391	if (self->str[i] == ch) {
				3392	if (maxcount-- <= 0)
				3393	break;
				3394	SPLIT_APPEND(self->str, j, i);
				3395	i = j = i + 1;
				3396	} else
				3397	i++;
				3398	}
				3399	if (j <= len) {
				3400	SPLIT_APPEND(self->str, j, len);
				3401	}
				3402	return list;
				3403
				3404	onError:
				3405	Py_DECREF(list);
				3406	return NULL;
				3407	}
				3408
				3409	static
				3410	PyObject split_substring(PyUnicodeObject self,
				3411	PyObject *list,
				3412	PyUnicodeObject *substring,
				3413	int maxcount)
				3414	{
				3415	register int i;
				3416	register int j;
				3417	int len = self->length;
				3418	int sublen = substring->length;
				3419	PyObject *str;
				3420
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3421	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3422	if (Py_UNICODE_MATCH(self, i, substring)) {
				3423	if (maxcount-- <= 0)
				3424	break;
				3425	SPLIT_APPEND(self->str, j, i);
				3426	i = j = i + sublen;
				3427	} else
				3428	i++;
				3429	}
				3430	if (j <= len) {
				3431	SPLIT_APPEND(self->str, j, len);
				3432	}
				3433	return list;
				3434
				3435	onError:
				3436	Py_DECREF(list);
				3437	return NULL;
				3438	}
				3439
				3440	#undef SPLIT_APPEND
				3441
				3442	static
				3443	PyObject split(PyUnicodeObject self,
				3444	PyUnicodeObject *substring,
				3445	int maxcount)
				3446	{
				3447	PyObject *list;
				3448
				3449	if (maxcount < 0)
				3450	maxcount = INT_MAX;
				3451
				3452	list = PyList_New(0);
				3453	if (!list)
				3454	return NULL;
				3455
				3456	if (substring == NULL)
				3457	return split_whitespace(self,list,maxcount);
				3458
				3459	else if (substring->length == 1)
				3460	return split_char(self,list,substring->str[0],maxcount);
				3461
				3462	else if (substring->length == 0) {
				3463	Py_DECREF(list);
				3464	PyErr_SetString(PyExc_ValueError, "empty separator");
				3465	return NULL;
				3466	}
				3467	else
				3468	return split_substring(self,list,substring,maxcount);
				3469	}
				3470
				3471	static
				3472	PyObject strip(PyUnicodeObject self,
				3473	int left,
				3474	int right)
				3475	{
				3476	Py_UNICODE *p = self->str;
				3477	int start = 0;
				3478	int end = self->length;
				3479
				3480	if (left)
				3481	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3482	start++;
				3483
				3484	if (right)
				3485	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3486	end--;
				3487
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3488	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3489	/* couldn't strip anything off, return original string */
				3490	Py_INCREF(self);
				3491	return (PyObject*) self;
				3492	}
				3493
				3494	return (PyObject*) PyUnicode_FromUnicode(
				3495	self->str + start,
				3496	end - start
				3497	);
				3498	}
				3499
				3500	static
				3501	PyObject replace(PyUnicodeObject self,
				3502	PyUnicodeObject *str1,
				3503	PyUnicodeObject *str2,
				3504	int maxcount)
				3505	{
				3506	PyUnicodeObject *u;
				3507
				3508	if (maxcount < 0)
				3509	maxcount = INT_MAX;
				3510
				3511	if (str1->length == 1 && str2->length == 1) {
				3512	int i;
				3513
				3514	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3515	if (!findchar(self->str, self->length, str1->str[0]) &&
				3516	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3517	/* nothing to replace, return original string */
				3518	Py_INCREF(self);
				3519	u = self;
				3520	} else {
				3521	Py_UNICODE u1 = str1->str[0];
				3522	Py_UNICODE u2 = str2->str[0];
				3523
				3524	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3525	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3526	self->length
				3527	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3528	if (u != NULL) {
				3529	Py_UNICODE_COPY(u->str, self->str,
				3530	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3531	for (i = 0; i < u->length; i++)
				3532	if (u->str[i] == u1) {
				3533	if (--maxcount < 0)
				3534	break;
				3535	u->str[i] = u2;
				3536	}
				3537	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3538	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3539
				3540	} else {
				3541	int n, i;
				3542	Py_UNICODE *p;
				3543
				3544	/* replace strings */
				3545	n = count(self, 0, self->length, str1);
				3546	if (n > maxcount)
				3547	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3548	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3549	/* nothing to replace, return original string */
				3550	Py_INCREF(self);
				3551	u = self;
				3552	} else {
				3553	u = _PyUnicode_New(
				3554	self->length + n * (str2->length - str1->length));
				3555	if (u) {
				3556	i = 0;
				3557	p = u->str;
				3558	while (i <= self->length - str1->length)
				3559	if (Py_UNICODE_MATCH(self, i, str1)) {
				3560	/* replace string segment */
				3561	Py_UNICODE_COPY(p, str2->str, str2->length);
				3562	p += str2->length;
				3563	i += str1->length;
				3564	if (--n <= 0) {
				3565	/* copy remaining part */
				3566	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3567	break;
				3568	}
				3569	} else
				3570	*p++ = self->str[i++];
				3571	}
				3572	}
				3573	}
				3574
				3575	return (PyObject *) u;
				3576	}
				3577
				3578	/* --- Unicode Object Methods --------------------------------------------- */
				3579
				3580	static char title__doc__[] =
				3581	"S.title() -> unicode\n\
				3582	\n\
				3583	Return a titlecased version of S, i.e. words start with title case\n\
				3584	characters, all remaining cased characters have lower case.";
				3585
				3586	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3587	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3588	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3589	return fixup(self, fixtitle);
				3590	}
				3591
				3592	static char capitalize__doc__[] =
				3593	"S.capitalize() -> unicode\n\
				3594	\n\
				3595	Return a capitalized version of S, i.e. make the first character\n\
				3596	have upper case.";
				3597
				3598	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3599	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3600	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3601	return fixup(self, fixcapitalize);
				3602	}
				3603
				3604	#if 0
				3605	static char capwords__doc__[] =
				3606	"S.capwords() -> unicode\n\
				3607	\n\
				3608	Apply .capitalize() to all words in S and return the result with\n\
				3609	normalized whitespace (all whitespace strings are replaced by ' ').";
				3610
				3611	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3612	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3613	{
				3614	PyObject *list;
				3615	PyObject *item;
				3616	int i;
				3617
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3618	/* Split into words */
				3619	list = split(self, NULL, -1);
				3620	if (!list)
				3621	return NULL;
				3622
				3623	/* Capitalize each word */
				3624	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3625	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3626	fixcapitalize);
				3627	if (item == NULL)
				3628	goto onError;
				3629	Py_DECREF(PyList_GET_ITEM(list, i));
				3630	PyList_SET_ITEM(list, i, item);
				3631	}
				3632
				3633	/* Join the words to form a new string */
				3634	item = PyUnicode_Join(NULL, list);
				3635
				3636	onError:
				3637	Py_DECREF(list);
				3638	return (PyObject *)item;
				3639	}
				3640	#endif
				3641
				3642	static char center__doc__[] =
				3643	"S.center(width) -> unicode\n\
				3644	\n\
				3645	Return S centered in a Unicode string of length width. Padding is done\n\
				3646	using spaces.";
				3647
				3648	static PyObject *
				3649	unicode_center(PyUnicodeObject self, PyObject args)
				3650	{
				3651	int marg, left;
				3652	int width;
				3653
				3654	if (!PyArg_ParseTuple(args, "i:center", &width))
				3655	return NULL;
				3656
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3657	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3658	Py_INCREF(self);
				3659	return (PyObject*) self;
				3660	}
				3661
				3662	marg = width - self->length;
				3663	left = marg / 2 + (marg & width & 1);
				3664
				3665	return (PyObject*) pad(self, left, marg - left, ' ');
				3666	}
				3667
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3668	#if 0
				3669
				3670	/* This code should go into some future Unicode collation support
				3671	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3672	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3673
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3674	/* speedy UTF-16 code point order comparison */
				3675	/* gleaned from: */
				3676	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3677
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3678	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3679	{
				3680	0, 0, 0, 0, 0, 0, 0, 0,
				3681	0, 0, 0, 0, 0, 0, 0, 0,
				3682	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3683	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3684	};
				3685
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3686	static int
				3687	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3688	{
				3689	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3690
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3691	Py_UNICODE *s1 = str1->str;
				3692	Py_UNICODE *s2 = str2->str;
				3693
				3694	len1 = str1->length;
				3695	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3696
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3697	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3698	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3699
				3700	c1 = *s1++;
				3701	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3702
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3703	if (c1 > (1<<11) * 26)
				3704	c1 += utf16Fixup[c1>>11];
				3705	if (c2 > (1<<11) * 26)
				3706	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3707	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3708
				3709	if (c1 != c2)
				3710	return (c1 < c2) ? -1 : 1;
				3711
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3712	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3713	}
				3714
				3715	return (len1 < len2) ? -1 : (len1 != len2);
				3716	}
				3717
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3718	#else
				3719
				3720	static int
				3721	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3722	{
				3723	register int len1, len2;
				3724
				3725	Py_UNICODE *s1 = str1->str;
				3726	Py_UNICODE *s2 = str2->str;
				3727
				3728	len1 = str1->length;
				3729	len2 = str2->length;
				3730
				3731	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3732	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3733
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3734	c1 = *s1++;
				3735	c2 = *s2++;
				3736
				3737	if (c1 != c2)
				3738	return (c1 < c2) ? -1 : 1;
				3739
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3740	len1--; len2--;
				3741	}
				3742
				3743	return (len1 < len2) ? -1 : (len1 != len2);
				3744	}
				3745
				3746	#endif
				3747
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3748	int PyUnicode_Compare(PyObject *left,
				3749	PyObject *right)
				3750	{
				3751	PyUnicodeObject u = NULL, v = NULL;
				3752	int result;
				3753
				3754	/* Coerce the two arguments */
				3755	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3756	if (u == NULL)
				3757	goto onError;
				3758	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3759	if (v == NULL)
				3760	goto onError;
				3761
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3762	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3763	if (v == u) {
				3764	Py_DECREF(u);
				3765	Py_DECREF(v);
				3766	return 0;
				3767	}
				3768
				3769	result = unicode_compare(u, v);
				3770
				3771	Py_DECREF(u);
				3772	Py_DECREF(v);
				3773	return result;
				3774
				3775	onError:
				3776	Py_XDECREF(u);
				3777	Py_XDECREF(v);
				3778	return -1;
				3779	}
				3780
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3781	int PyUnicode_Contains(PyObject *container,
				3782	PyObject *element)
				3783	{
				3784	PyUnicodeObject u = NULL, v = NULL;
				3785	int result;
				3786	register const Py_UNICODE p, e;
				3787	register Py_UNICODE ch;
				3788
				3789	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3790	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3791	if (v == NULL) {
				3792	PyErr_SetString(PyExc_TypeError,
				3793	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3794	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3795	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3796	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3797	if (u == NULL) {
				3798	Py_DECREF(v);
				3799	goto onError;
				3800	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3801
				3802	/* Check v in u */
				3803	if (PyUnicode_GET_SIZE(v) != 1) {
				3804	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3805	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3806	goto onError;
				3807	}
				3808	ch = *PyUnicode_AS_UNICODE(v);
				3809	p = PyUnicode_AS_UNICODE(u);
				3810	e = p + PyUnicode_GET_SIZE(u);
				3811	result = 0;
				3812	while (p < e) {
				3813	if (*p++ == ch) {
				3814	result = 1;
				3815	break;
				3816	}
				3817	}
				3818
				3819	Py_DECREF(u);
				3820	Py_DECREF(v);
				3821	return result;
				3822
				3823	onError:
				3824	Py_XDECREF(u);
				3825	Py_XDECREF(v);
				3826	return -1;
				3827	}
				3828
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3829	/* Concat to string or Unicode object giving a new Unicode object. */
				3830
				3831	PyObject PyUnicode_Concat(PyObject left,
				3832	PyObject *right)
				3833	{
				3834	PyUnicodeObject u = NULL, v = NULL, *w;
				3835
				3836	/* Coerce the two arguments */
				3837	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3838	if (u == NULL)
				3839	goto onError;
				3840	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3841	if (v == NULL)
				3842	goto onError;
				3843
				3844	/* Shortcuts */
				3845	if (v == unicode_empty) {
				3846	Py_DECREF(v);
				3847	return (PyObject *)u;
				3848	}
				3849	if (u == unicode_empty) {
				3850	Py_DECREF(u);
				3851	return (PyObject *)v;
				3852	}
				3853
				3854	/* Concat the two Unicode strings */
				3855	w = _PyUnicode_New(u->length + v->length);
				3856	if (w == NULL)
				3857	goto onError;
				3858	Py_UNICODE_COPY(w->str, u->str, u->length);
				3859	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3860
				3861	Py_DECREF(u);
				3862	Py_DECREF(v);
				3863	return (PyObject *)w;
				3864
				3865	onError:
				3866	Py_XDECREF(u);
				3867	Py_XDECREF(v);
				3868	return NULL;
				3869	}
				3870
				3871	static char count__doc__[] =
				3872	"S.count(sub[, start[, end]]) -> int\n\
				3873	\n\
				3874	Return the number of occurrences of substring sub in Unicode string\n\
				3875	S[start:end]. Optional arguments start and end are\n\
				3876	interpreted as in slice notation.";
				3877
				3878	static PyObject *
				3879	unicode_count(PyUnicodeObject self, PyObject args)
				3880	{
				3881	PyUnicodeObject *substring;
				3882	int start = 0;
				3883	int end = INT_MAX;
				3884	PyObject *result;
				3885
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3886	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3887	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3888	return NULL;
				3889
				3890	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3891	(PyObject *)substring);
				3892	if (substring == NULL)
				3893	return NULL;
				3894
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3895	if (start < 0)
				3896	start += self->length;
				3897	if (start < 0)
				3898	start = 0;
				3899	if (end > self->length)
				3900	end = self->length;
				3901	if (end < 0)
				3902	end += self->length;
				3903	if (end < 0)
				3904	end = 0;
				3905
				3906	result = PyInt_FromLong((long) count(self, start, end, substring));
				3907
				3908	Py_DECREF(substring);
				3909	return result;
				3910	}
				3911
				3912	static char encode__doc__[] =
				3913	"S.encode([encoding[,errors]]) -> string\n\
				3914	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3915	Return an encoded string version of S. Default encoding is the current\n\
				3916	default string encoding. errors may be given to set a different error\n\
				3917	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3918	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3919
				3920	static PyObject *
				3921	unicode_encode(PyUnicodeObject self, PyObject args)
				3922	{
				3923	char *encoding = NULL;
				3924	char *errors = NULL;
				3925	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3926	return NULL;
				3927	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3928	}
				3929
				3930	static char expandtabs__doc__[] =
				3931	"S.expandtabs([tabsize]) -> unicode\n\
				3932	\n\
				3933	Return a copy of S where all tab characters are expanded using spaces.\n\
				3934	If tabsize is not given, a tab size of 8 characters is assumed.";
				3935
				3936	static PyObject*
				3937	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3938	{
				3939	Py_UNICODE *e;
				3940	Py_UNICODE *p;
				3941	Py_UNICODE *q;
				3942	int i, j;
				3943	PyUnicodeObject *u;
				3944	int tabsize = 8;
				3945
				3946	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3947	return NULL;
				3948
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3949	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3950	i = j = 0;
				3951	e = self->str + self->length;
				3952	for (p = self->str; p < e; p++)
				3953	if (*p == '\t') {
				3954	if (tabsize > 0)
				3955	j += tabsize - (j % tabsize);
				3956	}
				3957	else {
				3958	j++;
				3959	if (p == '\n' \|\| p == '\r') {
				3960	i += j;
				3961	j = 0;
				3962	}
				3963	}
				3964
				3965	/* Second pass: create output string and fill it */
				3966	u = _PyUnicode_New(i + j);
				3967	if (!u)
				3968	return NULL;
				3969
				3970	j = 0;
				3971	q = u->str;
				3972
				3973	for (p = self->str; p < e; p++)
				3974	if (*p == '\t') {
				3975	if (tabsize > 0) {
				3976	i = tabsize - (j % tabsize);
				3977	j += i;
				3978	while (i--)
				3979	*q++ = ' ';
				3980	}
				3981	}
				3982	else {
				3983	j++;
				3984	q++ = p;
				3985	if (p == '\n' \|\| p == '\r')
				3986	j = 0;
				3987	}
				3988
				3989	return (PyObject*) u;
				3990	}
				3991
				3992	static char find__doc__[] =
				3993	"S.find(sub [,start [,end]]) -> int\n\
				3994	\n\
				3995	Return the lowest index in S where substring sub is found,\n\
				3996	such that sub is contained within s[start,end]. Optional\n\
				3997	arguments start and end are interpreted as in slice notation.\n\
				3998	\n\
				3999	Return -1 on failure.";
				4000
				4001	static PyObject *
				4002	unicode_find(PyUnicodeObject self, PyObject args)
				4003	{
				4004	PyUnicodeObject *substring;
				4005	int start = 0;
				4006	int end = INT_MAX;
				4007	PyObject *result;
				4008
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4009	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				4010	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4011	return NULL;
				4012	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4013	(PyObject *)substring);
				4014	if (substring == NULL)
				4015	return NULL;
				4016
				4017	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				4018
				4019	Py_DECREF(substring);
				4020	return result;
				4021	}
				4022
				4023	static PyObject *
				4024	unicode_getitem(PyUnicodeObject *self, int index)
				4025	{
				4026	if (index < 0 \|\| index >= self->length) {
				4027	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4028	return NULL;
				4029	}
				4030
				4031	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4032	}
				4033
				4034	static long
				4035	unicode_hash(PyUnicodeObject *self)
				4036	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4037	/* Since Unicode objects compare equal to their ASCII string
				4038	counterparts, they should use the individual character values
				4039	as basis for their hash value. This is needed to assure that
				4040	strings and Unicode objects behave in the same way as
				4041	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4042
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4043	register int len;
				4044	register Py_UNICODE *p;
				4045	register long x;
				4046
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4047	if (self->hash != -1)
				4048	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4049	len = PyUnicode_GET_SIZE(self);
				4050	p = PyUnicode_AS_UNICODE(self);
				4051	x = *p << 7;
				4052	while (--len >= 0)
				4053	x = (1000003x) ^ p++;
				4054	x ^= PyUnicode_GET_SIZE(self);
				4055	if (x == -1)
				4056	x = -2;
				4057	self->hash = x;
				4058	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4059	}
				4060
				4061	static char index__doc__[] =
				4062	"S.index(sub [,start [,end]]) -> int\n\
				4063	\n\
				4064	Like S.find() but raise ValueError when the substring is not found.";
				4065
				4066	static PyObject *
				4067	unicode_index(PyUnicodeObject self, PyObject args)
				4068	{
				4069	int result;
				4070	PyUnicodeObject *substring;
				4071	int start = 0;
				4072	int end = INT_MAX;
				4073
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4074	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				4075	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4076	return NULL;
				4077
				4078	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4079	(PyObject *)substring);
				4080	if (substring == NULL)
				4081	return NULL;
				4082
				4083	result = findstring(self, substring, start, end, 1);
				4084
				4085	Py_DECREF(substring);
				4086	if (result < 0) {
				4087	PyErr_SetString(PyExc_ValueError, "substring not found");
				4088	return NULL;
				4089	}
				4090	return PyInt_FromLong(result);
				4091	}
				4092
				4093	static char islower__doc__[] =
				4094	"S.islower() -> int\n\
				4095	\n\
				4096	Return 1 if all cased characters in S are lowercase and there is\n\
				4097	at least one cased character in S, 0 otherwise.";
				4098
				4099	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4100	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4101	{
				4102	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4103	register const Py_UNICODE *e;
				4104	int cased;
				4105
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4106	/* Shortcut for single character strings */
				4107	if (PyUnicode_GET_SIZE(self) == 1)
				4108	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				4109
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4110	/* Special case for empty strings */
				4111	if (PyString_GET_SIZE(self) == 0)
				4112	return PyInt_FromLong(0);
				4113
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4114	e = p + PyUnicode_GET_SIZE(self);
				4115	cased = 0;
				4116	for (; p < e; p++) {
				4117	register const Py_UNICODE ch = *p;
				4118
				4119	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4120	return PyInt_FromLong(0);
				4121	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4122	cased = 1;
				4123	}
				4124	return PyInt_FromLong(cased);
				4125	}
				4126
				4127	static char isupper__doc__[] =
				4128	"S.isupper() -> int\n\
				4129	\n\
				4130	Return 1 if all cased characters in S are uppercase and there is\n\
				4131	at least one cased character in S, 0 otherwise.";
				4132
				4133	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4134	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4135	{
				4136	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4137	register const Py_UNICODE *e;
				4138	int cased;
				4139
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4140	/* Shortcut for single character strings */
				4141	if (PyUnicode_GET_SIZE(self) == 1)
				4142	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				4143
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4144	/* Special case for empty strings */
				4145	if (PyString_GET_SIZE(self) == 0)
				4146	return PyInt_FromLong(0);
				4147
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4148	e = p + PyUnicode_GET_SIZE(self);
				4149	cased = 0;
				4150	for (; p < e; p++) {
				4151	register const Py_UNICODE ch = *p;
				4152
				4153	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4154	return PyInt_FromLong(0);
				4155	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4156	cased = 1;
				4157	}
				4158	return PyInt_FromLong(cased);
				4159	}
				4160
				4161	static char istitle__doc__[] =
				4162	"S.istitle() -> int\n\
				4163	\n\
				4164	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				4165	may only follow uncased characters and lowercase characters only cased\n\
				4166	ones. Return 0 otherwise.";
				4167
				4168	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4169	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4170	{
				4171	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4172	register const Py_UNICODE *e;
				4173	int cased, previous_is_cased;
				4174
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4175	/* Shortcut for single character strings */
				4176	if (PyUnicode_GET_SIZE(self) == 1)
				4177	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4178	(Py_UNICODE_ISUPPER(*p) != 0));
				4179
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4180	/* Special case for empty strings */
				4181	if (PyString_GET_SIZE(self) == 0)
				4182	return PyInt_FromLong(0);
				4183
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4184	e = p + PyUnicode_GET_SIZE(self);
				4185	cased = 0;
				4186	previous_is_cased = 0;
				4187	for (; p < e; p++) {
				4188	register const Py_UNICODE ch = *p;
				4189
				4190	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4191	if (previous_is_cased)
				4192	return PyInt_FromLong(0);
				4193	previous_is_cased = 1;
				4194	cased = 1;
				4195	}
				4196	else if (Py_UNICODE_ISLOWER(ch)) {
				4197	if (!previous_is_cased)
				4198	return PyInt_FromLong(0);
				4199	previous_is_cased = 1;
				4200	cased = 1;
				4201	}
				4202	else
				4203	previous_is_cased = 0;
				4204	}
				4205	return PyInt_FromLong(cased);
				4206	}
				4207
				4208	static char isspace__doc__[] =
				4209	"S.isspace() -> int\n\
				4210	\n\
				4211	Return 1 if there are only whitespace characters in S,\n\
				4212	0 otherwise.";
				4213
				4214	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4215	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4216	{
				4217	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4218	register const Py_UNICODE *e;
				4219
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4220	/* Shortcut for single character strings */
				4221	if (PyUnicode_GET_SIZE(self) == 1 &&
				4222	Py_UNICODE_ISSPACE(*p))
				4223	return PyInt_FromLong(1);
				4224
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4225	/* Special case for empty strings */
				4226	if (PyString_GET_SIZE(self) == 0)
				4227	return PyInt_FromLong(0);
				4228
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4229	e = p + PyUnicode_GET_SIZE(self);
				4230	for (; p < e; p++) {
				4231	if (!Py_UNICODE_ISSPACE(*p))
				4232	return PyInt_FromLong(0);
				4233	}
				4234	return PyInt_FromLong(1);
				4235	}
				4236
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4237	static char isalpha__doc__[] =
				4238	"S.isalpha() -> int\n\
				4239	\n\
				4240	Return 1 if all characters in S are alphabetic\n\
				4241	and there is at least one character in S, 0 otherwise.";
				4242
				4243	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4244	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4245	{
				4246	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4247	register const Py_UNICODE *e;
				4248
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4249	/* Shortcut for single character strings */
				4250	if (PyUnicode_GET_SIZE(self) == 1 &&
				4251	Py_UNICODE_ISALPHA(*p))
				4252	return PyInt_FromLong(1);
				4253
				4254	/* Special case for empty strings */
				4255	if (PyString_GET_SIZE(self) == 0)
				4256	return PyInt_FromLong(0);
				4257
				4258	e = p + PyUnicode_GET_SIZE(self);
				4259	for (; p < e; p++) {
				4260	if (!Py_UNICODE_ISALPHA(*p))
				4261	return PyInt_FromLong(0);
				4262	}
				4263	return PyInt_FromLong(1);
				4264	}
				4265
				4266	static char isalnum__doc__[] =
				4267	"S.isalnum() -> int\n\
				4268	\n\
				4269	Return 1 if all characters in S are alphanumeric\n\
				4270	and there is at least one character in S, 0 otherwise.";
				4271
				4272	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4273	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4274	{
				4275	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4276	register const Py_UNICODE *e;
				4277
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4278	/* Shortcut for single character strings */
				4279	if (PyUnicode_GET_SIZE(self) == 1 &&
				4280	Py_UNICODE_ISALNUM(*p))
				4281	return PyInt_FromLong(1);
				4282
				4283	/* Special case for empty strings */
				4284	if (PyString_GET_SIZE(self) == 0)
				4285	return PyInt_FromLong(0);
				4286
				4287	e = p + PyUnicode_GET_SIZE(self);
				4288	for (; p < e; p++) {
				4289	if (!Py_UNICODE_ISALNUM(*p))
				4290	return PyInt_FromLong(0);
				4291	}
				4292	return PyInt_FromLong(1);
				4293	}
				4294
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4295	static char isdecimal__doc__[] =
				4296	"S.isdecimal() -> int\n\
				4297	\n\
				4298	Return 1 if there are only decimal characters in S,\n\
				4299	0 otherwise.";
				4300
				4301	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4302	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4303	{
				4304	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4305	register const Py_UNICODE *e;
				4306
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4307	/* Shortcut for single character strings */
				4308	if (PyUnicode_GET_SIZE(self) == 1 &&
				4309	Py_UNICODE_ISDECIMAL(*p))
				4310	return PyInt_FromLong(1);
				4311
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4312	/* Special case for empty strings */
				4313	if (PyString_GET_SIZE(self) == 0)
				4314	return PyInt_FromLong(0);
				4315
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4316	e = p + PyUnicode_GET_SIZE(self);
				4317	for (; p < e; p++) {
				4318	if (!Py_UNICODE_ISDECIMAL(*p))
				4319	return PyInt_FromLong(0);
				4320	}
				4321	return PyInt_FromLong(1);
				4322	}
				4323
				4324	static char isdigit__doc__[] =
				4325	"S.isdigit() -> int\n\
				4326	\n\
				4327	Return 1 if there are only digit characters in S,\n\
				4328	0 otherwise.";
				4329
				4330	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4331	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4332	{
				4333	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4334	register const Py_UNICODE *e;
				4335
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4336	/* Shortcut for single character strings */
				4337	if (PyUnicode_GET_SIZE(self) == 1 &&
				4338	Py_UNICODE_ISDIGIT(*p))
				4339	return PyInt_FromLong(1);
				4340
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4341	/* Special case for empty strings */
				4342	if (PyString_GET_SIZE(self) == 0)
				4343	return PyInt_FromLong(0);
				4344
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4345	e = p + PyUnicode_GET_SIZE(self);
				4346	for (; p < e; p++) {
				4347	if (!Py_UNICODE_ISDIGIT(*p))
				4348	return PyInt_FromLong(0);
				4349	}
				4350	return PyInt_FromLong(1);
				4351	}
				4352
				4353	static char isnumeric__doc__[] =
				4354	"S.isnumeric() -> int\n\
				4355	\n\
				4356	Return 1 if there are only numeric characters in S,\n\
				4357	0 otherwise.";
				4358
				4359	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4360	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4361	{
				4362	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4363	register const Py_UNICODE *e;
				4364
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4365	/* Shortcut for single character strings */
				4366	if (PyUnicode_GET_SIZE(self) == 1 &&
				4367	Py_UNICODE_ISNUMERIC(*p))
				4368	return PyInt_FromLong(1);
				4369
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4370	/* Special case for empty strings */
				4371	if (PyString_GET_SIZE(self) == 0)
				4372	return PyInt_FromLong(0);
				4373
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4374	e = p + PyUnicode_GET_SIZE(self);
				4375	for (; p < e; p++) {
				4376	if (!Py_UNICODE_ISNUMERIC(*p))
				4377	return PyInt_FromLong(0);
				4378	}
				4379	return PyInt_FromLong(1);
				4380	}
				4381
				4382	static char join__doc__[] =
				4383	"S.join(sequence) -> unicode\n\
				4384	\n\
				4385	Return a string which is the concatenation of the strings in the\n\
				4386	sequence. The separator between elements is S.";
				4387
				4388	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4389	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4390	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4391	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4392	}
				4393
				4394	static int
				4395	unicode_length(PyUnicodeObject *self)
				4396	{
				4397	return self->length;
				4398	}
				4399
				4400	static char ljust__doc__[] =
				4401	"S.ljust(width) -> unicode\n\
				4402	\n\
				4403	Return S left justified in a Unicode string of length width. Padding is\n\
				4404	done using spaces.";
				4405
				4406	static PyObject *
				4407	unicode_ljust(PyUnicodeObject self, PyObject args)
				4408	{
				4409	int width;
				4410	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4411	return NULL;
				4412
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4413	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4414	Py_INCREF(self);
				4415	return (PyObject*) self;
				4416	}
				4417
				4418	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4419	}
				4420
				4421	static char lower__doc__[] =
				4422	"S.lower() -> unicode\n\
				4423	\n\
				4424	Return a copy of the string S converted to lowercase.";
				4425
				4426	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4427	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4428	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4429	return fixup(self, fixlower);
				4430	}
				4431
				4432	static char lstrip__doc__[] =
				4433	"S.lstrip() -> unicode\n\
				4434	\n\
				4435	Return a copy of the string S with leading whitespace removed.";
				4436
				4437	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4438	unicode_lstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4439	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4440	return strip(self, 1, 0);
				4441	}
				4442
				4443	static PyObject*
				4444	unicode_repeat(PyUnicodeObject *str, int len)
				4445	{
				4446	PyUnicodeObject *u;
				4447	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4448	int nchars;
				4449	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4450
				4451	if (len < 0)
				4452	len = 0;
				4453
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4454	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4455	/* no repeat, return original string */
				4456	Py_INCREF(str);
				4457	return (PyObject*) str;
				4458	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4459
				4460	/* ensure # of chars needed doesn't overflow int and # of bytes
				4461	* needed doesn't overflow size_t
				4462	*/
				4463	nchars = len * str->length;
				4464	if (len && nchars / len != str->length) {
				4465	PyErr_SetString(PyExc_OverflowError,
				4466	"repeated string is too long");
				4467	return NULL;
				4468	}
				4469	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4470	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4471	PyErr_SetString(PyExc_OverflowError,
				4472	"repeated string is too long");
				4473	return NULL;
				4474	}
				4475	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4476	if (!u)
				4477	return NULL;
				4478
				4479	p = u->str;
				4480
				4481	while (len-- > 0) {
				4482	Py_UNICODE_COPY(p, str->str, str->length);
				4483	p += str->length;
				4484	}
				4485
				4486	return (PyObject*) u;
				4487	}
				4488
				4489	PyObject PyUnicode_Replace(PyObject obj,
				4490	PyObject *subobj,
				4491	PyObject *replobj,
				4492	int maxcount)
				4493	{
				4494	PyObject *self;
				4495	PyObject *str1;
				4496	PyObject *str2;
				4497	PyObject *result;
				4498
				4499	self = PyUnicode_FromObject(obj);
				4500	if (self == NULL)
				4501	return NULL;
				4502	str1 = PyUnicode_FromObject(subobj);
				4503	if (str1 == NULL) {
				4504	Py_DECREF(self);
				4505	return NULL;
				4506	}
				4507	str2 = PyUnicode_FromObject(replobj);
				4508	if (str2 == NULL) {
				4509	Py_DECREF(self);
				4510	Py_DECREF(str1);
				4511	return NULL;
				4512	}
				4513	result = replace((PyUnicodeObject *)self,
				4514	(PyUnicodeObject *)str1,
				4515	(PyUnicodeObject *)str2,
				4516	maxcount);
				4517	Py_DECREF(self);
				4518	Py_DECREF(str1);
				4519	Py_DECREF(str2);
				4520	return result;
				4521	}
				4522
				4523	static char replace__doc__[] =
				4524	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4525	\n\
				4526	Return a copy of S with all occurrences of substring\n\
				4527	old replaced by new. If the optional argument maxsplit is\n\
				4528	given, only the first maxsplit occurrences are replaced.";
				4529
				4530	static PyObject*
				4531	unicode_replace(PyUnicodeObject self, PyObject args)
				4532	{
				4533	PyUnicodeObject *str1;
				4534	PyUnicodeObject *str2;
				4535	int maxcount = -1;
				4536	PyObject *result;
				4537
				4538	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4539	return NULL;
				4540	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4541	if (str1 == NULL)
				4542	return NULL;
				4543	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4544	if (str2 == NULL)
				4545	return NULL;
				4546
				4547	result = replace(self, str1, str2, maxcount);
				4548
				4549	Py_DECREF(str1);
				4550	Py_DECREF(str2);
				4551	return result;
				4552	}
				4553
				4554	static
				4555	PyObject unicode_repr(PyObject unicode)
				4556	{
				4557	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4558	PyUnicode_GET_SIZE(unicode),
				4559	1);
				4560	}
				4561
				4562	static char rfind__doc__[] =
				4563	"S.rfind(sub [,start [,end]]) -> int\n\
				4564	\n\
				4565	Return the highest index in S where substring sub is found,\n\
				4566	such that sub is contained within s[start,end]. Optional\n\
				4567	arguments start and end are interpreted as in slice notation.\n\
				4568	\n\
				4569	Return -1 on failure.";
				4570
				4571	static PyObject *
				4572	unicode_rfind(PyUnicodeObject self, PyObject args)
				4573	{
				4574	PyUnicodeObject *substring;
				4575	int start = 0;
				4576	int end = INT_MAX;
				4577	PyObject *result;
				4578
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4579	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4580	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4581	return NULL;
				4582	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4583	(PyObject *)substring);
				4584	if (substring == NULL)
				4585	return NULL;
				4586
				4587	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4588
				4589	Py_DECREF(substring);
				4590	return result;
				4591	}
				4592
				4593	static char rindex__doc__[] =
				4594	"S.rindex(sub [,start [,end]]) -> int\n\
				4595	\n\
				4596	Like S.rfind() but raise ValueError when the substring is not found.";
				4597
				4598	static PyObject *
				4599	unicode_rindex(PyUnicodeObject self, PyObject args)
				4600	{
				4601	int result;
				4602	PyUnicodeObject *substring;
				4603	int start = 0;
				4604	int end = INT_MAX;
				4605
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4606	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4607	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4608	return NULL;
				4609	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4610	(PyObject *)substring);
				4611	if (substring == NULL)
				4612	return NULL;
				4613
				4614	result = findstring(self, substring, start, end, -1);
				4615
				4616	Py_DECREF(substring);
				4617	if (result < 0) {
				4618	PyErr_SetString(PyExc_ValueError, "substring not found");
				4619	return NULL;
				4620	}
				4621	return PyInt_FromLong(result);
				4622	}
				4623
				4624	static char rjust__doc__[] =
				4625	"S.rjust(width) -> unicode\n\
				4626	\n\
				4627	Return S right justified in a Unicode string of length width. Padding is\n\
				4628	done using spaces.";
				4629
				4630	static PyObject *
				4631	unicode_rjust(PyUnicodeObject self, PyObject args)
				4632	{
				4633	int width;
				4634	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4635	return NULL;
				4636
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4637	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4638	Py_INCREF(self);
				4639	return (PyObject*) self;
				4640	}
				4641
				4642	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4643	}
				4644
				4645	static char rstrip__doc__[] =
				4646	"S.rstrip() -> unicode\n\
				4647	\n\
				4648	Return a copy of the string S with trailing whitespace removed.";
				4649
				4650	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4651	unicode_rstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4652	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4653	return strip(self, 0, 1);
				4654	}
				4655
				4656	static PyObject*
				4657	unicode_slice(PyUnicodeObject *self, int start, int end)
				4658	{
				4659	/* standard clamping */
				4660	if (start < 0)
				4661	start = 0;
				4662	if (end < 0)
				4663	end = 0;
				4664	if (end > self->length)
				4665	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4666	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4667	/* full slice, return original string */
				4668	Py_INCREF(self);
				4669	return (PyObject*) self;
				4670	}
				4671	if (start > end)
				4672	start = end;
				4673	/* copy slice */
				4674	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4675	end - start);
				4676	}
				4677
				4678	PyObject PyUnicode_Split(PyObject s,
				4679	PyObject *sep,
				4680	int maxsplit)
				4681	{
				4682	PyObject *result;
				4683
				4684	s = PyUnicode_FromObject(s);
				4685	if (s == NULL)
				4686	return NULL;
				4687	if (sep != NULL) {
				4688	sep = PyUnicode_FromObject(sep);
				4689	if (sep == NULL) {
				4690	Py_DECREF(s);
				4691	return NULL;
				4692	}
				4693	}
				4694
				4695	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4696
				4697	Py_DECREF(s);
				4698	Py_XDECREF(sep);
				4699	return result;
				4700	}
				4701
				4702	static char split__doc__[] =
				4703	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4704	\n\
				4705	Return a list of the words in S, using sep as the\n\
				4706	delimiter string. If maxsplit is given, at most maxsplit\n\
				4707	splits are done. If sep is not specified, any whitespace string\n\
				4708	is a separator.";
				4709
				4710	static PyObject*
				4711	unicode_split(PyUnicodeObject self, PyObject args)
				4712	{
				4713	PyObject *substring = Py_None;
				4714	int maxcount = -1;
				4715
				4716	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4717	return NULL;
				4718
				4719	if (substring == Py_None)
				4720	return split(self, NULL, maxcount);
				4721	else if (PyUnicode_Check(substring))
				4722	return split(self, (PyUnicodeObject *)substring, maxcount);
				4723	else
				4724	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4725	}
				4726
				4727	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4728	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4729	\n\
				4730	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4731	Line breaks are not included in the resulting list unless keepends\n\
				4732	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733
				4734	static PyObject*
				4735	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4736	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4737	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4738
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4739	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4740	return NULL;
				4741
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4742	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4743	}
				4744
				4745	static
				4746	PyObject unicode_str(PyUnicodeObject self)
				4747	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4748	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4749	}
				4750
				4751	static char strip__doc__[] =
				4752	"S.strip() -> unicode\n\
				4753	\n\
				4754	Return a copy of S with leading and trailing whitespace removed.";
				4755
				4756	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4757	unicode_strip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4758	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4759	return strip(self, 1, 1);
				4760	}
				4761
				4762	static char swapcase__doc__[] =
				4763	"S.swapcase() -> unicode\n\
				4764	\n\
				4765	Return a copy of S with uppercase characters converted to lowercase\n\
				4766	and vice versa.";
				4767
				4768	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4769	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4770	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4771	return fixup(self, fixswapcase);
				4772	}
				4773
				4774	static char translate__doc__[] =
				4775	"S.translate(table) -> unicode\n\
				4776	\n\
				4777	Return a copy of the string S, where all characters have been mapped\n\
				4778	through the given translation table, which must be a mapping of\n\
				4779	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4780	are left untouched. Characters mapped to None are deleted.";
				4781
				4782	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4783	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4784	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4785	return PyUnicode_TranslateCharmap(self->str,
				4786	self->length,
				4787	table,
				4788	"ignore");
				4789	}
				4790
				4791	static char upper__doc__[] =
				4792	"S.upper() -> unicode\n\
				4793	\n\
				4794	Return a copy of S converted to uppercase.";
				4795
				4796	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4797	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4798	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4799	return fixup(self, fixupper);
				4800	}
				4801
				4802	#if 0
				4803	static char zfill__doc__[] =
				4804	"S.zfill(width) -> unicode\n\
				4805	\n\
				4806	Pad a numeric string x with zeros on the left, to fill a field\n\
				4807	of the specified width. The string x is never truncated.";
				4808
				4809	static PyObject *
				4810	unicode_zfill(PyUnicodeObject self, PyObject args)
				4811	{
				4812	int fill;
				4813	PyUnicodeObject *u;
				4814
				4815	int width;
				4816	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4817	return NULL;
				4818
				4819	if (self->length >= width) {
				4820	Py_INCREF(self);
				4821	return (PyObject*) self;
				4822	}
				4823
				4824	fill = width - self->length;
				4825
				4826	u = pad(self, fill, 0, '0');
				4827
				4828	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4829	/* move sign to beginning of string */
				4830	u->str[0] = u->str[fill];
				4831	u->str[fill] = '0';
				4832	}
				4833
				4834	return (PyObject*) u;
				4835	}
				4836	#endif
				4837
				4838	#if 0
				4839	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4840	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4841	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4842	return PyInt_FromLong(unicode_freelist_size);
				4843	}
				4844	#endif
				4845
				4846	static char startswith__doc__[] =
				4847	"S.startswith(prefix[, start[, end]]) -> int\n\
				4848	\n\
				4849	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4850	optional start, test S beginning at that position. With optional end, stop\n\
				4851	comparing S at that position.";
				4852
				4853	static PyObject *
				4854	unicode_startswith(PyUnicodeObject *self,
				4855	PyObject *args)
				4856	{
				4857	PyUnicodeObject *substring;
				4858	int start = 0;
				4859	int end = INT_MAX;
				4860	PyObject *result;
				4861
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4862	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4863	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4864	return NULL;
				4865	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4866	(PyObject *)substring);
				4867	if (substring == NULL)
				4868	return NULL;
				4869
				4870	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4871
				4872	Py_DECREF(substring);
				4873	return result;
				4874	}
				4875
				4876
				4877	static char endswith__doc__[] =
				4878	"S.endswith(suffix[, start[, end]]) -> int\n\
				4879	\n\
				4880	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4881	optional start, test S beginning at that position. With optional end, stop\n\
				4882	comparing S at that position.";
				4883
				4884	static PyObject *
				4885	unicode_endswith(PyUnicodeObject *self,
				4886	PyObject *args)
				4887	{
				4888	PyUnicodeObject *substring;
				4889	int start = 0;
				4890	int end = INT_MAX;
				4891	PyObject *result;
				4892
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4893	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4894	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4895	return NULL;
				4896	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4897	(PyObject *)substring);
				4898	if (substring == NULL)
				4899	return NULL;
				4900
				4901	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4902
				4903	Py_DECREF(substring);
				4904	return result;
				4905	}
				4906
				4907
				4908	static PyMethodDef unicode_methods[] = {
				4909
				4910	/* Order is according to common usage: often used methods should
				4911	appear first, since lookup is done sequentially. */
				4912
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4913	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				4914	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				4915	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				4916	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				4917	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				4918	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				4919	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				4920	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				4921	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				4922	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				4923	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				4924	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				4925	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
				4926	{"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
				4927	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				4928	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				4929	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				4930	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
				4931	{"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
				4932	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
				4933	{"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
				4934	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				4935	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				4936	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				4937	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				4938	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				4939	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				4940	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				4941	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				4942	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				4943	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				4944	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				4945	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				4946	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				4947	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4948	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4949	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
				4950	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4951	#endif
				4952
				4953	#if 0
				4954	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4955	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4956	#endif
				4957
				4958	{NULL, NULL}
				4959	};
				4960
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4961	static PySequenceMethods unicode_as_sequence = {
				4962	(inquiry) unicode_length, /* sq_length */
				4963	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4964	(intargfunc) unicode_repeat, /* sq_repeat */
				4965	(intargfunc) unicode_getitem, /* sq_item */
				4966	(intintargfunc) unicode_slice, /* sq_slice */
				4967	0, /* sq_ass_item */
				4968	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4969	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4970	};
				4971
				4972	static int
				4973	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4974	int index,
				4975	const void **ptr)
				4976	{
				4977	if (index != 0) {
				4978	PyErr_SetString(PyExc_SystemError,
				4979	"accessing non-existent unicode segment");
				4980	return -1;
				4981	}
				4982	ptr = (void ) self->str;
				4983	return PyUnicode_GET_DATA_SIZE(self);
				4984	}
				4985
				4986	static int
				4987	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4988	const void **ptr)
				4989	{
				4990	PyErr_SetString(PyExc_TypeError,
				4991	"cannot use unicode as modifyable buffer");
				4992	return -1;
				4993	}
				4994
				4995	static int
				4996	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4997	int *lenp)
				4998	{
				4999	if (lenp)
				5000	*lenp = PyUnicode_GET_DATA_SIZE(self);
				5001	return 1;
				5002	}
				5003
				5004	static int
				5005	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				5006	int index,
				5007	const void **ptr)
				5008	{
				5009	PyObject *str;
				5010
				5011	if (index != 0) {
				5012	PyErr_SetString(PyExc_SystemError,
				5013	"accessing non-existent unicode segment");
				5014	return -1;
				5015	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5016	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5017	if (str == NULL)
				5018	return -1;
				5019	ptr = (void ) PyString_AS_STRING(str);
				5020	return PyString_GET_SIZE(str);
				5021	}
				5022
				5023	/* Helpers for PyUnicode_Format() */
				5024
				5025	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5026	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5027	{
				5028	int argidx = *p_argidx;
				5029	if (argidx < arglen) {
				5030	(*p_argidx)++;
				5031	if (arglen < 0)
				5032	return args;
				5033	else
				5034	return PyTuple_GetItem(args, argidx);
				5035	}
				5036	PyErr_SetString(PyExc_TypeError,
				5037	"not enough arguments for format string");
				5038	return NULL;
				5039	}
				5040
				5041	#define F_LJUST (1<<0)
				5042	#define F_SIGN (1<<1)
				5043	#define F_BLANK (1<<2)
				5044	#define F_ALT (1<<3)
				5045	#define F_ZERO (1<<4)
				5046
				5047	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5048	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5049	{
				5050	register int i;
				5051	int len;
				5052	va_list va;
				5053	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5054	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5055
				5056	/* First, format the string as char array, then expand to Py_UNICODE
				5057	array. */
				5058	charbuffer = (char *)buffer;
				5059	len = vsprintf(charbuffer, format, va);
				5060	for (i = len - 1; i >= 0; i--)
				5061	buffer[i] = (Py_UNICODE) charbuffer[i];
				5062
				5063	va_end(va);
				5064	return len;
				5065	}
				5066
				5067	static int
				5068	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5069	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5070	int flags,
				5071	int prec,
				5072	int type,
				5073	PyObject *v)
				5074	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5075	/* fmt = '%#.' + `prec` + `type`
				5076	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5077	char fmt[20];
				5078	double x;
				5079
				5080	x = PyFloat_AsDouble(v);
				5081	if (x == -1.0 && PyErr_Occurred())
				5082	return -1;
				5083	if (prec < 0)
				5084	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5085	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5086	type = 'g';
				5087	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5088	/* worst case length calc to ensure no buffer overrun:
				5089	fmt = %#.<prec>g
				5090	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5091	for any double rep.)
				5092	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5093	If prec=0 the effective precision is 1 (the leading digit is
				5094	always given), therefore increase by one to 10+prec. */
				5095	if (buflen <= (size_t)10 + (size_t)prec) {
				5096	PyErr_SetString(PyExc_OverflowError,
				5097	"formatted float is too long (precision too long?)");
				5098	return -1;
				5099	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5100	return usprintf(buf, fmt, x);
				5101	}
				5102
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5103	static PyObject*
				5104	formatlong(PyObject *val, int flags, int prec, int type)
				5105	{
				5106	char *buf;
				5107	int i, len;
				5108	PyObject str; / temporary string object. */
				5109	PyUnicodeObject *result;
				5110
				5111	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5112	if (!str)
				5113	return NULL;
				5114	result = _PyUnicode_New(len);
				5115	for (i = 0; i < len; i++)
				5116	result->str[i] = buf[i];
				5117	result->str[len] = 0;
				5118	Py_DECREF(str);
				5119	return (PyObject*)result;
				5120	}
				5121
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5122	static int
				5123	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5124	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5125	int flags,
				5126	int prec,
				5127	int type,
				5128	PyObject *v)
				5129	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5130	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5131	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5132	+ 1 + 1 = 24*/
				5133	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5134	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5135	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5136
				5137	x = PyInt_AsLong(v);
				5138	if (x == -1 && PyErr_Occurred())
				5139	return -1;
				5140	if (prec < 0)
				5141	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5142	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				5143	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				5144	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				5145	PyErr_SetString(PyExc_OverflowError,
				5146	"formatted integer is too long (precision too long?)");
				5147	return -1;
				5148	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5149	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				5150	* but we want it (for consistency with other %#x conversions, and
				5151	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5152	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				5153	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				5154	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5155	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5156	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				5157	/* Only way to know what the platform does is to try it. */
				5158	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				5159	if (fmt[1] != (char)type) {
				5160	/* Supply our own leading 0x/0X -- needed under std C */
				5161	use_native_c_format = 0;
				5162	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				5163	}
				5164	}
				5165	if (use_native_c_format)
				5166	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5167	return usprintf(buf, fmt, x);
				5168	}
				5169
				5170	static int
				5171	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5172	size_t buflen,
				5173	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5174	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5175	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5176	if (PyUnicode_Check(v)) {
				5177	if (PyUnicode_GET_SIZE(v) != 1)
				5178	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5179	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5180	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5181
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5182	else if (PyString_Check(v)) {
				5183	if (PyString_GET_SIZE(v) != 1)
				5184	goto onError;
				5185	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5186	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5187
				5188	else {
				5189	/* Integer input truncated to a character */
				5190	long x;
				5191	x = PyInt_AsLong(v);
				5192	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5193	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5194	buf[0] = (char) x;
				5195	}
				5196	buf[1] = '\0';
				5197	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5198
				5199	onError:
				5200	PyErr_SetString(PyExc_TypeError,
				5201	"%c requires int or char");
				5202	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5203	}
				5204
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer into
   which floats, ints and chars are formatted.  XXX It is a magic
   number.  Every formatting routine bounds-checks against it so that
   the buffer cannot overflow; allocating a buffer sized for each
   individual format might be a better solution, but the current
   approach is sufficient for now.
*/
#define FORMATBUFLEN ((size_t)120)
				5214
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5215	PyObject PyUnicode_Format(PyObject format,
				5216	PyObject *args)
				5217	{
				5218	Py_UNICODE fmt, res;
				5219	int fmtcnt, rescnt, reslen, arglen, argidx;
				5220	int args_owned = 0;
				5221	PyUnicodeObject *result = NULL;
				5222	PyObject *dict = NULL;
				5223	PyObject *uformat;
				5224
				5225	if (format == NULL \|\| args == NULL) {
				5226	PyErr_BadInternalCall();
				5227	return NULL;
				5228	}
				5229	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5230	if (uformat == NULL)
				5231	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5232	fmt = PyUnicode_AS_UNICODE(uformat);
				5233	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5234
				5235	reslen = rescnt = fmtcnt + 100;
				5236	result = _PyUnicode_New(reslen);
				5237	if (result == NULL)
				5238	goto onError;
				5239	res = PyUnicode_AS_UNICODE(result);
				5240
				5241	if (PyTuple_Check(args)) {
				5242	arglen = PyTuple_Size(args);
				5243	argidx = 0;
				5244	}
				5245	else {
				5246	arglen = -1;
				5247	argidx = -2;
				5248	}
				5249	if (args->ob_type->tp_as_mapping)
				5250	dict = args;
				5251
				5252	while (--fmtcnt >= 0) {
				5253	if (*fmt != '%') {
				5254	if (--rescnt < 0) {
				5255	rescnt = fmtcnt + 100;
				5256	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5257	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5258	return NULL;
				5259	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5260	--rescnt;
				5261	}
				5262	res++ = fmt++;
				5263	}
				5264	else {
				5265	/* Got a format specifier */
				5266	int flags = 0;
				5267	int width = -1;
				5268	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5269	Py_UNICODE c = '\0';
				5270	Py_UNICODE fill;
				5271	PyObject *v = NULL;
				5272	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5273	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5274	Py_UNICODE sign;
				5275	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5276	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5277
				5278	fmt++;
				5279	if (*fmt == '(') {
				5280	Py_UNICODE *keystart;
				5281	int keylen;
				5282	PyObject *key;
				5283	int pcount = 1;
				5284
				5285	if (dict == NULL) {
				5286	PyErr_SetString(PyExc_TypeError,
				5287	"format requires a mapping");
				5288	goto onError;
				5289	}
				5290	++fmt;
				5291	--fmtcnt;
				5292	keystart = fmt;
				5293	/* Skip over balanced parentheses */
				5294	while (pcount > 0 && --fmtcnt >= 0) {
				5295	if (*fmt == ')')
				5296	--pcount;
				5297	else if (*fmt == '(')
				5298	++pcount;
				5299	fmt++;
				5300	}
				5301	keylen = fmt - keystart - 1;
				5302	if (fmtcnt < 0 \|\| pcount > 0) {
				5303	PyErr_SetString(PyExc_ValueError,
				5304	"incomplete format key");
				5305	goto onError;
				5306	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5307	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5308	then looked up since Python uses strings to hold
				5309	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5310	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5311	key = PyUnicode_EncodeUTF8(keystart,
				5312	keylen,
				5313	NULL);
				5314	if (key == NULL)
				5315	goto onError;
				5316	if (args_owned) {
				5317	Py_DECREF(args);
				5318	args_owned = 0;
				5319	}
				5320	args = PyObject_GetItem(dict, key);
				5321	Py_DECREF(key);
				5322	if (args == NULL) {
				5323	goto onError;
				5324	}
				5325	args_owned = 1;
				5326	arglen = -1;
				5327	argidx = -2;
				5328	}
				5329	while (--fmtcnt >= 0) {
				5330	switch (c = *fmt++) {
				5331	case '-': flags \|= F_LJUST; continue;
				5332	case '+': flags \|= F_SIGN; continue;
				5333	case ' ': flags \|= F_BLANK; continue;
				5334	case '#': flags \|= F_ALT; continue;
				5335	case '0': flags \|= F_ZERO; continue;
				5336	}
				5337	break;
				5338	}
				5339	if (c == '*') {
				5340	v = getnextarg(args, arglen, &argidx);
				5341	if (v == NULL)
				5342	goto onError;
				5343	if (!PyInt_Check(v)) {
				5344	PyErr_SetString(PyExc_TypeError,
				5345	"* wants int");
				5346	goto onError;
				5347	}
				5348	width = PyInt_AsLong(v);
				5349	if (width < 0) {
				5350	flags \|= F_LJUST;
				5351	width = -width;
				5352	}
				5353	if (--fmtcnt >= 0)
				5354	c = *fmt++;
				5355	}
				5356	else if (c >= '0' && c <= '9') {
				5357	width = c - '0';
				5358	while (--fmtcnt >= 0) {
				5359	c = *fmt++;
				5360	if (c < '0' \|\| c > '9')
				5361	break;
				5362	if ((width*10) / 10 != width) {
				5363	PyErr_SetString(PyExc_ValueError,
				5364	"width too big");
				5365	goto onError;
				5366	}
				5367	width = width*10 + (c - '0');
				5368	}
				5369	}
				5370	if (c == '.') {
				5371	prec = 0;
				5372	if (--fmtcnt >= 0)
				5373	c = *fmt++;
				5374	if (c == '*') {
				5375	v = getnextarg(args, arglen, &argidx);
				5376	if (v == NULL)
				5377	goto onError;
				5378	if (!PyInt_Check(v)) {
				5379	PyErr_SetString(PyExc_TypeError,
				5380	"* wants int");
				5381	goto onError;
				5382	}
				5383	prec = PyInt_AsLong(v);
				5384	if (prec < 0)
				5385	prec = 0;
				5386	if (--fmtcnt >= 0)
				5387	c = *fmt++;
				5388	}
				5389	else if (c >= '0' && c <= '9') {
				5390	prec = c - '0';
				5391	while (--fmtcnt >= 0) {
				5392	c = Py_CHARMASK(*fmt++);
				5393	if (c < '0' \|\| c > '9')
				5394	break;
				5395	if ((prec*10) / 10 != prec) {
				5396	PyErr_SetString(PyExc_ValueError,
				5397	"prec too big");
				5398	goto onError;
				5399	}
				5400	prec = prec*10 + (c - '0');
				5401	}
				5402	}
				5403	} /* prec */
				5404	if (fmtcnt >= 0) {
				5405	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5406	if (--fmtcnt >= 0)
				5407	c = *fmt++;
				5408	}
				5409	}
				5410	if (fmtcnt < 0) {
				5411	PyErr_SetString(PyExc_ValueError,
				5412	"incomplete format");
				5413	goto onError;
				5414	}
				5415	if (c != '%') {
				5416	v = getnextarg(args, arglen, &argidx);
				5417	if (v == NULL)
				5418	goto onError;
				5419	}
				5420	sign = 0;
				5421	fill = ' ';
				5422	switch (c) {
				5423
				5424	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5425	pbuf = formatbuf;
				5426	/* presume that buffer length is at least 1 */
				5427	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5428	len = 1;
				5429	break;
				5430
				5431	case 's':
				5432	case 'r':
				5433	if (PyUnicode_Check(v) && c == 's') {
				5434	temp = v;
				5435	Py_INCREF(temp);
				5436	}
				5437	else {
				5438	PyObject *unicode;
				5439	if (c == 's')
				5440	temp = PyObject_Str(v);
				5441	else
				5442	temp = PyObject_Repr(v);
				5443	if (temp == NULL)
				5444	goto onError;
				5445	if (!PyString_Check(temp)) {
				5446	/* XXX Note: this should never happen, since
				5447	PyObject_Repr() and PyObject_Str() assure
				5448	this */
				5449	Py_DECREF(temp);
				5450	PyErr_SetString(PyExc_TypeError,
				5451	"%s argument has non-string str()");
				5452	goto onError;
				5453	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5454	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5455	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5456	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5457	"strict");
				5458	Py_DECREF(temp);
				5459	temp = unicode;
				5460	if (temp == NULL)
				5461	goto onError;
				5462	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5463	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5464	len = PyUnicode_GET_SIZE(temp);
				5465	if (prec >= 0 && len > prec)
				5466	len = prec;
				5467	break;
				5468
				5469	case 'i':
				5470	case 'd':
				5471	case 'u':
				5472	case 'o':
				5473	case 'x':
				5474	case 'X':
				5475	if (c == 'i')
				5476	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5477	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5478	temp = formatlong(v, flags, prec, c);
				5479	if (!temp)
				5480	goto onError;
				5481	pbuf = PyUnicode_AS_UNICODE(temp);
				5482	len = PyUnicode_GET_SIZE(temp);
				5483	/* unbounded ints can always produce
				5484	a sign character! */
				5485	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5486	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5487	else {
				5488	pbuf = formatbuf;
				5489	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5490	flags, prec, c, v);
				5491	if (len < 0)
				5492	goto onError;
				5493	/* only d conversion is signed */
				5494	sign = c == 'd';
				5495	}
				5496	if (flags & F_ZERO)
				5497	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5498	break;
				5499
				5500	case 'e':
				5501	case 'E':
				5502	case 'f':
				5503	case 'g':
				5504	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5505	pbuf = formatbuf;
				5506	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5507	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5508	if (len < 0)
				5509	goto onError;
				5510	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5511	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5512	fill = '0';
				5513	break;
				5514
				5515	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5516	pbuf = formatbuf;
				5517	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5518	if (len < 0)
				5519	goto onError;
				5520	break;
				5521
				5522	default:
				5523	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5524	"unsupported format character '%c' (0x%x) "
				5525	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5526	(31<=c && c<=126) ? c : '?',
				5527	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5528	goto onError;
				5529	}
				5530	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5531	if (pbuf == '-' \|\| pbuf == '+') {
				5532	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5533	len--;
				5534	}
				5535	else if (flags & F_SIGN)
				5536	sign = '+';
				5537	else if (flags & F_BLANK)
				5538	sign = ' ';
				5539	else
				5540	sign = 0;
				5541	}
				5542	if (width < len)
				5543	width = len;
				5544	if (rescnt < width + (sign != 0)) {
				5545	reslen -= rescnt;
				5546	rescnt = width + fmtcnt + 100;
				5547	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5548	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5549	return NULL;
				5550	res = PyUnicode_AS_UNICODE(result)
				5551	+ reslen - rescnt;
				5552	}
				5553	if (sign) {
				5554	if (fill != ' ')
				5555	*res++ = sign;
				5556	rescnt--;
				5557	if (width > len)
				5558	width--;
				5559	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5560	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5561	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5562	assert(pbuf[1] == c);
				5563	if (fill != ' ') {
				5564	res++ = pbuf++;
				5565	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5566	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5567	rescnt -= 2;
				5568	width -= 2;
				5569	if (width < 0)
				5570	width = 0;
				5571	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5572	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5573	if (width > len && !(flags & F_LJUST)) {
				5574	do {
				5575	--rescnt;
				5576	*res++ = fill;
				5577	} while (--width > len);
				5578	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5579	if (fill == ' ') {
				5580	if (sign)
				5581	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5582	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5583	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5584	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5585	res++ = pbuf++;
				5586	res++ = pbuf++;
				5587	}
				5588	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5589	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5590	res += len;
				5591	rescnt -= len;
				5592	while (--width >= len) {
				5593	--rescnt;
				5594	*res++ = ' ';
				5595	}
				5596	if (dict && (argidx < arglen) && c != '%') {
				5597	PyErr_SetString(PyExc_TypeError,
				5598	"not all arguments converted");
				5599	goto onError;
				5600	}
				5601	Py_XDECREF(temp);
				5602	} /* '%' */
				5603	} /* until end */
				5604	if (argidx < arglen && !dict) {
				5605	PyErr_SetString(PyExc_TypeError,
				5606	"not all arguments converted");
				5607	goto onError;
				5608	}
				5609
				5610	if (args_owned) {
				5611	Py_DECREF(args);
				5612	}
				5613	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5614	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5615	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5616	return (PyObject *)result;
				5617
				5618	onError:
				5619	Py_XDECREF(result);
				5620	Py_DECREF(uformat);
				5621	if (args_owned) {
				5622	Py_DECREF(args);
				5623	}
				5624	return NULL;
				5625	}
				5626
				5627	static PyBufferProcs unicode_as_buffer = {
				5628	(getreadbufferproc) unicode_buffer_getreadbuf,
				5629	(getwritebufferproc) unicode_buffer_getwritebuf,
				5630	(getsegcountproc) unicode_buffer_getsegcount,
				5631	(getcharbufferproc) unicode_buffer_getcharbuf,
				5632	};
				5633
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5634	staticforward PyObject *
				5635	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5636
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5637	static PyObject *
				5638	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5639	{
				5640	PyObject *x = NULL;
				5641	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5642	char *encoding = NULL;
				5643	char *errors = NULL;
				5644
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5645	if (type != &PyUnicode_Type)
				5646	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5647	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5648	kwlist, &x, &encoding, &errors))
				5649	return NULL;
				5650	if (x == NULL)
				5651	return (PyObject *)_PyUnicode_New(0);
				5652	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5653	}
				5654
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5655	static PyObject *
				5656	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5657	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5658	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5659	int n;
				5660
				5661	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5662	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5663	if (tmp == NULL)
				5664	return NULL;
				5665	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5666	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5667	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5668	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5669	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5670	if (pnew->str == NULL) {
				5671	_Py_ForgetReference((PyObject *)pnew);
				5672	PyObject_DEL(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5673	return NULL;
				5674	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5675	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5676	pnew->length = n;
				5677	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5678	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5679	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5680	}
				5681
/* Docstring of the unicode type (installed as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
				5688
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5689	PyTypeObject PyUnicode_Type = {
				5690	PyObject_HEAD_INIT(&PyType_Type)
				5691	0, /* ob_size */
				5692	"unicode", /* tp_name */
				5693	sizeof(PyUnicodeObject), /* tp_size */
				5694	0, /* tp_itemsize */
				5695	/* Slots */
				5696	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5697	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5698	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5699	0, /* tp_setattr */
				5700	(cmpfunc) unicode_compare, /* tp_compare */
				5701	(reprfunc) unicode_repr, /* tp_repr */
				5702	0, /* tp_as_number */
				5703	&unicode_as_sequence, /* tp_as_sequence */
				5704	0, /* tp_as_mapping */
				5705	(hashfunc) unicode_hash, /* tp_hash*/
				5706	0, /* tp_call*/
				5707	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5708	PyObject_GenericGetAttr, /* tp_getattro */
				5709	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5710	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5711	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5712	unicode_doc, /* tp_doc */
				5713	0, /* tp_traverse */
				5714	0, /* tp_clear */
				5715	0, /* tp_richcompare */
				5716	0, /* tp_weaklistoffset */
				5717	0, /* tp_iter */
				5718	0, /* tp_iternext */
				5719	unicode_methods, /* tp_methods */
				5720	0, /* tp_members */
				5721	0, /* tp_getset */
				5722	0, /* tp_base */
				5723	0, /* tp_dict */
				5724	0, /* tp_descr_get */
				5725	0, /* tp_descr_set */
				5726	0, /* tp_dictoffset */
				5727	0, /* tp_init */
				5728	0, /* tp_alloc */
				5729	unicode_new, /* tp_new */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5730	};
				5731
				5732	/* Initialize the Unicode implementation */
				5733
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5734	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5735	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5736	int i;
				5737
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5738	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5739	unicode_freelist = NULL;
				5740	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5741	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5742	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5743	for (i = 0; i < 256; i++)
				5744	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5745	}
				5746
				5747	/* Finalize the Unicode implementation */
				5748
				5749	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5750	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5751	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5752	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5753	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5754
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5755	Py_XDECREF(unicode_empty);
				5756	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5757
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5758	for (i = 0; i < 256; i++) {
				5759	if (unicode_latin1[i]) {
				5760	Py_DECREF(unicode_latin1[i]);
				5761	unicode_latin1[i] = NULL;
				5762	}
				5763	}
				5764
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5765	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5766	PyUnicodeObject *v = u;
				5767	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5768	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5769	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5770	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5771	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5772	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5773	unicode_freelist = NULL;
				5774	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5775	}