Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 7dc370a48d423d8bb1f803666023f5383106c182 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   Objects placed on the free list keep their allocated Unicode buffer
   as long as their size is below this limit. This reduces malloc()
   overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	106	/* --- Unicode Object ----------------------------------------------------- */
				107
				108	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	109	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	110	int length)
				111	{
				112	void *oldstr;
				113
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	114	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	115	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	116	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	117
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	118	/* Resizing shared object (unicode_empty or single character
				119	objects) in-place is not allowed. Use PyUnicode_Resize()
				120	instead ! */
				121	if (unicode == unicode_empty \|\|
				122	(unicode->length == 1 &&
				123	unicode->str[0] < 256 &&
				124	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	125	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	126	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	return -1;
				128	}
				129
				130	/* We allocate one more byte to make sure the string is
				131	Ux0000 terminated -- XXX is this needed ? */
				132	oldstr = unicode->str;
				133	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				134	if (!unicode->str) {
				135	unicode->str = oldstr;
				136	PyErr_NoMemory();
				137	return -1;
				138	}
				139	unicode->str[length] = 0;
				140	unicode->length = length;
				141
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	142	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	143	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	144	if (unicode->defenc) {
				145	Py_DECREF(unicode->defenc);
				146	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	147	}
				148	unicode->hash = -1;
				149
				150	return 0;
				151	}
				152
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
				160
				161	static
				162	PyUnicodeObject *_PyUnicode_New(int length)
				163	{
				164	register PyUnicodeObject *unicode;
				165
				166	/* Optimization for empty strings */
				167	if (length == 0 && unicode_empty != NULL) {
				168	Py_INCREF(unicode_empty);
				169	return unicode_empty;
				170	}
				171
				172	/* Unicode freelist & memory allocation */
				173	if (unicode_freelist) {
				174	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	175	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	176	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	177	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	178	/* Keep-Alive optimization: we only upsize the buffer,
				179	never downsize it. */
				180	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	181	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	182	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	183	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	184	}
				185	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	186	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	187	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	188	}
				189	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	190	}
				191	else {
				192	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				193	if (unicode == NULL)
				194	return NULL;
				195	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				196	}
				197
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	198	if (!unicode->str) {
				199	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	200	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	201	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	unicode->str[length] = 0;
				203	unicode->length = length;
				204	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	205	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	206	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	207
				208	onError:
				209	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	210	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	211	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	212	}
				213
				214	static
				215	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				216	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	217	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	218	/* Keep-Alive optimization */
				219	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	220	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	221	unicode->str = NULL;
				222	unicode->length = 0;
				223	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	224	if (unicode->defenc) {
				225	Py_DECREF(unicode->defenc);
				226	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	227	}
				228	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	(PyUnicodeObject *)unicode = unicode_freelist;
				230	unicode_freelist = unicode;
				231	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	232	}
				233	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	234	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	235	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	236	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	}
				238	}
				239
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	240	int PyUnicode_Resize(PyObject **unicode,
				241	int length)
				242	{
				243	register PyUnicodeObject *v;
				244
				245	/* Argument checks */
				246	if (unicode == NULL) {
				247	PyErr_BadInternalCall();
				248	return -1;
				249	}
				250	v = (PyUnicodeObject )unicode;
				251	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				252	PyErr_BadInternalCall();
				253	return -1;
				254	}
				255
				256	/* Resizing unicode_empty and single character objects is not
				257	possible since these are being shared. We simply return a fresh
				258	copy with the same Unicode content. */
				259	if (v->length != length &&
				260	(v == unicode_empty \|\| v->length == 1)) {
				261	PyUnicodeObject *w = _PyUnicode_New(length);
				262	if (w == NULL)
				263	return -1;
				264	Py_UNICODE_COPY(w->str, v->str,
				265	length < v->length ? length : v->length);
				266	unicode = (PyObject )w;
				267	return 0;
				268	}
				269
				270	/* Note that we don't have to modify *unicode for unshared Unicode
				271	objects, since we can modify them in-place. */
				272	return unicode_resize(v, length);
				273	}
				274
				275	/* Internal API for use in unicodeobject.c only ! */
				276	#define _PyUnicode_Resize(unicodevar, length) \
				277	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				278
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	279	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				280	int size)
				281	{
				282	PyUnicodeObject *unicode;
				283
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	284	/* If the Unicode data is known at construction time, we can apply
				285	some optimizations which share commonly used objects. */
				286	if (u != NULL) {
				287
				288	/* Optimization for empty strings */
				289	if (size == 0 && unicode_empty != NULL) {
				290	Py_INCREF(unicode_empty);
				291	return (PyObject *)unicode_empty;
				292	}
				293
				294	/* Single character Unicode objects in the Latin-1 range are
				295	shared when using this constructor */
				296	if (size == 1 && *u < 256) {
				297	unicode = unicode_latin1[*u];
				298	if (!unicode) {
				299	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	300	if (!unicode)
				301	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	302	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	303	unicode_latin1[*u] = unicode;
				304	}
				305	Py_INCREF(unicode);
				306	return (PyObject *)unicode;
				307	}
				308	}
				309
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	310	unicode = _PyUnicode_New(size);
				311	if (!unicode)
				312	return NULL;
				313
				314	/* Copy the Unicode data into the new object */
				315	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	316	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	317
				318	return (PyObject *)unicode;
				319	}
				320
				321	#ifdef HAVE_WCHAR_H
				322
				323	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				324	int size)
				325	{
				326	PyUnicodeObject *unicode;
				327
				328	if (w == NULL) {
				329	PyErr_BadInternalCall();
				330	return NULL;
				331	}
				332
				333	unicode = _PyUnicode_New(size);
				334	if (!unicode)
				335	return NULL;
				336
				337	/* Copy the wchar_t data into the new object */
				338	#ifdef HAVE_USABLE_WCHAR_T
				339	memcpy(unicode->str, w, size * sizeof(wchar_t));
				340	#else
				341	{
				342	register Py_UNICODE *u;
				343	register int i;
				344	u = PyUnicode_AS_UNICODE(unicode);
				345	for (i = size; i >= 0; i--)
				346	u++ = w++;
				347	}
				348	#endif
				349
				350	return (PyObject *)unicode;
				351	}
				352
				353	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				354	register wchar_t *w,
				355	int size)
				356	{
				357	if (unicode == NULL) {
				358	PyErr_BadInternalCall();
				359	return -1;
				360	}
				361	if (size > PyUnicode_GET_SIZE(unicode))
				362	size = PyUnicode_GET_SIZE(unicode);
				363	#ifdef HAVE_USABLE_WCHAR_T
				364	memcpy(w, unicode->str, size * sizeof(wchar_t));
				365	#else
				366	{
				367	register Py_UNICODE *u;
				368	register int i;
				369	u = PyUnicode_AS_UNICODE(unicode);
				370	for (i = size; i >= 0; i--)
				371	w++ = u++;
				372	}
				373	#endif
				374
				375	return size;
				376	}
				377
				378	#endif
				379
				380	PyObject PyUnicode_FromObject(register PyObject obj)
				381	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	382	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				383	}
				384
				385	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				386	const char *encoding,
				387	const char *errors)
				388	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	389	const char *s;
				390	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	391	int owned = 0;
				392	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	393
				394	if (obj == NULL) {
				395	PyErr_BadInternalCall();
				396	return NULL;
				397	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	398
				399	/* Coerce object */
				400	if (PyInstance_Check(obj)) {
				401	PyObject *func;
				402	func = PyObject_GetAttrString(obj, "__str__");
				403	if (func == NULL) {
				404	PyErr_SetString(PyExc_TypeError,
				405	"coercing to Unicode: instance doesn't define __str__");
				406	return NULL;
				407	}
				408	obj = PyEval_CallObject(func, NULL);
				409	Py_DECREF(func);
				410	if (obj == NULL)
				411	return NULL;
				412	owned = 1;
				413	}
				414	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	415	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	416	v = obj;
				417	if (encoding) {
				418	PyErr_SetString(PyExc_TypeError,
				419	"decoding Unicode is not supported");
				420	return NULL;
				421	}
				422	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	423	}
				424	else if (PyString_Check(obj)) {
				425	s = PyString_AS_STRING(obj);
				426	len = PyString_GET_SIZE(obj);
				427	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	428	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				429	/* Overwrite the error message with something more useful in
				430	case of a TypeError. */
				431	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	432	PyErr_Format(PyExc_TypeError,
				433	"coercing to Unicode: need string or buffer, "
				434	"%.80s found",
				435	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	436	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	437	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	438
				439	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	440	if (len == 0) {
				441	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	442	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	443	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	444	else
				445	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	446
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	447	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	448	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	449	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	450	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	451	return v;
				452
				453	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	454	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	455	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	456	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	457	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	458	}
				459
				460	PyObject PyUnicode_Decode(const char s,
				461	int size,
				462	const char *encoding,
				463	const char *errors)
				464	{
				465	PyObject buffer = NULL, unicode;
				466
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	467	if (encoding == NULL)
				468	encoding = PyUnicode_GetDefaultEncoding();
				469
				470	/* Shortcuts for common default encodings */
				471	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	472	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	473	else if (strcmp(encoding, "latin-1") == 0)
				474	return PyUnicode_DecodeLatin1(s, size, errors);
				475	else if (strcmp(encoding, "ascii") == 0)
				476	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	477
				478	/* Decode via the codec registry */
				479	buffer = PyBuffer_FromMemory((void *)s, size);
				480	if (buffer == NULL)
				481	goto onError;
				482	unicode = PyCodec_Decode(buffer, encoding, errors);
				483	if (unicode == NULL)
				484	goto onError;
				485	if (!PyUnicode_Check(unicode)) {
				486	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	487	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	488	unicode->ob_type->tp_name);
				489	Py_DECREF(unicode);
				490	goto onError;
				491	}
				492	Py_DECREF(buffer);
				493	return unicode;
				494
				495	onError:
				496	Py_XDECREF(buffer);
				497	return NULL;
				498	}
				499
				500	PyObject PyUnicode_Encode(const Py_UNICODE s,
				501	int size,
				502	const char *encoding,
				503	const char *errors)
				504	{
				505	PyObject v, unicode;
				506
				507	unicode = PyUnicode_FromUnicode(s, size);
				508	if (unicode == NULL)
				509	return NULL;
				510	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				511	Py_DECREF(unicode);
				512	return v;
				513	}
				514
				515	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				516	const char *encoding,
				517	const char *errors)
				518	{
				519	PyObject *v;
				520
				521	if (!PyUnicode_Check(unicode)) {
				522	PyErr_BadArgument();
				523	goto onError;
				524	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	525
				526	if (encoding == NULL)
				527	encoding = PyUnicode_GetDefaultEncoding();
				528
				529	/* Shortcuts for common default encodings */
				530	if (errors == NULL) {
				531	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	532	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	533	else if (strcmp(encoding, "latin-1") == 0)
				534	return PyUnicode_AsLatin1String(unicode);
				535	else if (strcmp(encoding, "ascii") == 0)
				536	return PyUnicode_AsASCIIString(unicode);
				537	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	538
				539	/* Encode via the codec registry */
				540	v = PyCodec_Encode(unicode, encoding, errors);
				541	if (v == NULL)
				542	goto onError;
				543	/* XXX Should we really enforce this ? */
				544	if (!PyString_Check(v)) {
				545	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	546	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	547	v->ob_type->tp_name);
				548	Py_DECREF(v);
				549	goto onError;
				550	}
				551	return v;
				552
				553	onError:
				554	return NULL;
				555	}
				556
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	557	/* Return a Python string holding the default encoded value of the
				558	Unicode object.
				559
				560	The resulting string is cached in the Unicode object for subsequent
				561	usage by this function. The cached version is needed to implement
				562	the character buffer interface and will live (at least) as long as
				563	the Unicode object itself.
				564
				565	The refcount of the string is not incremented.
				566
				567	* Exported for internal use by the interpreter only !!! *
				568
				569	*/
				570
				571	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				572	const char *errors)
				573	{
				574	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				575
				576	if (v)
				577	return v;
				578	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				579	if (v && errors == NULL)
				580	((PyUnicodeObject *)unicode)->defenc = v;
				581	return v;
				582	}
				583
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	584	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				585	{
				586	if (!PyUnicode_Check(unicode)) {
				587	PyErr_BadArgument();
				588	goto onError;
				589	}
				590	return PyUnicode_AS_UNICODE(unicode);
				591
				592	onError:
				593	return NULL;
				594	}
				595
				596	int PyUnicode_GetSize(PyObject *unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_GET_SIZE(unicode);
				603
				604	onError:
				605	return -1;
				606	}
				607
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	608	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	609	{
				610	return unicode_default_encoding;
				611	}
				612
				613	int PyUnicode_SetDefaultEncoding(const char *encoding)
				614	{
				615	PyObject *v;
				616
				617	/* Make sure the encoding is valid. As side effect, this also
				618	loads the encoding into the codec registry cache. */
				619	v = _PyCodec_Lookup(encoding);
				620	if (v == NULL)
				621	goto onError;
				622	Py_DECREF(v);
				623	strncpy(unicode_default_encoding,
				624	encoding,
				625	sizeof(unicode_default_encoding));
				626	return 0;
				627
				628	onError:
				629	return -1;
				630	}
				631
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	632	/* --- UTF-8 Codec -------------------------------------------------------- */
				633
				634	static
				635	char utf8_code_length[256] = {
				636	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				637	illegal prefix. see RFC 2279 for details */
				638	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				639	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				640	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				641	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				642	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				643	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				644	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				645	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				646	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				647	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				648	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				649	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				650	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				651	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				652	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				653	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				654	};
				655
				656	static
				657	int utf8_decoding_error(const char **source,
				658	Py_UNICODE **dest,
				659	const char *errors,
				660	const char *details)
				661	{
				662	if ((errors == NULL) \|\|
				663	(strcmp(errors,"strict") == 0)) {
				664	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	665	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	666	details);
				667	return -1;
				668	}
				669	else if (strcmp(errors,"ignore") == 0) {
				670	(*source)++;
				671	return 0;
				672	}
				673	else if (strcmp(errors,"replace") == 0) {
				674	(*source)++;
				675	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				676	(*dest)++;
				677	return 0;
				678	}
				679	else {
				680	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	681	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	682	errors);
				683	return -1;
				684	}
				685	}
				686
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	687	PyObject PyUnicode_DecodeUTF8(const char s,
				688	int size,
				689	const char *errors)
				690	{
				691	int n;
				692	const char *e;
				693	PyUnicodeObject *unicode;
				694	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	695	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	696
				697	/* Note: size will always be longer than the resulting Unicode
				698	character count */
				699	unicode = _PyUnicode_New(size);
				700	if (!unicode)
				701	return NULL;
				702	if (size == 0)
				703	return (PyObject *)unicode;
				704
				705	/* Unpack UTF-8 encoded data */
				706	p = unicode->str;
				707	e = s + size;
				708
				709	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	710	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	711
				712	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	713	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	714	s++;
				715	continue;
				716	}
				717
				718	n = utf8_code_length[ch];
				719
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	720	if (s + n > e) {
				721	errmsg = "unexpected end of data";
				722	goto utf8Error;
				723	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	724
				725	switch (n) {
				726
				727	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	728	errmsg = "unexpected code byte";
				729	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	730
				731	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	732	errmsg = "internal error";
				733	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	734
				735	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	736	if ((s[1] & 0xc0) != 0x80) {
				737	errmsg = "invalid data";
				738	goto utf8Error;
				739	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	740	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	741	if (ch < 0x80) {
				742	errmsg = "illegal encoding";
				743	goto utf8Error;
				744	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	745	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	746	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	747	break;
				748
				749	case 3:
				750	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	751	(s[2] & 0xc0) != 0x80) {
				752	errmsg = "invalid data";
				753	goto utf8Error;
				754	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	755	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	756	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				757	errmsg = "illegal encoding";
				758	goto utf8Error;
				759	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	760	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	761	*p++ = (Py_UNICODE)ch;
				762	break;
				763
				764	case 4:
				765	if ((s[1] & 0xc0) != 0x80 \|\|
				766	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	767	(s[3] & 0xc0) != 0x80) {
				768	errmsg = "invalid data";
				769	goto utf8Error;
				770	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	771	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				772	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				773	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	774	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	775	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	776	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	777	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	778	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	779	errmsg = "illegal encoding";
				780	goto utf8Error;
				781	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	782	#if Py_UNICODE_SIZE == 4
				783	*p++ = (Py_UNICODE)ch;
				784	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	785	/* compute and append the two surrogates: */
				786
				787	/* translate from 10000..10FFFF to 0..FFFF */
				788	ch -= 0x10000;
				789
				790	/* high surrogate = top 10 bits added to D800 */
				791	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				792
				793	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	794	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	795	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	796	break;
				797
				798	default:
				799	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	800	errmsg = "unsupported Unicode code range";
				801	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	802	}
				803	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	804	continue;
				805
				806	utf8Error:
				807	if (utf8_decoding_error(&s, &p, errors, errmsg))
				808	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	809	}
				810
				811	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	812	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	813	goto onError;
				814
				815	return (PyObject *)unicode;
				816
				817	onError:
				818	Py_DECREF(unicode);
				819	return NULL;
				820	}
				821
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error policy named by `errors` to a UTF-8 encoding problem
   described by `details`.

   Returns 0 when encoding may continue ("ignore", or "replace" after a
   '?' has been written through *dest and *dest advanced), and -1 with
   an exception set otherwise.  `source` is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* No handler name means "strict" */
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	855
				856	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				857	int size,
				858	const char *errors)
				859	{
				860	PyObject *v;
				861	char *p;
				862	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	863	Py_UCS4 ch2;
				864	unsigned int cbAllocated = 3 * size;
				865	unsigned int cbWritten = 0;
				866	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	867
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	868	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	869	if (v == NULL)
				870	return NULL;
				871	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	872	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	873
				874	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	875	while (i < size) {
				876	Py_UCS4 ch = s[i++];
				877	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	878	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	879	cbWritten++;
				880	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	881	else if (ch < 0x0800) {
				882	*p++ = 0xc0 \| (ch >> 6);
				883	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	884	cbWritten += 2;
				885	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	886	else if (ch < 0x10000) {
				887	#if Py_UNICODE_SIZE == 4
				888	*p++ = 0xe0 \| (ch>>12);
				889	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				890	*p++ = 0x80 \| (ch & 0x3f);
				891	cbWritten += 3;
				892	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	893	/* Check for high surrogate */
				894	if (0xD800 <= ch && ch <= 0xDBFF) {
				895	if (i != size) {
				896	ch2 = s[i];
				897	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				898
				899	if (cbWritten >= (cbAllocated - 4)) {
				900	/* Provide enough room for some more
				901	surrogates */
				902	cbAllocated += 4*10;
				903	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	904	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	905	}
				906
				907	/* combine the two values */
				908	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				909
				910	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	911	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	912	i++;
				913	cbWritten += 4;
				914	}
				915	}
				916	}
				917	else {
				918	*p++ = (char)(0xe0 \| (ch >> 12));
				919	cbWritten += 3;
				920	}
				921	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				922	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	923	#endif
				924	} else {
				925	*p++ = 0xf0 \| (ch>>18);
				926	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				927	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				928	*p++ = 0x80 \| (ch & 0x3f);
				929	cbWritten += 4;
				930	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	931	}
				932	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	933	if (_PyString_Resize(&v, p - q))
				934	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	935	return v;
				936
				937	onError:
				938	Py_DECREF(v);
				939	return NULL;
				940	}
				941
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	942	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				943	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	944	if (!PyUnicode_Check(unicode)) {
				945	PyErr_BadArgument();
				946	return NULL;
				947	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	948	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				949	PyUnicode_GET_SIZE(unicode),
				950	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	951	}
				952
				953	/* --- UTF-16 Codec ------------------------------------------------------- */
				954
				955	static
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	956	int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	957	Py_UNICODE **dest,
				958	const char *errors,
				959	const char *details)
				960	{
				961	if ((errors == NULL) \|\|
				962	(strcmp(errors,"strict") == 0)) {
				963	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	964	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	965	details);
				966	return -1;
				967	}
				968	else if (strcmp(errors,"ignore") == 0) {
				969	return 0;
				970	}
				971	else if (strcmp(errors,"replace") == 0) {
				972	if (dest) {
				973	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				974	(*dest)++;
				975	}
				976	return 0;
				977	}
				978	else {
				979	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	980	"UTF-16 decoding error; "
				981	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	982	errors);
				983	return -1;
				984	}
				985	}
				986
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	987	PyObject PyUnicode_DecodeUTF16(const char s,
				988	int size,
				989	const char *errors,
				990	int *byteorder)
				991	{
				992	PyUnicodeObject *unicode;
				993	Py_UNICODE *p;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	994	const Py_UCS2 q, e;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	995	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	996	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	997
				998	/* size should be an even number */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	999	if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1000	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				1001	return NULL;
				1002	/* The remaining input chars are ignored if we fall through
				1003	here... */
				1004	}
				1005
				1006	/* Note: size will always be longer than the resulting Unicode
				1007	character count */
				1008	unicode = _PyUnicode_New(size);
				1009	if (!unicode)
				1010	return NULL;
				1011	if (size == 0)
				1012	return (PyObject *)unicode;
				1013
				1014	/* Unpack UTF-16 encoded data */
				1015	p = unicode->str;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1016	q = (Py_UCS2 *)s;
				1017	e = q + (size / sizeof(Py_UCS2));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1018
				1019	if (byteorder)
				1020	bo = *byteorder;
				1021
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1022	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1023	byte order setting accordingly. In native mode, the leading BOM
				1024	mark is skipped, in all other modes, it is copied to the output
				1025	stream as-is (giving a ZWNBSP character). */
				1026	if (bo == 0) {
				1027	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1028	if (*q == 0xFEFF) {
				1029	q++;
				1030	bo = -1;
				1031	} else if (*q == 0xFFFE) {
				1032	q++;
				1033	bo = 1;
				1034	}
				1035	#else
				1036	if (*q == 0xFEFF) {
				1037	q++;
				1038	bo = 1;
				1039	} else if (*q == 0xFFFE) {
				1040	q++;
				1041	bo = -1;
				1042	}
				1043	#endif
				1044	}
				1045
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1046	while (q < e) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1047	register Py_UCS2 ch = *q++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1048
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1049	/* Swap input bytes if needed. (This assumes
				1050	sizeof(Py_UNICODE) == 2 !) */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1051	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1052	if (bo == 1)
				1053	ch = (ch >> 8) \| (ch << 8);
				1054	#else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1055	if (bo == -1)
				1056	ch = (ch >> 8) \| (ch << 8);
				1057	#endif
				1058	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1059	*p++ = ch;
				1060	continue;
				1061	}
				1062
				1063	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1064	if (q >= e) {
				1065	errmsg = "unexpected end of data";
				1066	goto utf16Error;
				1067	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1068	if (0xDC00 <= q && q <= 0xDFFF) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1069	Py_UCS2 ch2 = *q++;
				1070	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1071	if (bo == 1)
				1072	ch = (ch >> 8) \| (ch << 8);
				1073	#else
				1074	if (bo == -1)
				1075	ch = (ch >> 8) \| (ch << 8);
				1076	#endif
				1077	if (0xD800 <= ch && ch <= 0xDBFF) {
				1078	#if Py_UNICODE_SIZE == 2
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	/* This is valid data (a UTF-16 surrogate pair), but
				1080	we are not able to store this information since our
				1081	Py_UNICODE type only has 16 bits... this might
				1082	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1083	errmsg = "code pairs are not supported";
				1084	goto utf16Error;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1085	#else
				1086	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1087	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1088	#endif
				1089
				1090	}
				1091	else {
				1092	errmsg = "illegal UTF-16 surrogate";
				1093	goto utf16Error;
				1094	}
				1095
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1096	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1097	errmsg = "illegal encoding";
				1098	/* Fall through to report the error */
				1099
				1100	utf16Error:
				1101	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1102	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1103	}
				1104
				1105	if (byteorder)
				1106	*byteorder = bo;
				1107
				1108	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1109	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1110	goto onError;
				1111
				1112	return (PyObject *)unicode;
				1113
				1114	onError:
				1115	Py_DECREF(unicode);
				1116	return NULL;
				1117	}
				1118
				1119	#undef UTF16_ERROR
				1120
				1121	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1122	int size,
				1123	const char *errors,
				1124	int byteorder)
				1125	{
				1126	PyObject *v;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1127	Py_UCS2 *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	char *q;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1129	int i, pairs, doswap = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1131	for (i = pairs = 0; i < size; i++)
				1132	if (s[i] >= 0x10000)
				1133	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1134	v = PyString_FromStringAndSize(NULL,
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1135	sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1136	if (v == NULL)
				1137	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1138
				1139	q = PyString_AS_STRING(v);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1140	p = (Py_UCS2 *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1141	if (byteorder == 0)
				1142	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1143	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1144	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1145	if (byteorder == 0 \|\|
				1146	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1147	byteorder == -1
				1148	#else
				1149	byteorder == 1
				1150	#endif
				1151	)
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1152	doswap = 0;
				1153	while (size-- > 0) {
				1154	Py_UNICODE ch = *s++;
				1155	Py_UNICODE ch2 = 0;
				1156	if (ch >= 0x10000) {
				1157	ch2 = 0xDC00\|((ch-0x10000) & 0x3FF);
				1158	ch = 0xD800\|((ch-0x10000)>>10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1160	if (doswap){
				1161	*p++ = (ch >> 8) \| (ch << 8);
				1162	if (ch2)
				1163	*p++ = (ch2 >> 8) \| (ch2 << 8);
				1164	}else{
				1165	*p++ = ch;
				1166	if(ch2)
				1167	*p++ = ch2;
				1168	}
				1169	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1170	return v;
				1171	}
				1172
				1173	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1174	{
				1175	if (!PyUnicode_Check(unicode)) {
				1176	PyErr_BadArgument();
				1177	return NULL;
				1178	}
				1179	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1180	PyUnicode_GET_SIZE(unicode),
				1181	NULL,
				1182	0);
				1183	}
				1184
				1185	/* --- Unicode Escape Codec ----------------------------------------------- */
				1186
				1187	static
				1188	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1189	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1190	const char *errors,
				1191	const char *details)
				1192	{
				1193	if ((errors == NULL) \|\|
				1194	(strcmp(errors,"strict") == 0)) {
				1195	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1196	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1197	details);
				1198	return -1;
				1199	}
				1200	else if (strcmp(errors,"ignore") == 0) {
				1201	return 0;
				1202	}
				1203	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1204	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1205	return 0;
				1206	}
				1207	else {
				1208	PyErr_Format(PyExc_ValueError,
				1209	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1210	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1211	errors);
				1212	return -1;
				1213	}
				1214	}
				1215
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1216	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1217
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1218	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1219	int size,
				1220	const char *errors)
				1221	{
				1222	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1223	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1224	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1225	char* message;
				1226	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1227
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1228	/* Escaped strings will always be longer than the resulting
				1229	Unicode string, so we start with size here and then reduce the
				1230	length after conversion to the true value. */
				1231	v = _PyUnicode_New(size);
				1232	if (v == NULL)
				1233	goto onError;
				1234	if (size == 0)
				1235	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1236
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1237	p = buf = PyUnicode_AS_UNICODE(v);
				1238	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1239
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1240	while (s < end) {
				1241	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1242	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1243	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1244
				1245	/* Non-escape characters are interpreted as Unicode ordinals */
				1246	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1247	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1248	continue;
				1249	}
				1250
				1251	/* \ - Escapes */
				1252	s++;
				1253	switch (*s++) {
				1254
				1255	/* \x escapes */
				1256	case '\n': break;
				1257	case '\\': *p++ = '\\'; break;
				1258	case '\'': *p++ = '\''; break;
				1259	case '\"': *p++ = '\"'; break;
				1260	case 'b': *p++ = '\b'; break;
				1261	case 'f': p++ = '\014'; break; / FF */
				1262	case 't': *p++ = '\t'; break;
				1263	case 'n': *p++ = '\n'; break;
				1264	case 'r': *p++ = '\r'; break;
				1265	case 'v': p++ = '\013'; break; / VT */
				1266	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1267
				1268	/* \OOO (octal) escapes */
				1269	case '0': case '1': case '2': case '3':
				1270	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1271	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1272	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1273	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1274	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1275	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1276	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1277	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1278	break;
				1279
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1280	/* hex escapes */
				1281	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1282	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1283	digits = 2;
				1284	message = "truncated \\xXX escape";
				1285	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1286
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1287	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1289	digits = 4;
				1290	message = "truncated \\uXXXX escape";
				1291	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1292
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1293	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1294	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1295	digits = 8;
				1296	message = "truncated \\UXXXXXXXX escape";
				1297	hexescape:
				1298	chr = 0;
				1299	for (i = 0; i < digits; i++) {
				1300	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1301	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1302	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1303	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1304	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1305	i++;
				1306	break;
				1307	}
				1308	chr = (chr<<4) & ~0xF;
				1309	if (c >= '0' && c <= '9')
				1310	chr += c - '0';
				1311	else if (c >= 'a' && c <= 'f')
				1312	chr += 10 + c - 'a';
				1313	else
				1314	chr += 10 + c - 'A';
				1315	}
				1316	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1317	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1318	/* when we get here, chr is a 32-bit unicode character */
				1319	if (chr <= 0xffff)
				1320	/* UCS-2 character */
				1321	*p++ = (Py_UNICODE) chr;
				1322	else if (chr <= 0x10ffff) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1323	/* UCS-4 character. Either store directly, or as surrogate pair. */
				1324	#if Py_UNICODE_SIZE == 4
				1325	*p++ = chr;
				1326	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1327	chr -= 0x10000L;
				1328	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1329	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1330	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1331	} else {
				1332	if (unicodeescape_decoding_error(
				1333	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1334	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1335	)
				1336	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1337	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1338	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1339	break;
				1340
				1341	/* \N{name} */
				1342	case 'N':
				1343	message = "malformed \\N character escape";
				1344	if (ucnhash_CAPI == NULL) {
				1345	/* load the unicode data module */
				1346	PyObject m, v;
				1347	m = PyImport_ImportModule("unicodedata");
				1348	if (m == NULL)
				1349	goto ucnhashError;
				1350	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1351	Py_DECREF(m);
				1352	if (v == NULL)
				1353	goto ucnhashError;
				1354	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1355	Py_DECREF(v);
				1356	if (ucnhash_CAPI == NULL)
				1357	goto ucnhashError;
				1358	}
				1359	if (*s == '{') {
				1360	const char *start = s+1;
				1361	/* look for the closing brace */
				1362	while (*s != '}' && s < end)
				1363	s++;
				1364	if (s > start && s < end && *s == '}') {
				1365	/* found a name. look it up in the unicode database */
				1366	message = "unknown Unicode character name";
				1367	s++;
				1368	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1369	goto store;
				1370	}
				1371	}
				1372	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1373	goto onError;
				1374	*p++ = x;
				1375	break;
				1376
				1377	default:
				1378	*p++ = '\\';
				1379	*p++ = (unsigned char)s[-1];
				1380	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1381	}
				1382	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1383	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1384	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1385	return (PyObject *)v;
				1386
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1387	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1388	PyErr_SetString(
				1389	PyExc_UnicodeError,
				1390	"\\N escapes not supported (can't load unicodedata module)"
				1391	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1392	return NULL;
				1393
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1394	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1395	Py_XDECREF(v);
				1396	return NULL;
				1397	}
				1398
				1399	/* Return a Unicode-Escape string version of the Unicode object.
				1400
				1401	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1402	appropriate.
				1403
				1404	*/
				1405
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1406	static const Py_UNICODE findchar(const Py_UNICODE s,
				1407	int size,
				1408	Py_UNICODE ch);
				1409
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1410	static
				1411	PyObject unicodeescape_string(const Py_UNICODE s,
				1412	int size,
				1413	int quotes)
				1414	{
				1415	PyObject *repr;
				1416	char *p;
				1417	char *q;
				1418
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1419	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1420
				1421	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1422	if (repr == NULL)
				1423	return NULL;
				1424
				1425	p = q = PyString_AS_STRING(repr);
				1426
				1427	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1428	*p++ = 'u';
				1429	*p++ = (findchar(s, size, '\'') &&
				1430	!findchar(s, size, '"')) ? '"' : '\'';
				1431	}
				1432	while (size-- > 0) {
				1433	Py_UNICODE ch = *s++;
				1434	/* Escape quotes */
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1435	if (quotes && (ch == (Py_UNICODE) q[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1436	*p++ = '\\';
				1437	*p++ = (char) ch;
				1438	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame^]	1439	/* Map 21-bit characters to '\U00xxxxxx' */
				1440	else if (ch >= 0x10000) {
				1441	*p++ = '\\';
				1442	*p++ = 'U';
				1443	*p++ = hexdigit[(ch >> 28) & 0xf];
				1444	*p++ = hexdigit[(ch >> 24) & 0xf];
				1445	*p++ = hexdigit[(ch >> 20) & 0xf];
				1446	*p++ = hexdigit[(ch >> 16) & 0xf];
				1447	*p++ = hexdigit[(ch >> 12) & 0xf];
				1448	*p++ = hexdigit[(ch >> 8) & 0xf];
				1449	*p++ = hexdigit[(ch >> 4) & 0xf];
				1450	*p++ = hexdigit[ch & 15];
				1451	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1452	/* Map 16-bit characters to '\uxxxx' */
				1453	else if (ch >= 256) {
				1454	*p++ = '\\';
				1455	*p++ = 'u';
				1456	*p++ = hexdigit[(ch >> 12) & 0xf];
				1457	*p++ = hexdigit[(ch >> 8) & 0xf];
				1458	*p++ = hexdigit[(ch >> 4) & 0xf];
				1459	*p++ = hexdigit[ch & 15];
				1460	}
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1461	/* Map special whitespace to '\t', \n', '\r' */
				1462	else if (ch == '\t') {
				1463	*p++ = '\\';
				1464	*p++ = 't';
				1465	}
				1466	else if (ch == '\n') {
				1467	*p++ = '\\';
				1468	*p++ = 'n';
				1469	}
				1470	else if (ch == '\r') {
				1471	*p++ = '\\';
				1472	*p++ = 'r';
				1473	}
				1474	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1475	else if (ch < ' ' \|\| ch >= 128) {
				1476	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1477	*p++ = 'x';
				1478	*p++ = hexdigit[(ch >> 4) & 0xf];
				1479	*p++ = hexdigit[ch & 15];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1480	}
				1481	/* Copy everything else as-is */
				1482	else
				1483	*p++ = (char) ch;
				1484	}
				1485	if (quotes)
				1486	*p++ = q[1];
				1487
				1488	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1489	if (_PyString_Resize(&repr, p - q))
				1490	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1491
				1492	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1493
				1494	onError:
				1495	Py_DECREF(repr);
				1496	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1497	}
				1498
				1499	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1500	int size)
				1501	{
				1502	return unicodeescape_string(s, size, 0);
				1503	}
				1504
				1505	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1506	{
				1507	if (!PyUnicode_Check(unicode)) {
				1508	PyErr_BadArgument();
				1509	return NULL;
				1510	}
				1511	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1512	PyUnicode_GET_SIZE(unicode));
				1513	}
				1514
				1515	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1516
				1517	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1518	int size,
				1519	const char *errors)
				1520	{
				1521	PyUnicodeObject *v;
				1522	Py_UNICODE p, buf;
				1523	const char *end;
				1524	const char *bs;
				1525
				1526	/* Escaped strings will always be longer than the resulting
				1527	Unicode string, so we start with size here and then reduce the
				1528	length after conversion to the true value. */
				1529	v = _PyUnicode_New(size);
				1530	if (v == NULL)
				1531	goto onError;
				1532	if (size == 0)
				1533	return (PyObject *)v;
				1534	p = buf = PyUnicode_AS_UNICODE(v);
				1535	end = s + size;
				1536	while (s < end) {
				1537	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1538	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1539	int i;
				1540
				1541	/* Non-escape characters are interpreted as Unicode ordinals */
				1542	if (*s != '\\') {
				1543	p++ = (unsigned char)s++;
				1544	continue;
				1545	}
				1546
				1547	/* \u-escapes are only interpreted iff the number of leading
				1548	backslashes if odd */
				1549	bs = s;
				1550	for (;s < end;) {
				1551	if (*s != '\\')
				1552	break;
				1553	p++ = (unsigned char)s++;
				1554	}
				1555	if (((s - bs) & 1) == 0 \|\|
				1556	s >= end \|\|
				1557	*s != 'u') {
				1558	continue;
				1559	}
				1560	p--;
				1561	s++;
				1562
				1563	/* \uXXXX with 4 hex digits */
				1564	for (x = 0, i = 0; i < 4; i++) {
				1565	c = (unsigned char)s[i];
				1566	if (!isxdigit(c)) {
				1567	if (unicodeescape_decoding_error(&s, &x, errors,
				1568	"truncated \\uXXXX"))
				1569	goto onError;
				1570	i++;
				1571	break;
				1572	}
				1573	x = (x<<4) & ~0xF;
				1574	if (c >= '0' && c <= '9')
				1575	x += c - '0';
				1576	else if (c >= 'a' && c <= 'f')
				1577	x += 10 + c - 'a';
				1578	else
				1579	x += 10 + c - 'A';
				1580	}
				1581	s += i;
				1582	*p++ = x;
				1583	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1584	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1585	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1586	return (PyObject *)v;
				1587
				1588	onError:
				1589	Py_XDECREF(v);
				1590	return NULL;
				1591	}
				1592
				1593	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1594	int size)
				1595	{
				1596	PyObject *repr;
				1597	char *p;
				1598	char *q;
				1599
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1600	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1601
				1602	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1603	if (repr == NULL)
				1604	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1605	if (size == 0)
				1606	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1607
				1608	p = q = PyString_AS_STRING(repr);
				1609	while (size-- > 0) {
				1610	Py_UNICODE ch = *s++;
				1611	/* Map 16-bit characters to '\uxxxx' */
				1612	if (ch >= 256) {
				1613	*p++ = '\\';
				1614	*p++ = 'u';
				1615	*p++ = hexdigit[(ch >> 12) & 0xf];
				1616	*p++ = hexdigit[(ch >> 8) & 0xf];
				1617	*p++ = hexdigit[(ch >> 4) & 0xf];
				1618	*p++ = hexdigit[ch & 15];
				1619	}
				1620	/* Copy everything else as-is */
				1621	else
				1622	*p++ = (char) ch;
				1623	}
				1624	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1625	if (_PyString_Resize(&repr, p - q))
				1626	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1627
				1628	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1629
				1630	onError:
				1631	Py_DECREF(repr);
				1632	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1633	}
				1634
				1635	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1636	{
				1637	if (!PyUnicode_Check(unicode)) {
				1638	PyErr_BadArgument();
				1639	return NULL;
				1640	}
				1641	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1642	PyUnicode_GET_SIZE(unicode));
				1643	}
				1644
				1645	/* --- Latin-1 Codec ------------------------------------------------------ */
				1646
				1647	PyObject PyUnicode_DecodeLatin1(const char s,
				1648	int size,
				1649	const char *errors)
				1650	{
				1651	PyUnicodeObject *v;
				1652	Py_UNICODE *p;
				1653
				1654	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1655	if (size == 1 && (unsigned char)s < 256) {
				1656	Py_UNICODE r = (unsigned char)s;
				1657	return PyUnicode_FromUnicode(&r, 1);
				1658	}
				1659
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1660	v = _PyUnicode_New(size);
				1661	if (v == NULL)
				1662	goto onError;
				1663	if (size == 0)
				1664	return (PyObject *)v;
				1665	p = PyUnicode_AS_UNICODE(v);
				1666	while (size-- > 0)
				1667	p++ = (unsigned char)s++;
				1668	return (PyObject *)v;
				1669
				1670	onError:
				1671	Py_XDECREF(v);
				1672	return NULL;
				1673	}
				1674
				1675	static
				1676	int latin1_encoding_error(const Py_UNICODE **source,
				1677	char **dest,
				1678	const char *errors,
				1679	const char *details)
				1680	{
				1681	if ((errors == NULL) \|\|
				1682	(strcmp(errors,"strict") == 0)) {
				1683	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1684	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1685	details);
				1686	return -1;
				1687	}
				1688	else if (strcmp(errors,"ignore") == 0) {
				1689	return 0;
				1690	}
				1691	else if (strcmp(errors,"replace") == 0) {
				1692	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1693	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1694	return 0;
				1695	}
				1696	else {
				1697	PyErr_Format(PyExc_ValueError,
				1698	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1699	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1700	errors);
				1701	return -1;
				1702	}
				1703	}
				1704
				1705	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1706	int size,
				1707	const char *errors)
				1708	{
				1709	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1710	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1711
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1712	repr = PyString_FromStringAndSize(NULL, size);
				1713	if (repr == NULL)
				1714	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1715	if (size == 0)
				1716	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1717
				1718	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1719	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1720	while (size-- > 0) {
				1721	Py_UNICODE ch = *p++;
				1722	if (ch >= 256) {
				1723	if (latin1_encoding_error(&p, &s, errors,
				1724	"ordinal not in range(256)"))
				1725	goto onError;
				1726	}
				1727	else
				1728	*s++ = (char)ch;
				1729	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1730	/* Resize if error handling skipped some characters */
				1731	if (s - start < PyString_GET_SIZE(repr))
				1732	if (_PyString_Resize(&repr, s - start))
				1733	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1734	return repr;
				1735
				1736	onError:
				1737	Py_DECREF(repr);
				1738	return NULL;
				1739	}
				1740
				1741	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1742	{
				1743	if (!PyUnicode_Check(unicode)) {
				1744	PyErr_BadArgument();
				1745	return NULL;
				1746	}
				1747	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1748	PyUnicode_GET_SIZE(unicode),
				1749	NULL);
				1750	}
				1751
				1752	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1753
				1754	static
				1755	int ascii_decoding_error(const char **source,
				1756	Py_UNICODE **dest,
				1757	const char *errors,
				1758	const char *details)
				1759	{
				1760	if ((errors == NULL) \|\|
				1761	(strcmp(errors,"strict") == 0)) {
				1762	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1763	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1764	details);
				1765	return -1;
				1766	}
				1767	else if (strcmp(errors,"ignore") == 0) {
				1768	return 0;
				1769	}
				1770	else if (strcmp(errors,"replace") == 0) {
				1771	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1772	(*dest)++;
				1773	return 0;
				1774	}
				1775	else {
				1776	PyErr_Format(PyExc_ValueError,
				1777	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1778	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1779	errors);
				1780	return -1;
				1781	}
				1782	}
				1783
				1784	PyObject PyUnicode_DecodeASCII(const char s,
				1785	int size,
				1786	const char *errors)
				1787	{
				1788	PyUnicodeObject *v;
				1789	Py_UNICODE *p;
				1790
				1791	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1792	if (size == 1 && (unsigned char)s < 128) {
				1793	Py_UNICODE r = (unsigned char)s;
				1794	return PyUnicode_FromUnicode(&r, 1);
				1795	}
				1796
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1797	v = _PyUnicode_New(size);
				1798	if (v == NULL)
				1799	goto onError;
				1800	if (size == 0)
				1801	return (PyObject *)v;
				1802	p = PyUnicode_AS_UNICODE(v);
				1803	while (size-- > 0) {
				1804	register unsigned char c;
				1805
				1806	c = (unsigned char)*s++;
				1807	if (c < 128)
				1808	*p++ = c;
				1809	else if (ascii_decoding_error(&s, &p, errors,
				1810	"ordinal not in range(128)"))
				1811	goto onError;
				1812	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1813	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1814	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1815	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1816	return (PyObject *)v;
				1817
				1818	onError:
				1819	Py_XDECREF(v);
				1820	return NULL;
				1821	}
				1822
				1823	static
				1824	int ascii_encoding_error(const Py_UNICODE **source,
				1825	char **dest,
				1826	const char *errors,
				1827	const char *details)
				1828	{
				1829	if ((errors == NULL) \|\|
				1830	(strcmp(errors,"strict") == 0)) {
				1831	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1832	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1833	details);
				1834	return -1;
				1835	}
				1836	else if (strcmp(errors,"ignore") == 0) {
				1837	return 0;
				1838	}
				1839	else if (strcmp(errors,"replace") == 0) {
				1840	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1841	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1842	return 0;
				1843	}
				1844	else {
				1845	PyErr_Format(PyExc_ValueError,
				1846	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1847	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1848	errors);
				1849	return -1;
				1850	}
				1851	}
				1852
				1853	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1854	int size,
				1855	const char *errors)
				1856	{
				1857	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1858	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1859
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1860	repr = PyString_FromStringAndSize(NULL, size);
				1861	if (repr == NULL)
				1862	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1863	if (size == 0)
				1864	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1865
				1866	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1867	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1868	while (size-- > 0) {
				1869	Py_UNICODE ch = *p++;
				1870	if (ch >= 128) {
				1871	if (ascii_encoding_error(&p, &s, errors,
				1872	"ordinal not in range(128)"))
				1873	goto onError;
				1874	}
				1875	else
				1876	*s++ = (char)ch;
				1877	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1878	/* Resize if error handling skipped some characters */
				1879	if (s - start < PyString_GET_SIZE(repr))
				1880	if (_PyString_Resize(&repr, s - start))
				1881	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1882	return repr;
				1883
				1884	onError:
				1885	Py_DECREF(repr);
				1886	return NULL;
				1887	}
				1888
				1889	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1890	{
				1891	if (!PyUnicode_Check(unicode)) {
				1892	PyErr_BadArgument();
				1893	return NULL;
				1894	}
				1895	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1896	PyUnicode_GET_SIZE(unicode),
				1897	NULL);
				1898	}
				1899
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1900	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1901
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1902	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1903
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1904	PyObject PyUnicode_DecodeMBCS(const char s,
				1905	int size,
				1906	const char *errors)
				1907	{
				1908	PyUnicodeObject *v;
				1909	Py_UNICODE *p;
				1910
				1911	/* First get the size of the result */
				1912	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1913	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1914	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1915
				1916	v = _PyUnicode_New(usize);
				1917	if (v == NULL)
				1918	return NULL;
				1919	if (usize == 0)
				1920	return (PyObject *)v;
				1921	p = PyUnicode_AS_UNICODE(v);
				1922	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1923	Py_DECREF(v);
				1924	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1925	}
				1926
				1927	return (PyObject *)v;
				1928	}
				1929
				1930	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1931	int size,
				1932	const char *errors)
				1933	{
				1934	PyObject *repr;
				1935	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1936	DWORD mbcssize;
				1937
				1938	/* If there are no characters, bail now! */
				1939	if (size==0)
				1940	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1941
				1942	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1943	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1944	if (mbcssize==0)
				1945	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1946
				1947	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1948	if (repr == NULL)
				1949	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1950	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1951	return repr;
				1952
				1953	/* Do the conversion */
				1954	s = PyString_AS_STRING(repr);
				1955	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1956	Py_DECREF(repr);
				1957	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1958	}
				1959	return repr;
				1960	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1961
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1962	#endif /* MS_WIN32 */
				1963
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1964	/* --- Character Mapping Codec -------------------------------------------- */
				1965
				1966	static
				1967	int charmap_decoding_error(const char **source,
				1968	Py_UNICODE **dest,
				1969	const char *errors,
				1970	const char *details)
				1971	{
				1972	if ((errors == NULL) \|\|
				1973	(strcmp(errors,"strict") == 0)) {
				1974	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1975	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1976	details);
				1977	return -1;
				1978	}
				1979	else if (strcmp(errors,"ignore") == 0) {
				1980	return 0;
				1981	}
				1982	else if (strcmp(errors,"replace") == 0) {
				1983	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1984	(*dest)++;
				1985	return 0;
				1986	}
				1987	else {
				1988	PyErr_Format(PyExc_ValueError,
				1989	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1990	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1991	errors);
				1992	return -1;
				1993	}
				1994	}
				1995
				1996	PyObject PyUnicode_DecodeCharmap(const char s,
				1997	int size,
				1998	PyObject *mapping,
				1999	const char *errors)
				2000	{
				2001	PyUnicodeObject *v;
				2002	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2003	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2004
				2005	/* Default to Latin-1 */
				2006	if (mapping == NULL)
				2007	return PyUnicode_DecodeLatin1(s, size, errors);
				2008
				2009	v = _PyUnicode_New(size);
				2010	if (v == NULL)
				2011	goto onError;
				2012	if (size == 0)
				2013	return (PyObject *)v;
				2014	p = PyUnicode_AS_UNICODE(v);
				2015	while (size-- > 0) {
				2016	unsigned char ch = *s++;
				2017	PyObject w, x;
				2018
				2019	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2020	w = PyInt_FromLong((long)ch);
				2021	if (w == NULL)
				2022	goto onError;
				2023	x = PyObject_GetItem(mapping, w);
				2024	Py_DECREF(w);
				2025	if (x == NULL) {
				2026	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2027	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2028	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2029	x = Py_None;
				2030	Py_INCREF(x);
				2031	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2032	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2033	}
				2034
				2035	/* Apply mapping */
				2036	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2037	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2038	if (value < 0 \|\| value > 65535) {
				2039	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2040	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2041	Py_DECREF(x);
				2042	goto onError;
				2043	}
				2044	*p++ = (Py_UNICODE)value;
				2045	}
				2046	else if (x == Py_None) {
				2047	/* undefined mapping */
				2048	if (charmap_decoding_error(&s, &p, errors,
				2049	"character maps to <undefined>")) {
				2050	Py_DECREF(x);
				2051	goto onError;
				2052	}
				2053	}
				2054	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2055	int targetsize = PyUnicode_GET_SIZE(x);
				2056
				2057	if (targetsize == 1)
				2058	/* 1-1 mapping */
				2059	p++ = PyUnicode_AS_UNICODE(x);
				2060
				2061	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2062	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2063	if (targetsize > extrachars) {
				2064	/* resize first */
				2065	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2066	int needed = (targetsize - extrachars) + \
				2067	(targetsize << 2);
				2068	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2069	if (_PyUnicode_Resize(&v,
				2070	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2071	Py_DECREF(x);
				2072	goto onError;
				2073	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2074	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2075	}
				2076	Py_UNICODE_COPY(p,
				2077	PyUnicode_AS_UNICODE(x),
				2078	targetsize);
				2079	p += targetsize;
				2080	extrachars -= targetsize;
				2081	}
				2082	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2083	}
				2084	else {
				2085	/* wrong return value */
				2086	PyErr_SetString(PyExc_TypeError,
				2087	"character mapping must return integer, None or unicode");
				2088	Py_DECREF(x);
				2089	goto onError;
				2090	}
				2091	Py_DECREF(x);
				2092	}
				2093	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2094	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2095	goto onError;
				2096	return (PyObject *)v;
				2097
				2098	onError:
				2099	Py_XDECREF(v);
				2100	return NULL;
				2101	}
				2102
				2103	static
				2104	int charmap_encoding_error(const Py_UNICODE **source,
				2105	char **dest,
				2106	const char *errors,
				2107	const char *details)
				2108	{
				2109	if ((errors == NULL) \|\|
				2110	(strcmp(errors,"strict") == 0)) {
				2111	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2112	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2113	details);
				2114	return -1;
				2115	}
				2116	else if (strcmp(errors,"ignore") == 0) {
				2117	return 0;
				2118	}
				2119	else if (strcmp(errors,"replace") == 0) {
				2120	**dest = '?';
				2121	(*dest)++;
				2122	return 0;
				2123	}
				2124	else {
				2125	PyErr_Format(PyExc_ValueError,
				2126	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2127	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2128	errors);
				2129	return -1;
				2130	}
				2131	}
				2132
				2133	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2134	int size,
				2135	PyObject *mapping,
				2136	const char *errors)
				2137	{
				2138	PyObject *v;
				2139	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2140	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2141
				2142	/* Default to Latin-1 */
				2143	if (mapping == NULL)
				2144	return PyUnicode_EncodeLatin1(p, size, errors);
				2145
				2146	v = PyString_FromStringAndSize(NULL, size);
				2147	if (v == NULL)
				2148	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2149	if (size == 0)
				2150	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2151	s = PyString_AS_STRING(v);
				2152	while (size-- > 0) {
				2153	Py_UNICODE ch = *p++;
				2154	PyObject w, x;
				2155
				2156	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2157	w = PyInt_FromLong((long)ch);
				2158	if (w == NULL)
				2159	goto onError;
				2160	x = PyObject_GetItem(mapping, w);
				2161	Py_DECREF(w);
				2162	if (x == NULL) {
				2163	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2164	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2165	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2166	x = Py_None;
				2167	Py_INCREF(x);
				2168	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2169	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2170	}
				2171
				2172	/* Apply mapping */
				2173	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2174	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2175	if (value < 0 \|\| value > 255) {
				2176	PyErr_SetString(PyExc_TypeError,
				2177	"character mapping must be in range(256)");
				2178	Py_DECREF(x);
				2179	goto onError;
				2180	}
				2181	*s++ = (char)value;
				2182	}
				2183	else if (x == Py_None) {
				2184	/* undefined mapping */
				2185	if (charmap_encoding_error(&p, &s, errors,
				2186	"character maps to <undefined>")) {
				2187	Py_DECREF(x);
				2188	goto onError;
				2189	}
				2190	}
				2191	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2192	int targetsize = PyString_GET_SIZE(x);
				2193
				2194	if (targetsize == 1)
				2195	/* 1-1 mapping */
				2196	s++ = PyString_AS_STRING(x);
				2197
				2198	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2199	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2200	if (targetsize > extrachars) {
				2201	/* resize first */
				2202	int oldpos = (int)(s - PyString_AS_STRING(v));
				2203	int needed = (targetsize - extrachars) + \
				2204	(targetsize << 2);
				2205	extrachars += needed;
				2206	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2207	Py_DECREF(x);
				2208	goto onError;
				2209	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2210	s = PyString_AS_STRING(v) + oldpos;
				2211	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2212	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2213	s += targetsize;
				2214	extrachars -= targetsize;
				2215	}
				2216	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2217	}
				2218	else {
				2219	/* wrong return value */
				2220	PyErr_SetString(PyExc_TypeError,
				2221	"character mapping must return integer, None or unicode");
				2222	Py_DECREF(x);
				2223	goto onError;
				2224	}
				2225	Py_DECREF(x);
				2226	}
				2227	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2228	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2229	goto onError;
				2230	return v;
				2231
				2232	onError:
				2233	Py_DECREF(v);
				2234	return NULL;
				2235	}
				2236
				2237	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2238	PyObject *mapping)
				2239	{
				2240	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2241	PyErr_BadArgument();
				2242	return NULL;
				2243	}
				2244	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2245	PyUnicode_GET_SIZE(unicode),
				2246	mapping,
				2247	NULL);
				2248	}
				2249
				2250	static
				2251	int translate_error(const Py_UNICODE **source,
				2252	Py_UNICODE **dest,
				2253	const char *errors,
				2254	const char *details)
				2255	{
				2256	if ((errors == NULL) \|\|
				2257	(strcmp(errors,"strict") == 0)) {
				2258	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2259	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2260	details);
				2261	return -1;
				2262	}
				2263	else if (strcmp(errors,"ignore") == 0) {
				2264	return 0;
				2265	}
				2266	else if (strcmp(errors,"replace") == 0) {
				2267	**dest = '?';
				2268	(*dest)++;
				2269	return 0;
				2270	}
				2271	else {
				2272	PyErr_Format(PyExc_ValueError,
				2273	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2274	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2275	errors);
				2276	return -1;
				2277	}
				2278	}
				2279
				2280	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2281	int size,
				2282	PyObject *mapping,
				2283	const char *errors)
				2284	{
				2285	PyUnicodeObject *v;
				2286	Py_UNICODE *p;
				2287
				2288	if (mapping == NULL) {
				2289	PyErr_BadArgument();
				2290	return NULL;
				2291	}
				2292
				2293	/* Output will never be longer than input */
				2294	v = _PyUnicode_New(size);
				2295	if (v == NULL)
				2296	goto onError;
				2297	if (size == 0)
				2298	goto done;
				2299	p = PyUnicode_AS_UNICODE(v);
				2300	while (size-- > 0) {
				2301	Py_UNICODE ch = *s++;
				2302	PyObject w, x;
				2303
				2304	/* Get mapping */
				2305	w = PyInt_FromLong(ch);
				2306	if (w == NULL)
				2307	goto onError;
				2308	x = PyObject_GetItem(mapping, w);
				2309	Py_DECREF(w);
				2310	if (x == NULL) {
				2311	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2312	/* No mapping found: default to 1-1 mapping */
				2313	PyErr_Clear();
				2314	*p++ = ch;
				2315	continue;
				2316	}
				2317	goto onError;
				2318	}
				2319
				2320	/* Apply mapping */
				2321	if (PyInt_Check(x))
				2322	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2323	else if (x == Py_None) {
				2324	/* undefined mapping */
				2325	if (translate_error(&s, &p, errors,
				2326	"character maps to <undefined>")) {
				2327	Py_DECREF(x);
				2328	goto onError;
				2329	}
				2330	}
				2331	else if (PyUnicode_Check(x)) {
				2332	if (PyUnicode_GET_SIZE(x) != 1) {
				2333	/* 1-n mapping */
				2334	PyErr_SetString(PyExc_NotImplementedError,
				2335	"1-n mappings are currently not implemented");
				2336	Py_DECREF(x);
				2337	goto onError;
				2338	}
				2339	p++ = PyUnicode_AS_UNICODE(x);
				2340	}
				2341	else {
				2342	/* wrong return value */
				2343	PyErr_SetString(PyExc_TypeError,
				2344	"translate mapping must return integer, None or unicode");
				2345	Py_DECREF(x);
				2346	goto onError;
				2347	}
				2348	Py_DECREF(x);
				2349	}
				2350	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2351	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2352	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2353
				2354	done:
				2355	return (PyObject *)v;
				2356
				2357	onError:
				2358	Py_XDECREF(v);
				2359	return NULL;
				2360	}
				2361
				2362	PyObject PyUnicode_Translate(PyObject str,
				2363	PyObject *mapping,
				2364	const char *errors)
				2365	{
				2366	PyObject *result;
				2367
				2368	str = PyUnicode_FromObject(str);
				2369	if (str == NULL)
				2370	goto onError;
				2371	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2372	PyUnicode_GET_SIZE(str),
				2373	mapping,
				2374	errors);
				2375	Py_DECREF(str);
				2376	return result;
				2377
				2378	onError:
				2379	Py_XDECREF(str);
				2380	return NULL;
				2381	}
				2382
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2383	/* --- Decimal Encoder ---------------------------------------------------- */
				2384
				2385	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2386	int length,
				2387	char *output,
				2388	const char *errors)
				2389	{
				2390	Py_UNICODE p, end;
				2391
				2392	if (output == NULL) {
				2393	PyErr_BadArgument();
				2394	return -1;
				2395	}
				2396
				2397	p = s;
				2398	end = s + length;
				2399	while (p < end) {
				2400	register Py_UNICODE ch = *p++;
				2401	int decimal;
				2402
				2403	if (Py_UNICODE_ISSPACE(ch)) {
				2404	*output++ = ' ';
				2405	continue;
				2406	}
				2407	decimal = Py_UNICODE_TODECIMAL(ch);
				2408	if (decimal >= 0) {
				2409	*output++ = '0' + decimal;
				2410	continue;
				2411	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2412	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2413	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2414	continue;
				2415	}
				2416	/* All other characters are considered invalid */
				2417	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2418	PyErr_SetString(PyExc_ValueError,
				2419	"invalid decimal Unicode string");
				2420	goto onError;
				2421	}
				2422	else if (strcmp(errors, "ignore") == 0)
				2423	continue;
				2424	else if (strcmp(errors, "replace") == 0) {
				2425	*output++ = '?';
				2426	continue;
				2427	}
				2428	}
				2429	/* 0-terminate the output string */
				2430	*output++ = '\0';
				2431	return 0;
				2432
				2433	onError:
				2434	return -1;
				2435	}
				2436
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2437	/* --- Helpers ------------------------------------------------------------ */
				2438
				2439	static
				2440	int count(PyUnicodeObject *self,
				2441	int start,
				2442	int end,
				2443	PyUnicodeObject *substring)
				2444	{
				2445	int count = 0;
				2446
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2447	if (start < 0)
				2448	start += self->length;
				2449	if (start < 0)
				2450	start = 0;
				2451	if (end > self->length)
				2452	end = self->length;
				2453	if (end < 0)
				2454	end += self->length;
				2455	if (end < 0)
				2456	end = 0;
				2457
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2458	if (substring->length == 0)
				2459	return (end - start + 1);
				2460
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2461	end -= substring->length;
				2462
				2463	while (start <= end)
				2464	if (Py_UNICODE_MATCH(self, start, substring)) {
				2465	count++;
				2466	start += substring->length;
				2467	} else
				2468	start++;
				2469
				2470	return count;
				2471	}
				2472
				2473	int PyUnicode_Count(PyObject *str,
				2474	PyObject *substr,
				2475	int start,
				2476	int end)
				2477	{
				2478	int result;
				2479
				2480	str = PyUnicode_FromObject(str);
				2481	if (str == NULL)
				2482	return -1;
				2483	substr = PyUnicode_FromObject(substr);
				2484	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2485	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2486	return -1;
				2487	}
				2488
				2489	result = count((PyUnicodeObject *)str,
				2490	start, end,
				2491	(PyUnicodeObject *)substr);
				2492
				2493	Py_DECREF(str);
				2494	Py_DECREF(substr);
				2495	return result;
				2496	}
				2497
				2498	static
				2499	int findstring(PyUnicodeObject *self,
				2500	PyUnicodeObject *substring,
				2501	int start,
				2502	int end,
				2503	int direction)
				2504	{
				2505	if (start < 0)
				2506	start += self->length;
				2507	if (start < 0)
				2508	start = 0;
				2509
				2510	if (substring->length == 0)
				2511	return start;
				2512
				2513	if (end > self->length)
				2514	end = self->length;
				2515	if (end < 0)
				2516	end += self->length;
				2517	if (end < 0)
				2518	end = 0;
				2519
				2520	end -= substring->length;
				2521
				2522	if (direction < 0) {
				2523	for (; end >= start; end--)
				2524	if (Py_UNICODE_MATCH(self, end, substring))
				2525	return end;
				2526	} else {
				2527	for (; start <= end; start++)
				2528	if (Py_UNICODE_MATCH(self, start, substring))
				2529	return start;
				2530	}
				2531
				2532	return -1;
				2533	}
				2534
				2535	int PyUnicode_Find(PyObject *str,
				2536	PyObject *substr,
				2537	int start,
				2538	int end,
				2539	int direction)
				2540	{
				2541	int result;
				2542
				2543	str = PyUnicode_FromObject(str);
				2544	if (str == NULL)
				2545	return -1;
				2546	substr = PyUnicode_FromObject(substr);
				2547	if (substr == NULL) {
				2548	Py_DECREF(substr);
				2549	return -1;
				2550	}
				2551
				2552	result = findstring((PyUnicodeObject *)str,
				2553	(PyUnicodeObject *)substr,
				2554	start, end, direction);
				2555	Py_DECREF(str);
				2556	Py_DECREF(substr);
				2557	return result;
				2558	}
				2559
				2560	static
				2561	int tailmatch(PyUnicodeObject *self,
				2562	PyUnicodeObject *substring,
				2563	int start,
				2564	int end,
				2565	int direction)
				2566	{
				2567	if (start < 0)
				2568	start += self->length;
				2569	if (start < 0)
				2570	start = 0;
				2571
				2572	if (substring->length == 0)
				2573	return 1;
				2574
				2575	if (end > self->length)
				2576	end = self->length;
				2577	if (end < 0)
				2578	end += self->length;
				2579	if (end < 0)
				2580	end = 0;
				2581
				2582	end -= substring->length;
				2583	if (end < start)
				2584	return 0;
				2585
				2586	if (direction > 0) {
				2587	if (Py_UNICODE_MATCH(self, end, substring))
				2588	return 1;
				2589	} else {
				2590	if (Py_UNICODE_MATCH(self, start, substring))
				2591	return 1;
				2592	}
				2593
				2594	return 0;
				2595	}
				2596
				2597	int PyUnicode_Tailmatch(PyObject *str,
				2598	PyObject *substr,
				2599	int start,
				2600	int end,
				2601	int direction)
				2602	{
				2603	int result;
				2604
				2605	str = PyUnicode_FromObject(str);
				2606	if (str == NULL)
				2607	return -1;
				2608	substr = PyUnicode_FromObject(substr);
				2609	if (substr == NULL) {
				2610	Py_DECREF(substr);
				2611	return -1;
				2612	}
				2613
				2614	result = tailmatch((PyUnicodeObject *)str,
				2615	(PyUnicodeObject *)substr,
				2616	start, end, direction);
				2617	Py_DECREF(str);
				2618	Py_DECREF(substr);
				2619	return result;
				2620	}
				2621
				2622	static
				2623	const Py_UNICODE findchar(const Py_UNICODE s,
				2624	int size,
				2625	Py_UNICODE ch)
				2626	{
				2627	/* like wcschr, but doesn't stop at NULL characters */
				2628
				2629	while (size-- > 0) {
				2630	if (*s == ch)
				2631	return s;
				2632	s++;
				2633	}
				2634
				2635	return NULL;
				2636	}
				2637
				2638	/* Apply fixfct filter to the Unicode object self and return a
				2639	reference to the modified object */
				2640
				2641	static
				2642	PyObject fixup(PyUnicodeObject self,
				2643	int (fixfct)(PyUnicodeObject s))
				2644	{
				2645
				2646	PyUnicodeObject *u;
				2647
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2648	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2649	if (u == NULL)
				2650	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2651
				2652	Py_UNICODE_COPY(u->str, self->str, self->length);
				2653
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2654	if (!fixfct(u)) {
				2655	/* fixfct should return TRUE if it modified the buffer. If
				2656	FALSE, return a reference to the original buffer instead
				2657	(to save space, not time) */
				2658	Py_INCREF(self);
				2659	Py_DECREF(u);
				2660	return (PyObject*) self;
				2661	}
				2662	return (PyObject*) u;
				2663	}
				2664
				2665	static
				2666	int fixupper(PyUnicodeObject *self)
				2667	{
				2668	int len = self->length;
				2669	Py_UNICODE *s = self->str;
				2670	int status = 0;
				2671
				2672	while (len-- > 0) {
				2673	register Py_UNICODE ch;
				2674
				2675	ch = Py_UNICODE_TOUPPER(*s);
				2676	if (ch != *s) {
				2677	status = 1;
				2678	*s = ch;
				2679	}
				2680	s++;
				2681	}
				2682
				2683	return status;
				2684	}
				2685
				2686	static
				2687	int fixlower(PyUnicodeObject *self)
				2688	{
				2689	int len = self->length;
				2690	Py_UNICODE *s = self->str;
				2691	int status = 0;
				2692
				2693	while (len-- > 0) {
				2694	register Py_UNICODE ch;
				2695
				2696	ch = Py_UNICODE_TOLOWER(*s);
				2697	if (ch != *s) {
				2698	status = 1;
				2699	*s = ch;
				2700	}
				2701	s++;
				2702	}
				2703
				2704	return status;
				2705	}
				2706
				2707	static
				2708	int fixswapcase(PyUnicodeObject *self)
				2709	{
				2710	int len = self->length;
				2711	Py_UNICODE *s = self->str;
				2712	int status = 0;
				2713
				2714	while (len-- > 0) {
				2715	if (Py_UNICODE_ISUPPER(*s)) {
				2716	s = Py_UNICODE_TOLOWER(s);
				2717	status = 1;
				2718	} else if (Py_UNICODE_ISLOWER(*s)) {
				2719	s = Py_UNICODE_TOUPPER(s);
				2720	status = 1;
				2721	}
				2722	s++;
				2723	}
				2724
				2725	return status;
				2726	}
				2727
				2728	static
				2729	int fixcapitalize(PyUnicodeObject *self)
				2730	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2731	int len = self->length;
				2732	Py_UNICODE *s = self->str;
				2733	int status = 0;
				2734
				2735	if (len == 0)
				2736	return 0;
				2737	if (Py_UNICODE_ISLOWER(*s)) {
				2738	s = Py_UNICODE_TOUPPER(s);
				2739	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2740	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2741	s++;
				2742	while (--len > 0) {
				2743	if (Py_UNICODE_ISUPPER(*s)) {
				2744	s = Py_UNICODE_TOLOWER(s);
				2745	status = 1;
				2746	}
				2747	s++;
				2748	}
				2749	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2750	}
				2751
				2752	static
				2753	int fixtitle(PyUnicodeObject *self)
				2754	{
				2755	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2756	register Py_UNICODE *e;
				2757	int previous_is_cased;
				2758
				2759	/* Shortcut for single character strings */
				2760	if (PyUnicode_GET_SIZE(self) == 1) {
				2761	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2762	if (*p != ch) {
				2763	*p = ch;
				2764	return 1;
				2765	}
				2766	else
				2767	return 0;
				2768	}
				2769
				2770	e = p + PyUnicode_GET_SIZE(self);
				2771	previous_is_cased = 0;
				2772	for (; p < e; p++) {
				2773	register const Py_UNICODE ch = *p;
				2774
				2775	if (previous_is_cased)
				2776	*p = Py_UNICODE_TOLOWER(ch);
				2777	else
				2778	*p = Py_UNICODE_TOTITLE(ch);
				2779
				2780	if (Py_UNICODE_ISLOWER(ch) \|\|
				2781	Py_UNICODE_ISUPPER(ch) \|\|
				2782	Py_UNICODE_ISTITLE(ch))
				2783	previous_is_cased = 1;
				2784	else
				2785	previous_is_cased = 0;
				2786	}
				2787	return 1;
				2788	}
				2789
				2790	PyObject PyUnicode_Join(PyObject separator,
				2791	PyObject *seq)
				2792	{
				2793	Py_UNICODE *sep;
				2794	int seplen;
				2795	PyUnicodeObject *res = NULL;
				2796	int reslen = 0;
				2797	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2798	int sz = 100;
				2799	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2800	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2801
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2802	it = PyObject_GetIter(seq);
				2803	if (it == NULL)
				2804	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2805
				2806	if (separator == NULL) {
				2807	Py_UNICODE blank = ' ';
				2808	sep = &blank;
				2809	seplen = 1;
				2810	}
				2811	else {
				2812	separator = PyUnicode_FromObject(separator);
				2813	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2814	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2815	sep = PyUnicode_AS_UNICODE(separator);
				2816	seplen = PyUnicode_GET_SIZE(separator);
				2817	}
				2818
				2819	res = _PyUnicode_New(sz);
				2820	if (res == NULL)
				2821	goto onError;
				2822	p = PyUnicode_AS_UNICODE(res);
				2823	reslen = 0;
				2824
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2825	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2826	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2827	PyObject *item = PyIter_Next(it);
				2828	if (item == NULL) {
				2829	if (PyErr_Occurred())
				2830	goto onError;
				2831	break;
				2832	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2833	if (!PyUnicode_Check(item)) {
				2834	PyObject *v;
				2835	v = PyUnicode_FromObject(item);
				2836	Py_DECREF(item);
				2837	item = v;
				2838	if (item == NULL)
				2839	goto onError;
				2840	}
				2841	itemlen = PyUnicode_GET_SIZE(item);
				2842	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2843	if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2844	goto onError;
				2845	sz *= 2;
				2846	p = PyUnicode_AS_UNICODE(res) + reslen;
				2847	}
				2848	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2849	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2850	p += seplen;
				2851	reslen += seplen;
				2852	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2853	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2854	p += itemlen;
				2855	reslen += itemlen;
				2856	Py_DECREF(item);
				2857	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2858	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2859	goto onError;
				2860
				2861	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2862	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2863	return (PyObject *)res;
				2864
				2865	onError:
				2866	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2867	Py_XDECREF(res);
				2868	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2869	return NULL;
				2870	}
				2871
				2872	static
				2873	PyUnicodeObject pad(PyUnicodeObject self,
				2874	int left,
				2875	int right,
				2876	Py_UNICODE fill)
				2877	{
				2878	PyUnicodeObject *u;
				2879
				2880	if (left < 0)
				2881	left = 0;
				2882	if (right < 0)
				2883	right = 0;
				2884
				2885	if (left == 0 && right == 0) {
				2886	Py_INCREF(self);
				2887	return self;
				2888	}
				2889
				2890	u = _PyUnicode_New(left + self->length + right);
				2891	if (u) {
				2892	if (left)
				2893	Py_UNICODE_FILL(u->str, fill, left);
				2894	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2895	if (right)
				2896	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2897	}
				2898
				2899	return u;
				2900	}
				2901
				2902	#define SPLIT_APPEND(data, left, right) \
				2903	str = PyUnicode_FromUnicode(data + left, right - left); \
				2904	if (!str) \
				2905	goto onError; \
				2906	if (PyList_Append(list, str)) { \
				2907	Py_DECREF(str); \
				2908	goto onError; \
				2909	} \
				2910	else \
				2911	Py_DECREF(str);
				2912
				2913	static
				2914	PyObject split_whitespace(PyUnicodeObject self,
				2915	PyObject *list,
				2916	int maxcount)
				2917	{
				2918	register int i;
				2919	register int j;
				2920	int len = self->length;
				2921	PyObject *str;
				2922
				2923	for (i = j = 0; i < len; ) {
				2924	/* find a token */
				2925	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2926	i++;
				2927	j = i;
				2928	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2929	i++;
				2930	if (j < i) {
				2931	if (maxcount-- <= 0)
				2932	break;
				2933	SPLIT_APPEND(self->str, j, i);
				2934	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2935	i++;
				2936	j = i;
				2937	}
				2938	}
				2939	if (j < len) {
				2940	SPLIT_APPEND(self->str, j, len);
				2941	}
				2942	return list;
				2943
				2944	onError:
				2945	Py_DECREF(list);
				2946	return NULL;
				2947	}
				2948
				2949	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2950	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2951	{
				2952	register int i;
				2953	register int j;
				2954	int len;
				2955	PyObject *list;
				2956	PyObject *str;
				2957	Py_UNICODE *data;
				2958
				2959	string = PyUnicode_FromObject(string);
				2960	if (string == NULL)
				2961	return NULL;
				2962	data = PyUnicode_AS_UNICODE(string);
				2963	len = PyUnicode_GET_SIZE(string);
				2964
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2965	list = PyList_New(0);
				2966	if (!list)
				2967	goto onError;
				2968
				2969	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2970	int eol;
				2971
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2972	/* Find a line and append it */
				2973	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2974	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2975
				2976	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2977	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2978	if (i < len) {
				2979	if (data[i] == '\r' && i + 1 < len &&
				2980	data[i+1] == '\n')
				2981	i += 2;
				2982	else
				2983	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2984	if (keepends)
				2985	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2986	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2987	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2988	j = i;
				2989	}
				2990	if (j < len) {
				2991	SPLIT_APPEND(data, j, len);
				2992	}
				2993
				2994	Py_DECREF(string);
				2995	return list;
				2996
				2997	onError:
				2998	Py_DECREF(list);
				2999	Py_DECREF(string);
				3000	return NULL;
				3001	}
				3002
				3003	static
				3004	PyObject split_char(PyUnicodeObject self,
				3005	PyObject *list,
				3006	Py_UNICODE ch,
				3007	int maxcount)
				3008	{
				3009	register int i;
				3010	register int j;
				3011	int len = self->length;
				3012	PyObject *str;
				3013
				3014	for (i = j = 0; i < len; ) {
				3015	if (self->str[i] == ch) {
				3016	if (maxcount-- <= 0)
				3017	break;
				3018	SPLIT_APPEND(self->str, j, i);
				3019	i = j = i + 1;
				3020	} else
				3021	i++;
				3022	}
				3023	if (j <= len) {
				3024	SPLIT_APPEND(self->str, j, len);
				3025	}
				3026	return list;
				3027
				3028	onError:
				3029	Py_DECREF(list);
				3030	return NULL;
				3031	}
				3032
				3033	static
				3034	PyObject split_substring(PyUnicodeObject self,
				3035	PyObject *list,
				3036	PyUnicodeObject *substring,
				3037	int maxcount)
				3038	{
				3039	register int i;
				3040	register int j;
				3041	int len = self->length;
				3042	int sublen = substring->length;
				3043	PyObject *str;
				3044
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3045	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3046	if (Py_UNICODE_MATCH(self, i, substring)) {
				3047	if (maxcount-- <= 0)
				3048	break;
				3049	SPLIT_APPEND(self->str, j, i);
				3050	i = j = i + sublen;
				3051	} else
				3052	i++;
				3053	}
				3054	if (j <= len) {
				3055	SPLIT_APPEND(self->str, j, len);
				3056	}
				3057	return list;
				3058
				3059	onError:
				3060	Py_DECREF(list);
				3061	return NULL;
				3062	}
				3063
				3064	#undef SPLIT_APPEND
				3065
				3066	static
				3067	PyObject split(PyUnicodeObject self,
				3068	PyUnicodeObject *substring,
				3069	int maxcount)
				3070	{
				3071	PyObject *list;
				3072
				3073	if (maxcount < 0)
				3074	maxcount = INT_MAX;
				3075
				3076	list = PyList_New(0);
				3077	if (!list)
				3078	return NULL;
				3079
				3080	if (substring == NULL)
				3081	return split_whitespace(self,list,maxcount);
				3082
				3083	else if (substring->length == 1)
				3084	return split_char(self,list,substring->str[0],maxcount);
				3085
				3086	else if (substring->length == 0) {
				3087	Py_DECREF(list);
				3088	PyErr_SetString(PyExc_ValueError, "empty separator");
				3089	return NULL;
				3090	}
				3091	else
				3092	return split_substring(self,list,substring,maxcount);
				3093	}
				3094
				3095	static
				3096	PyObject strip(PyUnicodeObject self,
				3097	int left,
				3098	int right)
				3099	{
				3100	Py_UNICODE *p = self->str;
				3101	int start = 0;
				3102	int end = self->length;
				3103
				3104	if (left)
				3105	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3106	start++;
				3107
				3108	if (right)
				3109	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3110	end--;
				3111
				3112	if (start == 0 && end == self->length) {
				3113	/* couldn't strip anything off, return original string */
				3114	Py_INCREF(self);
				3115	return (PyObject*) self;
				3116	}
				3117
				3118	return (PyObject*) PyUnicode_FromUnicode(
				3119	self->str + start,
				3120	end - start
				3121	);
				3122	}
				3123
				3124	static
				3125	PyObject replace(PyUnicodeObject self,
				3126	PyUnicodeObject *str1,
				3127	PyUnicodeObject *str2,
				3128	int maxcount)
				3129	{
				3130	PyUnicodeObject *u;
				3131
				3132	if (maxcount < 0)
				3133	maxcount = INT_MAX;
				3134
				3135	if (str1->length == 1 && str2->length == 1) {
				3136	int i;
				3137
				3138	/* replace characters */
				3139	if (!findchar(self->str, self->length, str1->str[0])) {
				3140	/* nothing to replace, return original string */
				3141	Py_INCREF(self);
				3142	u = self;
				3143	} else {
				3144	Py_UNICODE u1 = str1->str[0];
				3145	Py_UNICODE u2 = str2->str[0];
				3146
				3147	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3148	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3149	self->length
				3150	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3151	if (u != NULL) {
				3152	Py_UNICODE_COPY(u->str, self->str,
				3153	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3154	for (i = 0; i < u->length; i++)
				3155	if (u->str[i] == u1) {
				3156	if (--maxcount < 0)
				3157	break;
				3158	u->str[i] = u2;
				3159	}
				3160	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3161	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3162
				3163	} else {
				3164	int n, i;
				3165	Py_UNICODE *p;
				3166
				3167	/* replace strings */
				3168	n = count(self, 0, self->length, str1);
				3169	if (n > maxcount)
				3170	n = maxcount;
				3171	if (n == 0) {
				3172	/* nothing to replace, return original string */
				3173	Py_INCREF(self);
				3174	u = self;
				3175	} else {
				3176	u = _PyUnicode_New(
				3177	self->length + n * (str2->length - str1->length));
				3178	if (u) {
				3179	i = 0;
				3180	p = u->str;
				3181	while (i <= self->length - str1->length)
				3182	if (Py_UNICODE_MATCH(self, i, str1)) {
				3183	/* replace string segment */
				3184	Py_UNICODE_COPY(p, str2->str, str2->length);
				3185	p += str2->length;
				3186	i += str1->length;
				3187	if (--n <= 0) {
				3188	/* copy remaining part */
				3189	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3190	break;
				3191	}
				3192	} else
				3193	*p++ = self->str[i++];
				3194	}
				3195	}
				3196	}
				3197
				3198	return (PyObject *) u;
				3199	}
				3200
				3201	/* --- Unicode Object Methods --------------------------------------------- */
				3202
				3203	static char title__doc__[] =
				3204	"S.title() -> unicode\n\
				3205	\n\
				3206	Return a titlecased version of S, i.e. words start with title case\n\
				3207	characters, all remaining cased characters have lower case.";
				3208
				3209	static PyObject*
				3210	unicode_title(PyUnicodeObject self, PyObject args)
				3211	{
				3212	if (!PyArg_NoArgs(args))
				3213	return NULL;
				3214	return fixup(self, fixtitle);
				3215	}
				3216
				3217	static char capitalize__doc__[] =
				3218	"S.capitalize() -> unicode\n\
				3219	\n\
				3220	Return a capitalized version of S, i.e. make the first character\n\
				3221	have upper case.";
				3222
				3223	static PyObject*
				3224	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3225	{
				3226	if (!PyArg_NoArgs(args))
				3227	return NULL;
				3228	return fixup(self, fixcapitalize);
				3229	}
				3230
				3231	#if 0
				3232	static char capwords__doc__[] =
				3233	"S.capwords() -> unicode\n\
				3234	\n\
				3235	Apply .capitalize() to all words in S and return the result with\n\
				3236	normalized whitespace (all whitespace strings are replaced by ' ').";
				3237
				3238	static PyObject*
				3239	unicode_capwords(PyUnicodeObject self, PyObject args)
				3240	{
				3241	PyObject *list;
				3242	PyObject *item;
				3243	int i;
				3244
				3245	if (!PyArg_NoArgs(args))
				3246	return NULL;
				3247
				3248	/* Split into words */
				3249	list = split(self, NULL, -1);
				3250	if (!list)
				3251	return NULL;
				3252
				3253	/* Capitalize each word */
				3254	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3255	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3256	fixcapitalize);
				3257	if (item == NULL)
				3258	goto onError;
				3259	Py_DECREF(PyList_GET_ITEM(list, i));
				3260	PyList_SET_ITEM(list, i, item);
				3261	}
				3262
				3263	/* Join the words to form a new string */
				3264	item = PyUnicode_Join(NULL, list);
				3265
				3266	onError:
				3267	Py_DECREF(list);
				3268	return (PyObject *)item;
				3269	}
				3270	#endif
				3271
				3272	static char center__doc__[] =
				3273	"S.center(width) -> unicode\n\
				3274	\n\
				3275	Return S centered in a Unicode string of length width. Padding is done\n\
				3276	using spaces.";
				3277
				3278	static PyObject *
				3279	unicode_center(PyUnicodeObject self, PyObject args)
				3280	{
				3281	int marg, left;
				3282	int width;
				3283
				3284	if (!PyArg_ParseTuple(args, "i:center", &width))
				3285	return NULL;
				3286
				3287	if (self->length >= width) {
				3288	Py_INCREF(self);
				3289	return (PyObject*) self;
				3290	}
				3291
				3292	marg = width - self->length;
				3293	left = marg / 2 + (marg & width & 1);
				3294
				3295	return (PyObject*) pad(self, left, marg - left, ' ');
				3296	}
				3297
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3298	#if 0
				3299
				3300	/* This code should go into some future Unicode collation support
				3301	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3302	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3303
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3304	/* speedy UTF-16 code point order comparison */
				3305	/* gleaned from: */
				3306	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3307
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3308	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3309	{
				3310	0, 0, 0, 0, 0, 0, 0, 0,
				3311	0, 0, 0, 0, 0, 0, 0, 0,
				3312	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3313	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3314	};
				3315
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3316	static int
				3317	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3318	{
				3319	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3320
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3321	Py_UNICODE *s1 = str1->str;
				3322	Py_UNICODE *s2 = str2->str;
				3323
				3324	len1 = str1->length;
				3325	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3326
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3327	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3328	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3329
				3330	c1 = *s1++;
				3331	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3332
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3333	if (c1 > (1<<11) * 26)
				3334	c1 += utf16Fixup[c1>>11];
				3335	if (c2 > (1<<11) * 26)
				3336	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3337	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3338
				3339	if (c1 != c2)
				3340	return (c1 < c2) ? -1 : 1;
				3341
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3342	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3343	}
				3344
				3345	return (len1 < len2) ? -1 : (len1 != len2);
				3346	}
				3347
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3348	#else
				3349
				3350	static int
				3351	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3352	{
				3353	register int len1, len2;
				3354
				3355	Py_UNICODE *s1 = str1->str;
				3356	Py_UNICODE *s2 = str2->str;
				3357
				3358	len1 = str1->length;
				3359	len2 = str2->length;
				3360
				3361	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3362	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3363
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3364	c1 = *s1++;
				3365	c2 = *s2++;
				3366
				3367	if (c1 != c2)
				3368	return (c1 < c2) ? -1 : 1;
				3369
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3370	len1--; len2--;
				3371	}
				3372
				3373	return (len1 < len2) ? -1 : (len1 != len2);
				3374	}
				3375
				3376	#endif
				3377
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3378	int PyUnicode_Compare(PyObject *left,
				3379	PyObject *right)
				3380	{
				3381	PyUnicodeObject u = NULL, v = NULL;
				3382	int result;
				3383
				3384	/* Coerce the two arguments */
				3385	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3386	if (u == NULL)
				3387	goto onError;
				3388	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3389	if (v == NULL)
				3390	goto onError;
				3391
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3392	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3393	if (v == u) {
				3394	Py_DECREF(u);
				3395	Py_DECREF(v);
				3396	return 0;
				3397	}
				3398
				3399	result = unicode_compare(u, v);
				3400
				3401	Py_DECREF(u);
				3402	Py_DECREF(v);
				3403	return result;
				3404
				3405	onError:
				3406	Py_XDECREF(u);
				3407	Py_XDECREF(v);
				3408	return -1;
				3409	}
				3410
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3411	int PyUnicode_Contains(PyObject *container,
				3412	PyObject *element)
				3413	{
				3414	PyUnicodeObject u = NULL, v = NULL;
				3415	int result;
				3416	register const Py_UNICODE p, e;
				3417	register Py_UNICODE ch;
				3418
				3419	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3420	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3421	if (v == NULL) {
				3422	PyErr_SetString(PyExc_TypeError,
				3423	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3424	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3425	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3426	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3427	if (u == NULL) {
				3428	Py_DECREF(v);
				3429	goto onError;
				3430	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3431
				3432	/* Check v in u */
				3433	if (PyUnicode_GET_SIZE(v) != 1) {
				3434	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3435	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3436	goto onError;
				3437	}
				3438	ch = *PyUnicode_AS_UNICODE(v);
				3439	p = PyUnicode_AS_UNICODE(u);
				3440	e = p + PyUnicode_GET_SIZE(u);
				3441	result = 0;
				3442	while (p < e) {
				3443	if (*p++ == ch) {
				3444	result = 1;
				3445	break;
				3446	}
				3447	}
				3448
				3449	Py_DECREF(u);
				3450	Py_DECREF(v);
				3451	return result;
				3452
				3453	onError:
				3454	Py_XDECREF(u);
				3455	Py_XDECREF(v);
				3456	return -1;
				3457	}
				3458
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3459	/* Concat to string or Unicode object giving a new Unicode object. */
				3460
				3461	PyObject PyUnicode_Concat(PyObject left,
				3462	PyObject *right)
				3463	{
				3464	PyUnicodeObject u = NULL, v = NULL, *w;
				3465
				3466	/* Coerce the two arguments */
				3467	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3468	if (u == NULL)
				3469	goto onError;
				3470	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3471	if (v == NULL)
				3472	goto onError;
				3473
				3474	/* Shortcuts */
				3475	if (v == unicode_empty) {
				3476	Py_DECREF(v);
				3477	return (PyObject *)u;
				3478	}
				3479	if (u == unicode_empty) {
				3480	Py_DECREF(u);
				3481	return (PyObject *)v;
				3482	}
				3483
				3484	/* Concat the two Unicode strings */
				3485	w = _PyUnicode_New(u->length + v->length);
				3486	if (w == NULL)
				3487	goto onError;
				3488	Py_UNICODE_COPY(w->str, u->str, u->length);
				3489	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3490
				3491	Py_DECREF(u);
				3492	Py_DECREF(v);
				3493	return (PyObject *)w;
				3494
				3495	onError:
				3496	Py_XDECREF(u);
				3497	Py_XDECREF(v);
				3498	return NULL;
				3499	}
				3500
				3501	static char count__doc__[] =
				3502	"S.count(sub[, start[, end]]) -> int\n\
				3503	\n\
				3504	Return the number of occurrences of substring sub in Unicode string\n\
				3505	S[start:end]. Optional arguments start and end are\n\
				3506	interpreted as in slice notation.";
				3507
				3508	static PyObject *
				3509	unicode_count(PyUnicodeObject self, PyObject args)
				3510	{
				3511	PyUnicodeObject *substring;
				3512	int start = 0;
				3513	int end = INT_MAX;
				3514	PyObject *result;
				3515
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3516	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3517	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3518	return NULL;
				3519
				3520	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3521	(PyObject *)substring);
				3522	if (substring == NULL)
				3523	return NULL;
				3524
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3525	if (start < 0)
				3526	start += self->length;
				3527	if (start < 0)
				3528	start = 0;
				3529	if (end > self->length)
				3530	end = self->length;
				3531	if (end < 0)
				3532	end += self->length;
				3533	if (end < 0)
				3534	end = 0;
				3535
				3536	result = PyInt_FromLong((long) count(self, start, end, substring));
				3537
				3538	Py_DECREF(substring);
				3539	return result;
				3540	}
				3541
				3542	static char encode__doc__[] =
				3543	"S.encode([encoding[,errors]]) -> string\n\
				3544	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3545	Return an encoded string version of S. Default encoding is the current\n\
				3546	default string encoding. errors may be given to set a different error\n\
				3547	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3548	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3549
				3550	static PyObject *
				3551	unicode_encode(PyUnicodeObject self, PyObject args)
				3552	{
				3553	char *encoding = NULL;
				3554	char *errors = NULL;
				3555	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3556	return NULL;
				3557	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3558	}
				3559
				3560	static char expandtabs__doc__[] =
				3561	"S.expandtabs([tabsize]) -> unicode\n\
				3562	\n\
				3563	Return a copy of S where all tab characters are expanded using spaces.\n\
				3564	If tabsize is not given, a tab size of 8 characters is assumed.";
				3565
				3566	static PyObject*
				3567	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3568	{
				3569	Py_UNICODE *e;
				3570	Py_UNICODE *p;
				3571	Py_UNICODE *q;
				3572	int i, j;
				3573	PyUnicodeObject *u;
				3574	int tabsize = 8;
				3575
				3576	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3577	return NULL;
				3578
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3579	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3580	i = j = 0;
				3581	e = self->str + self->length;
				3582	for (p = self->str; p < e; p++)
				3583	if (*p == '\t') {
				3584	if (tabsize > 0)
				3585	j += tabsize - (j % tabsize);
				3586	}
				3587	else {
				3588	j++;
				3589	if (p == '\n' \|\| p == '\r') {
				3590	i += j;
				3591	j = 0;
				3592	}
				3593	}
				3594
				3595	/* Second pass: create output string and fill it */
				3596	u = _PyUnicode_New(i + j);
				3597	if (!u)
				3598	return NULL;
				3599
				3600	j = 0;
				3601	q = u->str;
				3602
				3603	for (p = self->str; p < e; p++)
				3604	if (*p == '\t') {
				3605	if (tabsize > 0) {
				3606	i = tabsize - (j % tabsize);
				3607	j += i;
				3608	while (i--)
				3609	*q++ = ' ';
				3610	}
				3611	}
				3612	else {
				3613	j++;
				3614	q++ = p;
				3615	if (p == '\n' \|\| p == '\r')
				3616	j = 0;
				3617	}
				3618
				3619	return (PyObject*) u;
				3620	}
				3621
				3622	static char find__doc__[] =
				3623	"S.find(sub [,start [,end]]) -> int\n\
				3624	\n\
				3625	Return the lowest index in S where substring sub is found,\n\
				3626	such that sub is contained within s[start,end]. Optional\n\
				3627	arguments start and end are interpreted as in slice notation.\n\
				3628	\n\
				3629	Return -1 on failure.";
				3630
				3631	static PyObject *
				3632	unicode_find(PyUnicodeObject self, PyObject args)
				3633	{
				3634	PyUnicodeObject *substring;
				3635	int start = 0;
				3636	int end = INT_MAX;
				3637	PyObject *result;
				3638
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3639	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3640	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3641	return NULL;
				3642	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3643	(PyObject *)substring);
				3644	if (substring == NULL)
				3645	return NULL;
				3646
				3647	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3648
				3649	Py_DECREF(substring);
				3650	return result;
				3651	}
				3652
				3653	static PyObject *
				3654	unicode_getitem(PyUnicodeObject *self, int index)
				3655	{
				3656	if (index < 0 \|\| index >= self->length) {
				3657	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3658	return NULL;
				3659	}
				3660
				3661	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3662	}
				3663
				3664	static long
				3665	unicode_hash(PyUnicodeObject *self)
				3666	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3667	/* Since Unicode objects compare equal to their ASCII string
				3668	counterparts, they should use the individual character values
				3669	as basis for their hash value. This is needed to assure that
				3670	strings and Unicode objects behave in the same way as
				3671	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3672
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3673	register int len;
				3674	register Py_UNICODE *p;
				3675	register long x;
				3676
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3677	if (self->hash != -1)
				3678	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3679	len = PyUnicode_GET_SIZE(self);
				3680	p = PyUnicode_AS_UNICODE(self);
				3681	x = *p << 7;
				3682	while (--len >= 0)
				3683	x = (1000003x) ^ p++;
				3684	x ^= PyUnicode_GET_SIZE(self);
				3685	if (x == -1)
				3686	x = -2;
				3687	self->hash = x;
				3688	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3689	}
				3690
				3691	static char index__doc__[] =
				3692	"S.index(sub [,start [,end]]) -> int\n\
				3693	\n\
				3694	Like S.find() but raise ValueError when the substring is not found.";
				3695
				3696	static PyObject *
				3697	unicode_index(PyUnicodeObject self, PyObject args)
				3698	{
				3699	int result;
				3700	PyUnicodeObject *substring;
				3701	int start = 0;
				3702	int end = INT_MAX;
				3703
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3704	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3705	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3706	return NULL;
				3707
				3708	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3709	(PyObject *)substring);
				3710	if (substring == NULL)
				3711	return NULL;
				3712
				3713	result = findstring(self, substring, start, end, 1);
				3714
				3715	Py_DECREF(substring);
				3716	if (result < 0) {
				3717	PyErr_SetString(PyExc_ValueError, "substring not found");
				3718	return NULL;
				3719	}
				3720	return PyInt_FromLong(result);
				3721	}
				3722
				3723	static char islower__doc__[] =
				3724	"S.islower() -> int\n\
				3725	\n\
				3726	Return 1 if all cased characters in S are lowercase and there is\n\
				3727	at least one cased character in S, 0 otherwise.";
				3728
				3729	static PyObject*
				3730	unicode_islower(PyUnicodeObject self, PyObject args)
				3731	{
				3732	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3733	register const Py_UNICODE *e;
				3734	int cased;
				3735
				3736	if (!PyArg_NoArgs(args))
				3737	return NULL;
				3738
				3739	/* Shortcut for single character strings */
				3740	if (PyUnicode_GET_SIZE(self) == 1)
				3741	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3742
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3743	/* Special case for empty strings */
				3744	if (PyString_GET_SIZE(self) == 0)
				3745	return PyInt_FromLong(0);
				3746
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3747	e = p + PyUnicode_GET_SIZE(self);
				3748	cased = 0;
				3749	for (; p < e; p++) {
				3750	register const Py_UNICODE ch = *p;
				3751
				3752	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3753	return PyInt_FromLong(0);
				3754	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3755	cased = 1;
				3756	}
				3757	return PyInt_FromLong(cased);
				3758	}
				3759
				3760	static char isupper__doc__[] =
				3761	"S.isupper() -> int\n\
				3762	\n\
				3763	Return 1 if all cased characters in S are uppercase and there is\n\
				3764	at least one cased character in S, 0 otherwise.";
				3765
				3766	static PyObject*
				3767	unicode_isupper(PyUnicodeObject self, PyObject args)
				3768	{
				3769	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3770	register const Py_UNICODE *e;
				3771	int cased;
				3772
				3773	if (!PyArg_NoArgs(args))
				3774	return NULL;
				3775
				3776	/* Shortcut for single character strings */
				3777	if (PyUnicode_GET_SIZE(self) == 1)
				3778	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3779
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3780	/* Special case for empty strings */
				3781	if (PyString_GET_SIZE(self) == 0)
				3782	return PyInt_FromLong(0);
				3783
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3784	e = p + PyUnicode_GET_SIZE(self);
				3785	cased = 0;
				3786	for (; p < e; p++) {
				3787	register const Py_UNICODE ch = *p;
				3788
				3789	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3790	return PyInt_FromLong(0);
				3791	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3792	cased = 1;
				3793	}
				3794	return PyInt_FromLong(cased);
				3795	}
				3796
				3797	static char istitle__doc__[] =
				3798	"S.istitle() -> int\n\
				3799	\n\
				3800	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3801	may only follow uncased characters and lowercase characters only cased\n\
				3802	ones. Return 0 otherwise.";
				3803
				3804	static PyObject*
				3805	unicode_istitle(PyUnicodeObject self, PyObject args)
				3806	{
				3807	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3808	register const Py_UNICODE *e;
				3809	int cased, previous_is_cased;
				3810
				3811	if (!PyArg_NoArgs(args))
				3812	return NULL;
				3813
				3814	/* Shortcut for single character strings */
				3815	if (PyUnicode_GET_SIZE(self) == 1)
				3816	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3817	(Py_UNICODE_ISUPPER(*p) != 0));
				3818
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3819	/* Special case for empty strings */
				3820	if (PyString_GET_SIZE(self) == 0)
				3821	return PyInt_FromLong(0);
				3822
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3823	e = p + PyUnicode_GET_SIZE(self);
				3824	cased = 0;
				3825	previous_is_cased = 0;
				3826	for (; p < e; p++) {
				3827	register const Py_UNICODE ch = *p;
				3828
				3829	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3830	if (previous_is_cased)
				3831	return PyInt_FromLong(0);
				3832	previous_is_cased = 1;
				3833	cased = 1;
				3834	}
				3835	else if (Py_UNICODE_ISLOWER(ch)) {
				3836	if (!previous_is_cased)
				3837	return PyInt_FromLong(0);
				3838	previous_is_cased = 1;
				3839	cased = 1;
				3840	}
				3841	else
				3842	previous_is_cased = 0;
				3843	}
				3844	return PyInt_FromLong(cased);
				3845	}
				3846
				3847	static char isspace__doc__[] =
				3848	"S.isspace() -> int\n\
				3849	\n\
				3850	Return 1 if there are only whitespace characters in S,\n\
				3851	0 otherwise.";
				3852
				3853	static PyObject*
				3854	unicode_isspace(PyUnicodeObject self, PyObject args)
				3855	{
				3856	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3857	register const Py_UNICODE *e;
				3858
				3859	if (!PyArg_NoArgs(args))
				3860	return NULL;
				3861
				3862	/* Shortcut for single character strings */
				3863	if (PyUnicode_GET_SIZE(self) == 1 &&
				3864	Py_UNICODE_ISSPACE(*p))
				3865	return PyInt_FromLong(1);
				3866
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3867	/* Special case for empty strings */
				3868	if (PyString_GET_SIZE(self) == 0)
				3869	return PyInt_FromLong(0);
				3870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3871	e = p + PyUnicode_GET_SIZE(self);
				3872	for (; p < e; p++) {
				3873	if (!Py_UNICODE_ISSPACE(*p))
				3874	return PyInt_FromLong(0);
				3875	}
				3876	return PyInt_FromLong(1);
				3877	}
				3878
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3879	static char isalpha__doc__[] =
				3880	"S.isalpha() -> int\n\
				3881	\n\
				3882	Return 1 if all characters in S are alphabetic\n\
				3883	and there is at least one character in S, 0 otherwise.";
				3884
				3885	static PyObject*
				3886	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3887	{
				3888	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3889	register const Py_UNICODE *e;
				3890
				3891	if (!PyArg_NoArgs(args))
				3892	return NULL;
				3893
				3894	/* Shortcut for single character strings */
				3895	if (PyUnicode_GET_SIZE(self) == 1 &&
				3896	Py_UNICODE_ISALPHA(*p))
				3897	return PyInt_FromLong(1);
				3898
				3899	/* Special case for empty strings */
				3900	if (PyString_GET_SIZE(self) == 0)
				3901	return PyInt_FromLong(0);
				3902
				3903	e = p + PyUnicode_GET_SIZE(self);
				3904	for (; p < e; p++) {
				3905	if (!Py_UNICODE_ISALPHA(*p))
				3906	return PyInt_FromLong(0);
				3907	}
				3908	return PyInt_FromLong(1);
				3909	}
				3910
				3911	static char isalnum__doc__[] =
				3912	"S.isalnum() -> int\n\
				3913	\n\
				3914	Return 1 if all characters in S are alphanumeric\n\
				3915	and there is at least one character in S, 0 otherwise.";
				3916
				3917	static PyObject*
				3918	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3919	{
				3920	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3921	register const Py_UNICODE *e;
				3922
				3923	if (!PyArg_NoArgs(args))
				3924	return NULL;
				3925
				3926	/* Shortcut for single character strings */
				3927	if (PyUnicode_GET_SIZE(self) == 1 &&
				3928	Py_UNICODE_ISALNUM(*p))
				3929	return PyInt_FromLong(1);
				3930
				3931	/* Special case for empty strings */
				3932	if (PyString_GET_SIZE(self) == 0)
				3933	return PyInt_FromLong(0);
				3934
				3935	e = p + PyUnicode_GET_SIZE(self);
				3936	for (; p < e; p++) {
				3937	if (!Py_UNICODE_ISALNUM(*p))
				3938	return PyInt_FromLong(0);
				3939	}
				3940	return PyInt_FromLong(1);
				3941	}
				3942
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3943	static char isdecimal__doc__[] =
				3944	"S.isdecimal() -> int\n\
				3945	\n\
				3946	Return 1 if there are only decimal characters in S,\n\
				3947	0 otherwise.";
				3948
				3949	static PyObject*
				3950	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3951	{
				3952	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3953	register const Py_UNICODE *e;
				3954
				3955	if (!PyArg_NoArgs(args))
				3956	return NULL;
				3957
				3958	/* Shortcut for single character strings */
				3959	if (PyUnicode_GET_SIZE(self) == 1 &&
				3960	Py_UNICODE_ISDECIMAL(*p))
				3961	return PyInt_FromLong(1);
				3962
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3963	/* Special case for empty strings */
				3964	if (PyString_GET_SIZE(self) == 0)
				3965	return PyInt_FromLong(0);
				3966
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3967	e = p + PyUnicode_GET_SIZE(self);
				3968	for (; p < e; p++) {
				3969	if (!Py_UNICODE_ISDECIMAL(*p))
				3970	return PyInt_FromLong(0);
				3971	}
				3972	return PyInt_FromLong(1);
				3973	}
				3974
				3975	static char isdigit__doc__[] =
				3976	"S.isdigit() -> int\n\
				3977	\n\
				3978	Return 1 if there are only digit characters in S,\n\
				3979	0 otherwise.";
				3980
				3981	static PyObject*
				3982	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3983	{
				3984	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3985	register const Py_UNICODE *e;
				3986
				3987	if (!PyArg_NoArgs(args))
				3988	return NULL;
				3989
				3990	/* Shortcut for single character strings */
				3991	if (PyUnicode_GET_SIZE(self) == 1 &&
				3992	Py_UNICODE_ISDIGIT(*p))
				3993	return PyInt_FromLong(1);
				3994
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3995	/* Special case for empty strings */
				3996	if (PyString_GET_SIZE(self) == 0)
				3997	return PyInt_FromLong(0);
				3998
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3999	e = p + PyUnicode_GET_SIZE(self);
				4000	for (; p < e; p++) {
				4001	if (!Py_UNICODE_ISDIGIT(*p))
				4002	return PyInt_FromLong(0);
				4003	}
				4004	return PyInt_FromLong(1);
				4005	}
				4006
				4007	static char isnumeric__doc__[] =
				4008	"S.isnumeric() -> int\n\
				4009	\n\
				4010	Return 1 if there are only numeric characters in S,\n\
				4011	0 otherwise.";
				4012
				4013	static PyObject*
				4014	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				4015	{
				4016	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4017	register const Py_UNICODE *e;
				4018
				4019	if (!PyArg_NoArgs(args))
				4020	return NULL;
				4021
				4022	/* Shortcut for single character strings */
				4023	if (PyUnicode_GET_SIZE(self) == 1 &&
				4024	Py_UNICODE_ISNUMERIC(*p))
				4025	return PyInt_FromLong(1);
				4026
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4027	/* Special case for empty strings */
				4028	if (PyString_GET_SIZE(self) == 0)
				4029	return PyInt_FromLong(0);
				4030
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4031	e = p + PyUnicode_GET_SIZE(self);
				4032	for (; p < e; p++) {
				4033	if (!Py_UNICODE_ISNUMERIC(*p))
				4034	return PyInt_FromLong(0);
				4035	}
				4036	return PyInt_FromLong(1);
				4037	}
				4038
				4039	static char join__doc__[] =
				4040	"S.join(sequence) -> unicode\n\
				4041	\n\
				4042	Return a string which is the concatenation of the strings in the\n\
				4043	sequence. The separator between elements is S.";
				4044
				4045	static PyObject*
				4046	unicode_join(PyUnicodeObject self, PyObject args)
				4047	{
				4048	PyObject *data;
				4049	if (!PyArg_ParseTuple(args, "O:join", &data))
				4050	return NULL;
				4051
				4052	return PyUnicode_Join((PyObject *)self, data);
				4053	}
				4054
				4055	static int
				4056	unicode_length(PyUnicodeObject *self)
				4057	{
				4058	return self->length;
				4059	}
				4060
				4061	static char ljust__doc__[] =
				4062	"S.ljust(width) -> unicode\n\
				4063	\n\
				4064	Return S left justified in a Unicode string of length width. Padding is\n\
				4065	done using spaces.";
				4066
				4067	static PyObject *
				4068	unicode_ljust(PyUnicodeObject self, PyObject args)
				4069	{
				4070	int width;
				4071	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4072	return NULL;
				4073
				4074	if (self->length >= width) {
				4075	Py_INCREF(self);
				4076	return (PyObject*) self;
				4077	}
				4078
				4079	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4080	}
				4081
				4082	static char lower__doc__[] =
				4083	"S.lower() -> unicode\n\
				4084	\n\
				4085	Return a copy of the string S converted to lowercase.";
				4086
				4087	static PyObject*
				4088	unicode_lower(PyUnicodeObject self, PyObject args)
				4089	{
				4090	if (!PyArg_NoArgs(args))
				4091	return NULL;
				4092	return fixup(self, fixlower);
				4093	}
				4094
				4095	static char lstrip__doc__[] =
				4096	"S.lstrip() -> unicode\n\
				4097	\n\
				4098	Return a copy of the string S with leading whitespace removed.";
				4099
				4100	static PyObject *
				4101	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4102	{
				4103	if (!PyArg_NoArgs(args))
				4104	return NULL;
				4105	return strip(self, 1, 0);
				4106	}
				4107
				4108	static PyObject*
				4109	unicode_repeat(PyUnicodeObject *str, int len)
				4110	{
				4111	PyUnicodeObject *u;
				4112	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4113	int nchars;
				4114	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4115
				4116	if (len < 0)
				4117	len = 0;
				4118
				4119	if (len == 1) {
				4120	/* no repeat, return original string */
				4121	Py_INCREF(str);
				4122	return (PyObject*) str;
				4123	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4124
				4125	/* ensure # of chars needed doesn't overflow int and # of bytes
				4126	* needed doesn't overflow size_t
				4127	*/
				4128	nchars = len * str->length;
				4129	if (len && nchars / len != str->length) {
				4130	PyErr_SetString(PyExc_OverflowError,
				4131	"repeated string is too long");
				4132	return NULL;
				4133	}
				4134	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4135	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4136	PyErr_SetString(PyExc_OverflowError,
				4137	"repeated string is too long");
				4138	return NULL;
				4139	}
				4140	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4141	if (!u)
				4142	return NULL;
				4143
				4144	p = u->str;
				4145
				4146	while (len-- > 0) {
				4147	Py_UNICODE_COPY(p, str->str, str->length);
				4148	p += str->length;
				4149	}
				4150
				4151	return (PyObject*) u;
				4152	}
				4153
				4154	PyObject PyUnicode_Replace(PyObject obj,
				4155	PyObject *subobj,
				4156	PyObject *replobj,
				4157	int maxcount)
				4158	{
				4159	PyObject *self;
				4160	PyObject *str1;
				4161	PyObject *str2;
				4162	PyObject *result;
				4163
				4164	self = PyUnicode_FromObject(obj);
				4165	if (self == NULL)
				4166	return NULL;
				4167	str1 = PyUnicode_FromObject(subobj);
				4168	if (str1 == NULL) {
				4169	Py_DECREF(self);
				4170	return NULL;
				4171	}
				4172	str2 = PyUnicode_FromObject(replobj);
				4173	if (str2 == NULL) {
				4174	Py_DECREF(self);
				4175	Py_DECREF(str1);
				4176	return NULL;
				4177	}
				4178	result = replace((PyUnicodeObject *)self,
				4179	(PyUnicodeObject *)str1,
				4180	(PyUnicodeObject *)str2,
				4181	maxcount);
				4182	Py_DECREF(self);
				4183	Py_DECREF(str1);
				4184	Py_DECREF(str2);
				4185	return result;
				4186	}
				4187
				4188	static char replace__doc__[] =
				4189	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4190	\n\
				4191	Return a copy of S with all occurrences of substring\n\
				4192	old replaced by new. If the optional argument maxsplit is\n\
				4193	given, only the first maxsplit occurrences are replaced.";
				4194
				4195	static PyObject*
				4196	unicode_replace(PyUnicodeObject self, PyObject args)
				4197	{
				4198	PyUnicodeObject *str1;
				4199	PyUnicodeObject *str2;
				4200	int maxcount = -1;
				4201	PyObject *result;
				4202
				4203	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4204	return NULL;
				4205	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4206	if (str1 == NULL)
				4207	return NULL;
				4208	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4209	if (str2 == NULL)
				4210	return NULL;
				4211
				4212	result = replace(self, str1, str2, maxcount);
				4213
				4214	Py_DECREF(str1);
				4215	Py_DECREF(str2);
				4216	return result;
				4217	}
				4218
				4219	static
				4220	PyObject unicode_repr(PyObject unicode)
				4221	{
				4222	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4223	PyUnicode_GET_SIZE(unicode),
				4224	1);
				4225	}
				4226
				4227	static char rfind__doc__[] =
				4228	"S.rfind(sub [,start [,end]]) -> int\n\
				4229	\n\
				4230	Return the highest index in S where substring sub is found,\n\
				4231	such that sub is contained within s[start,end]. Optional\n\
				4232	arguments start and end are interpreted as in slice notation.\n\
				4233	\n\
				4234	Return -1 on failure.";
				4235
				4236	static PyObject *
				4237	unicode_rfind(PyUnicodeObject self, PyObject args)
				4238	{
				4239	PyUnicodeObject *substring;
				4240	int start = 0;
				4241	int end = INT_MAX;
				4242	PyObject *result;
				4243
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4244	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4245	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4246	return NULL;
				4247	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4248	(PyObject *)substring);
				4249	if (substring == NULL)
				4250	return NULL;
				4251
				4252	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4253
				4254	Py_DECREF(substring);
				4255	return result;
				4256	}
				4257
				4258	static char rindex__doc__[] =
				4259	"S.rindex(sub [,start [,end]]) -> int\n\
				4260	\n\
				4261	Like S.rfind() but raise ValueError when the substring is not found.";
				4262
				4263	static PyObject *
				4264	unicode_rindex(PyUnicodeObject self, PyObject args)
				4265	{
				4266	int result;
				4267	PyUnicodeObject *substring;
				4268	int start = 0;
				4269	int end = INT_MAX;
				4270
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4271	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4272	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4273	return NULL;
				4274	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4275	(PyObject *)substring);
				4276	if (substring == NULL)
				4277	return NULL;
				4278
				4279	result = findstring(self, substring, start, end, -1);
				4280
				4281	Py_DECREF(substring);
				4282	if (result < 0) {
				4283	PyErr_SetString(PyExc_ValueError, "substring not found");
				4284	return NULL;
				4285	}
				4286	return PyInt_FromLong(result);
				4287	}
				4288
				4289	static char rjust__doc__[] =
				4290	"S.rjust(width) -> unicode\n\
				4291	\n\
				4292	Return S right justified in a Unicode string of length width. Padding is\n\
				4293	done using spaces.";
				4294
				4295	static PyObject *
				4296	unicode_rjust(PyUnicodeObject self, PyObject args)
				4297	{
				4298	int width;
				4299	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4300	return NULL;
				4301
				4302	if (self->length >= width) {
				4303	Py_INCREF(self);
				4304	return (PyObject*) self;
				4305	}
				4306
				4307	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4308	}
				4309
				4310	static char rstrip__doc__[] =
				4311	"S.rstrip() -> unicode\n\
				4312	\n\
				4313	Return a copy of the string S with trailing whitespace removed.";
				4314
				4315	static PyObject *
				4316	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4317	{
				4318	if (!PyArg_NoArgs(args))
				4319	return NULL;
				4320	return strip(self, 0, 1);
				4321	}
				4322
				4323	static PyObject*
				4324	unicode_slice(PyUnicodeObject *self, int start, int end)
				4325	{
				4326	/* standard clamping */
				4327	if (start < 0)
				4328	start = 0;
				4329	if (end < 0)
				4330	end = 0;
				4331	if (end > self->length)
				4332	end = self->length;
				4333	if (start == 0 && end == self->length) {
				4334	/* full slice, return original string */
				4335	Py_INCREF(self);
				4336	return (PyObject*) self;
				4337	}
				4338	if (start > end)
				4339	start = end;
				4340	/* copy slice */
				4341	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4342	end - start);
				4343	}
				4344
				4345	PyObject PyUnicode_Split(PyObject s,
				4346	PyObject *sep,
				4347	int maxsplit)
				4348	{
				4349	PyObject *result;
				4350
				4351	s = PyUnicode_FromObject(s);
				4352	if (s == NULL)
				4353	return NULL;
				4354	if (sep != NULL) {
				4355	sep = PyUnicode_FromObject(sep);
				4356	if (sep == NULL) {
				4357	Py_DECREF(s);
				4358	return NULL;
				4359	}
				4360	}
				4361
				4362	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4363
				4364	Py_DECREF(s);
				4365	Py_XDECREF(sep);
				4366	return result;
				4367	}
				4368
				4369	static char split__doc__[] =
				4370	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4371	\n\
				4372	Return a list of the words in S, using sep as the\n\
				4373	delimiter string. If maxsplit is given, at most maxsplit\n\
				4374	splits are done. If sep is not specified, any whitespace string\n\
				4375	is a separator.";
				4376
				4377	static PyObject*
				4378	unicode_split(PyUnicodeObject self, PyObject args)
				4379	{
				4380	PyObject *substring = Py_None;
				4381	int maxcount = -1;
				4382
				4383	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4384	return NULL;
				4385
				4386	if (substring == Py_None)
				4387	return split(self, NULL, maxcount);
				4388	else if (PyUnicode_Check(substring))
				4389	return split(self, (PyUnicodeObject *)substring, maxcount);
				4390	else
				4391	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4392	}
				4393
				4394	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4395	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4396	\n\
				4397	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4398	Line breaks are not included in the resulting list unless keepends\n\
				4399	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4400
				4401	static PyObject*
				4402	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4403	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4404	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4405
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4406	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4407	return NULL;
				4408
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4409	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4410	}
				4411
				4412	static
				4413	PyObject unicode_str(PyUnicodeObject self)
				4414	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4415	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4416	}
				4417
				4418	static char strip__doc__[] =
				4419	"S.strip() -> unicode\n\
				4420	\n\
				4421	Return a copy of S with leading and trailing whitespace removed.";
				4422
				4423	static PyObject *
				4424	unicode_strip(PyUnicodeObject self, PyObject args)
				4425	{
				4426	if (!PyArg_NoArgs(args))
				4427	return NULL;
				4428	return strip(self, 1, 1);
				4429	}
				4430
				4431	static char swapcase__doc__[] =
				4432	"S.swapcase() -> unicode\n\
				4433	\n\
				4434	Return a copy of S with uppercase characters converted to lowercase\n\
				4435	and vice versa.";
				4436
				4437	static PyObject*
				4438	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4439	{
				4440	if (!PyArg_NoArgs(args))
				4441	return NULL;
				4442	return fixup(self, fixswapcase);
				4443	}
				4444
				4445	static char translate__doc__[] =
				4446	"S.translate(table) -> unicode\n\
				4447	\n\
				4448	Return a copy of the string S, where all characters have been mapped\n\
				4449	through the given translation table, which must be a mapping of\n\
				4450	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4451	are left untouched. Characters mapped to None are deleted.";
				4452
				4453	static PyObject*
				4454	unicode_translate(PyUnicodeObject self, PyObject args)
				4455	{
				4456	PyObject *table;
				4457
				4458	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4459	return NULL;
				4460	return PyUnicode_TranslateCharmap(self->str,
				4461	self->length,
				4462	table,
				4463	"ignore");
				4464	}
				4465
				4466	static char upper__doc__[] =
				4467	"S.upper() -> unicode\n\
				4468	\n\
				4469	Return a copy of S converted to uppercase.";
				4470
				4471	static PyObject*
				4472	unicode_upper(PyUnicodeObject self, PyObject args)
				4473	{
				4474	if (!PyArg_NoArgs(args))
				4475	return NULL;
				4476	return fixup(self, fixupper);
				4477	}
				4478
				4479	#if 0
				4480	static char zfill__doc__[] =
				4481	"S.zfill(width) -> unicode\n\
				4482	\n\
				4483	Pad a numeric string x with zeros on the left, to fill a field\n\
				4484	of the specified width. The string x is never truncated.";
				4485
				4486	static PyObject *
				4487	unicode_zfill(PyUnicodeObject self, PyObject args)
				4488	{
				4489	int fill;
				4490	PyUnicodeObject *u;
				4491
				4492	int width;
				4493	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4494	return NULL;
				4495
				4496	if (self->length >= width) {
				4497	Py_INCREF(self);
				4498	return (PyObject*) self;
				4499	}
				4500
				4501	fill = width - self->length;
				4502
				4503	u = pad(self, fill, 0, '0');
				4504
				4505	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4506	/* move sign to beginning of string */
				4507	u->str[0] = u->str[fill];
				4508	u->str[fill] = '0';
				4509	}
				4510
				4511	return (PyObject*) u;
				4512	}
				4513	#endif
				4514
				4515	#if 0
				4516	static PyObject*
				4517	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4518	{
				4519	if (!PyArg_NoArgs(args))
				4520	return NULL;
				4521	return PyInt_FromLong(unicode_freelist_size);
				4522	}
				4523	#endif
				4524
				4525	static char startswith__doc__[] =
				4526	"S.startswith(prefix[, start[, end]]) -> int\n\
				4527	\n\
				4528	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4529	optional start, test S beginning at that position. With optional end, stop\n\
				4530	comparing S at that position.";
				4531
				4532	static PyObject *
				4533	unicode_startswith(PyUnicodeObject *self,
				4534	PyObject *args)
				4535	{
				4536	PyUnicodeObject *substring;
				4537	int start = 0;
				4538	int end = INT_MAX;
				4539	PyObject *result;
				4540
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4541	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4542	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4543	return NULL;
				4544	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4545	(PyObject *)substring);
				4546	if (substring == NULL)
				4547	return NULL;
				4548
				4549	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4550
				4551	Py_DECREF(substring);
				4552	return result;
				4553	}
				4554
				4555
				4556	static char endswith__doc__[] =
				4557	"S.endswith(suffix[, start[, end]]) -> int\n\
				4558	\n\
				4559	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4560	optional start, test S beginning at that position. With optional end, stop\n\
				4561	comparing S at that position.";
				4562
				4563	static PyObject *
				4564	unicode_endswith(PyUnicodeObject *self,
				4565	PyObject *args)
				4566	{
				4567	PyUnicodeObject *substring;
				4568	int start = 0;
				4569	int end = INT_MAX;
				4570	PyObject *result;
				4571
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4572	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4573	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4574	return NULL;
				4575	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4576	(PyObject *)substring);
				4577	if (substring == NULL)
				4578	return NULL;
				4579
				4580	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4581
				4582	Py_DECREF(substring);
				4583	return result;
				4584	}
				4585
				4586
				4587	static PyMethodDef unicode_methods[] = {
				4588
				4589	/* Order is according to common usage: often used methods should
				4590	appear first, since lookup is done sequentially. */
				4591
				4592	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4593	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4594	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4595	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4596	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4597	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4598	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4599	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4600	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4601	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4602	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4603	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4604	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4605	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4606	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4607	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4608	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4609	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4610	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4611	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4612	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4613	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4614	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4615	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4616	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4617	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4618	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4619	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4620	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4621	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4622	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4623	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4624	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4625	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4626	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4627	#if 0
				4628	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4629	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4630	#endif
				4631
				4632	#if 0
				4633	/* This one is just used for debugging the implementation. */
				4634	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4635	#endif
				4636
				4637	{NULL, NULL}
				4638	};
				4639
				4640	static PyObject *
				4641	unicode_getattr(PyUnicodeObject self, char name)
				4642	{
				4643	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4644	}
				4645
				4646	static PySequenceMethods unicode_as_sequence = {
				4647	(inquiry) unicode_length, /* sq_length */
				4648	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4649	(intargfunc) unicode_repeat, /* sq_repeat */
				4650	(intargfunc) unicode_getitem, /* sq_item */
				4651	(intintargfunc) unicode_slice, /* sq_slice */
				4652	0, /* sq_ass_item */
				4653	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4654	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4655	};
				4656
				4657	static int
				4658	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4659	int index,
				4660	const void **ptr)
				4661	{
				4662	if (index != 0) {
				4663	PyErr_SetString(PyExc_SystemError,
				4664	"accessing non-existent unicode segment");
				4665	return -1;
				4666	}
				4667	ptr = (void ) self->str;
				4668	return PyUnicode_GET_DATA_SIZE(self);
				4669	}
				4670
				4671	static int
				4672	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4673	const void **ptr)
				4674	{
				4675	PyErr_SetString(PyExc_TypeError,
				4676	"cannot use unicode as modifyable buffer");
				4677	return -1;
				4678	}
				4679
				4680	static int
				4681	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4682	int *lenp)
				4683	{
				4684	if (lenp)
				4685	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4686	return 1;
				4687	}
				4688
				4689	static int
				4690	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4691	int index,
				4692	const void **ptr)
				4693	{
				4694	PyObject *str;
				4695
				4696	if (index != 0) {
				4697	PyErr_SetString(PyExc_SystemError,
				4698	"accessing non-existent unicode segment");
				4699	return -1;
				4700	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4701	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4702	if (str == NULL)
				4703	return -1;
				4704	ptr = (void ) PyString_AS_STRING(str);
				4705	return PyString_GET_SIZE(str);
				4706	}
				4707
				4708	/* Helpers for PyUnicode_Format() */
				4709
				4710	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4711	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4712	{
				4713	int argidx = *p_argidx;
				4714	if (argidx < arglen) {
				4715	(*p_argidx)++;
				4716	if (arglen < 0)
				4717	return args;
				4718	else
				4719	return PyTuple_GetItem(args, argidx);
				4720	}
				4721	PyErr_SetString(PyExc_TypeError,
				4722	"not enough arguments for format string");
				4723	return NULL;
				4724	}
				4725
/* Bit flags collected while parsing a format specifier in
   PyUnicode_Format(); each is set by one flag character. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN (1<<1)   /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT (1<<3)    /* '#' */
#define F_ZERO (1<<4)   /* '0' */
				4731
				4732	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4734	{
				4735	register int i;
				4736	int len;
				4737	va_list va;
				4738	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4739	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4740
				4741	/* First, format the string as char array, then expand to Py_UNICODE
				4742	array. */
				4743	charbuffer = (char *)buffer;
				4744	len = vsprintf(charbuffer, format, va);
				4745	for (i = len - 1; i >= 0; i--)
				4746	buffer[i] = (Py_UNICODE) charbuffer[i];
				4747
				4748	va_end(va);
				4749	return len;
				4750	}
				4751
				4752	static int
				4753	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4754	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4755	int flags,
				4756	int prec,
				4757	int type,
				4758	PyObject *v)
				4759	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4760	/* fmt = '%#.' + `prec` + `type`
				4761	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4762	char fmt[20];
				4763	double x;
				4764
				4765	x = PyFloat_AsDouble(v);
				4766	if (x == -1.0 && PyErr_Occurred())
				4767	return -1;
				4768	if (prec < 0)
				4769	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4770	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4771	type = 'g';
				4772	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4773	/* worst case length calc to ensure no buffer overrun:
				4774	fmt = %#.<prec>g
				4775	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4776	for any double rep.)
				4777	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4778	If prec=0 the effective precision is 1 (the leading digit is
				4779	always given), therefore increase by one to 10+prec. */
				4780	if (buflen <= (size_t)10 + (size_t)prec) {
				4781	PyErr_SetString(PyExc_OverflowError,
				4782	"formatted float is too long (precision too long?)");
				4783	return -1;
				4784	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4785	return usprintf(buf, fmt, x);
				4786	}
				4787
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4788	static PyObject*
				4789	formatlong(PyObject *val, int flags, int prec, int type)
				4790	{
				4791	char *buf;
				4792	int i, len;
				4793	PyObject str; / temporary string object. */
				4794	PyUnicodeObject *result;
				4795
				4796	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4797	if (!str)
				4798	return NULL;
				4799	result = _PyUnicode_New(len);
				4800	for (i = 0; i < len; i++)
				4801	result->str[i] = buf[i];
				4802	result->str[len] = 0;
				4803	Py_DECREF(str);
				4804	return (PyObject*)result;
				4805	}
				4806
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4807	static int
				4808	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4809	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4810	int flags,
				4811	int prec,
				4812	int type,
				4813	PyObject *v)
				4814	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4815	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4816	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4817	+ 1 + 1 = 24*/
				4818	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4819	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4820	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4821
				4822	x = PyInt_AsLong(v);
				4823	if (x == -1 && PyErr_Occurred())
				4824	return -1;
				4825	if (prec < 0)
				4826	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4827	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4828	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4829	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4830	PyErr_SetString(PyExc_OverflowError,
				4831	"formatted integer is too long (precision too long?)");
				4832	return -1;
				4833	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4834	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				4835	* but we want it (for consistency with other %#x conversions, and
				4836	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4837	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				4838	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				4839	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4840	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4841	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				4842	/* Only way to know what the platform does is to try it. */
				4843	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				4844	if (fmt[1] != (char)type) {
				4845	/* Supply our own leading 0x/0X -- needed under std C */
				4846	use_native_c_format = 0;
				4847	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				4848	}
				4849	}
				4850	if (use_native_c_format)
				4851	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4852	return usprintf(buf, fmt, x);
				4853	}
				4854
				4855	static int
				4856	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4857	size_t buflen,
				4858	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4859	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4860	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4861	if (PyUnicode_Check(v)) {
				4862	if (PyUnicode_GET_SIZE(v) != 1)
				4863	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4864	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4865	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4866
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4867	else if (PyString_Check(v)) {
				4868	if (PyString_GET_SIZE(v) != 1)
				4869	goto onError;
				4870	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4871	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4872
				4873	else {
				4874	/* Integer input truncated to a character */
				4875	long x;
				4876	x = PyInt_AsLong(v);
				4877	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4878	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4879	buf[0] = (char) x;
				4880	}
				4881	buf[1] = '\0';
				4882	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4883
				4884	onError:
				4885	PyErr_SetString(PyExc_TypeError,
				4886	"%c requires int or char");
				4887	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4888	}
				4889
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4890	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4891
				4892	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4893	chars are formatted. XXX This is a magic number. Each formatting
				4894	routine does bounds checking to ensure no overflow, but a better
				4895	solution may be to malloc a buffer of appropriate size for each
				4896	format. For now, the current solution is sufficient.
				4897	*/
				4898	#define FORMATBUFLEN (size_t)120
				4899
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4900	PyObject PyUnicode_Format(PyObject format,
				4901	PyObject *args)
				4902	{
				4903	Py_UNICODE fmt, res;
				4904	int fmtcnt, rescnt, reslen, arglen, argidx;
				4905	int args_owned = 0;
				4906	PyUnicodeObject *result = NULL;
				4907	PyObject *dict = NULL;
				4908	PyObject *uformat;
				4909
				4910	if (format == NULL \|\| args == NULL) {
				4911	PyErr_BadInternalCall();
				4912	return NULL;
				4913	}
				4914	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4915	if (uformat == NULL)
				4916	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4917	fmt = PyUnicode_AS_UNICODE(uformat);
				4918	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4919
				4920	reslen = rescnt = fmtcnt + 100;
				4921	result = _PyUnicode_New(reslen);
				4922	if (result == NULL)
				4923	goto onError;
				4924	res = PyUnicode_AS_UNICODE(result);
				4925
				4926	if (PyTuple_Check(args)) {
				4927	arglen = PyTuple_Size(args);
				4928	argidx = 0;
				4929	}
				4930	else {
				4931	arglen = -1;
				4932	argidx = -2;
				4933	}
				4934	if (args->ob_type->tp_as_mapping)
				4935	dict = args;
				4936
				4937	while (--fmtcnt >= 0) {
				4938	if (*fmt != '%') {
				4939	if (--rescnt < 0) {
				4940	rescnt = fmtcnt + 100;
				4941	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	4942	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4943	return NULL;
				4944	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4945	--rescnt;
				4946	}
				4947	res++ = fmt++;
				4948	}
				4949	else {
				4950	/* Got a format specifier */
				4951	int flags = 0;
				4952	int width = -1;
				4953	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4954	Py_UNICODE c = '\0';
				4955	Py_UNICODE fill;
				4956	PyObject *v = NULL;
				4957	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4958	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4959	Py_UNICODE sign;
				4960	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4961	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4962
				4963	fmt++;
				4964	if (*fmt == '(') {
				4965	Py_UNICODE *keystart;
				4966	int keylen;
				4967	PyObject *key;
				4968	int pcount = 1;
				4969
				4970	if (dict == NULL) {
				4971	PyErr_SetString(PyExc_TypeError,
				4972	"format requires a mapping");
				4973	goto onError;
				4974	}
				4975	++fmt;
				4976	--fmtcnt;
				4977	keystart = fmt;
				4978	/* Skip over balanced parentheses */
				4979	while (pcount > 0 && --fmtcnt >= 0) {
				4980	if (*fmt == ')')
				4981	--pcount;
				4982	else if (*fmt == '(')
				4983	++pcount;
				4984	fmt++;
				4985	}
				4986	keylen = fmt - keystart - 1;
				4987	if (fmtcnt < 0 \|\| pcount > 0) {
				4988	PyErr_SetString(PyExc_ValueError,
				4989	"incomplete format key");
				4990	goto onError;
				4991	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4992	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4993	then looked up since Python uses strings to hold
				4994	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4995	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4996	key = PyUnicode_EncodeUTF8(keystart,
				4997	keylen,
				4998	NULL);
				4999	if (key == NULL)
				5000	goto onError;
				5001	if (args_owned) {
				5002	Py_DECREF(args);
				5003	args_owned = 0;
				5004	}
				5005	args = PyObject_GetItem(dict, key);
				5006	Py_DECREF(key);
				5007	if (args == NULL) {
				5008	goto onError;
				5009	}
				5010	args_owned = 1;
				5011	arglen = -1;
				5012	argidx = -2;
				5013	}
				5014	while (--fmtcnt >= 0) {
				5015	switch (c = *fmt++) {
				5016	case '-': flags \|= F_LJUST; continue;
				5017	case '+': flags \|= F_SIGN; continue;
				5018	case ' ': flags \|= F_BLANK; continue;
				5019	case '#': flags \|= F_ALT; continue;
				5020	case '0': flags \|= F_ZERO; continue;
				5021	}
				5022	break;
				5023	}
				5024	if (c == '*') {
				5025	v = getnextarg(args, arglen, &argidx);
				5026	if (v == NULL)
				5027	goto onError;
				5028	if (!PyInt_Check(v)) {
				5029	PyErr_SetString(PyExc_TypeError,
				5030	"* wants int");
				5031	goto onError;
				5032	}
				5033	width = PyInt_AsLong(v);
				5034	if (width < 0) {
				5035	flags \|= F_LJUST;
				5036	width = -width;
				5037	}
				5038	if (--fmtcnt >= 0)
				5039	c = *fmt++;
				5040	}
				5041	else if (c >= '0' && c <= '9') {
				5042	width = c - '0';
				5043	while (--fmtcnt >= 0) {
				5044	c = *fmt++;
				5045	if (c < '0' \|\| c > '9')
				5046	break;
				5047	if ((width*10) / 10 != width) {
				5048	PyErr_SetString(PyExc_ValueError,
				5049	"width too big");
				5050	goto onError;
				5051	}
				5052	width = width*10 + (c - '0');
				5053	}
				5054	}
				5055	if (c == '.') {
				5056	prec = 0;
				5057	if (--fmtcnt >= 0)
				5058	c = *fmt++;
				5059	if (c == '*') {
				5060	v = getnextarg(args, arglen, &argidx);
				5061	if (v == NULL)
				5062	goto onError;
				5063	if (!PyInt_Check(v)) {
				5064	PyErr_SetString(PyExc_TypeError,
				5065	"* wants int");
				5066	goto onError;
				5067	}
				5068	prec = PyInt_AsLong(v);
				5069	if (prec < 0)
				5070	prec = 0;
				5071	if (--fmtcnt >= 0)
				5072	c = *fmt++;
				5073	}
				5074	else if (c >= '0' && c <= '9') {
				5075	prec = c - '0';
				5076	while (--fmtcnt >= 0) {
				5077	c = Py_CHARMASK(*fmt++);
				5078	if (c < '0' \|\| c > '9')
				5079	break;
				5080	if ((prec*10) / 10 != prec) {
				5081	PyErr_SetString(PyExc_ValueError,
				5082	"prec too big");
				5083	goto onError;
				5084	}
				5085	prec = prec*10 + (c - '0');
				5086	}
				5087	}
				5088	} /* prec */
				5089	if (fmtcnt >= 0) {
				5090	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5091	if (--fmtcnt >= 0)
				5092	c = *fmt++;
				5093	}
				5094	}
				5095	if (fmtcnt < 0) {
				5096	PyErr_SetString(PyExc_ValueError,
				5097	"incomplete format");
				5098	goto onError;
				5099	}
				5100	if (c != '%') {
				5101	v = getnextarg(args, arglen, &argidx);
				5102	if (v == NULL)
				5103	goto onError;
				5104	}
				5105	sign = 0;
				5106	fill = ' ';
				5107	switch (c) {
				5108
				5109	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5110	pbuf = formatbuf;
				5111	/* presume that buffer length is at least 1 */
				5112	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5113	len = 1;
				5114	break;
				5115
				5116	case 's':
				5117	case 'r':
				5118	if (PyUnicode_Check(v) && c == 's') {
				5119	temp = v;
				5120	Py_INCREF(temp);
				5121	}
				5122	else {
				5123	PyObject *unicode;
				5124	if (c == 's')
				5125	temp = PyObject_Str(v);
				5126	else
				5127	temp = PyObject_Repr(v);
				5128	if (temp == NULL)
				5129	goto onError;
				5130	if (!PyString_Check(temp)) {
				5131	/* XXX Note: this should never happen, since
				5132	PyObject_Repr() and PyObject_Str() assure
				5133	this */
				5134	Py_DECREF(temp);
				5135	PyErr_SetString(PyExc_TypeError,
				5136	"%s argument has non-string str()");
				5137	goto onError;
				5138	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5139	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5140	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5141	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5142	"strict");
				5143	Py_DECREF(temp);
				5144	temp = unicode;
				5145	if (temp == NULL)
				5146	goto onError;
				5147	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5148	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5149	len = PyUnicode_GET_SIZE(temp);
				5150	if (prec >= 0 && len > prec)
				5151	len = prec;
				5152	break;
				5153
				5154	case 'i':
				5155	case 'd':
				5156	case 'u':
				5157	case 'o':
				5158	case 'x':
				5159	case 'X':
				5160	if (c == 'i')
				5161	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5162	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5163	temp = formatlong(v, flags, prec, c);
				5164	if (!temp)
				5165	goto onError;
				5166	pbuf = PyUnicode_AS_UNICODE(temp);
				5167	len = PyUnicode_GET_SIZE(temp);
				5168	/* unbounded ints can always produce
				5169	a sign character! */
				5170	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5171	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5172	else {
				5173	pbuf = formatbuf;
				5174	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5175	flags, prec, c, v);
				5176	if (len < 0)
				5177	goto onError;
				5178	/* only d conversion is signed */
				5179	sign = c == 'd';
				5180	}
				5181	if (flags & F_ZERO)
				5182	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5183	break;
				5184
				5185	case 'e':
				5186	case 'E':
				5187	case 'f':
				5188	case 'g':
				5189	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5190	pbuf = formatbuf;
				5191	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5192	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5193	if (len < 0)
				5194	goto onError;
				5195	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5196	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5197	fill = '0';
				5198	break;
				5199
				5200	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5201	pbuf = formatbuf;
				5202	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5203	if (len < 0)
				5204	goto onError;
				5205	break;
				5206
				5207	default:
				5208	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5209	"unsupported format character '%c' (0x%x) "
				5210	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5211	(31<=c && c<=126) ? c : '?',
				5212	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5213	goto onError;
				5214	}
				5215	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5216	if (pbuf == '-' \|\| pbuf == '+') {
				5217	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5218	len--;
				5219	}
				5220	else if (flags & F_SIGN)
				5221	sign = '+';
				5222	else if (flags & F_BLANK)
				5223	sign = ' ';
				5224	else
				5225	sign = 0;
				5226	}
				5227	if (width < len)
				5228	width = len;
				5229	if (rescnt < width + (sign != 0)) {
				5230	reslen -= rescnt;
				5231	rescnt = width + fmtcnt + 100;
				5232	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5233	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5234	return NULL;
				5235	res = PyUnicode_AS_UNICODE(result)
				5236	+ reslen - rescnt;
				5237	}
				5238	if (sign) {
				5239	if (fill != ' ')
				5240	*res++ = sign;
				5241	rescnt--;
				5242	if (width > len)
				5243	width--;
				5244	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5245	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5246	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5247	assert(pbuf[1] == c);
				5248	if (fill != ' ') {
				5249	res++ = pbuf++;
				5250	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5251	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5252	rescnt -= 2;
				5253	width -= 2;
				5254	if (width < 0)
				5255	width = 0;
				5256	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5257	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5258	if (width > len && !(flags & F_LJUST)) {
				5259	do {
				5260	--rescnt;
				5261	*res++ = fill;
				5262	} while (--width > len);
				5263	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5264	if (fill == ' ') {
				5265	if (sign)
				5266	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5267	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5268	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5269	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5270	res++ = pbuf++;
				5271	res++ = pbuf++;
				5272	}
				5273	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5274	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5275	res += len;
				5276	rescnt -= len;
				5277	while (--width >= len) {
				5278	--rescnt;
				5279	*res++ = ' ';
				5280	}
				5281	if (dict && (argidx < arglen) && c != '%') {
				5282	PyErr_SetString(PyExc_TypeError,
				5283	"not all arguments converted");
				5284	goto onError;
				5285	}
				5286	Py_XDECREF(temp);
				5287	} /* '%' */
				5288	} /* until end */
				5289	if (argidx < arglen && !dict) {
				5290	PyErr_SetString(PyExc_TypeError,
				5291	"not all arguments converted");
				5292	goto onError;
				5293	}
				5294
				5295	if (args_owned) {
				5296	Py_DECREF(args);
				5297	}
				5298	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5299	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5300	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5301	return (PyObject *)result;
				5302
				5303	onError:
				5304	Py_XDECREF(result);
				5305	Py_DECREF(uformat);
				5306	if (args_owned) {
				5307	Py_DECREF(args);
				5308	}
				5309	return NULL;
				5310	}
				5311
				5312	static PyBufferProcs unicode_as_buffer = {
				5313	(getreadbufferproc) unicode_buffer_getreadbuf,
				5314	(getwritebufferproc) unicode_buffer_getwritebuf,
				5315	(getsegcountproc) unicode_buffer_getsegcount,
				5316	(getcharbufferproc) unicode_buffer_getcharbuf,
				5317	};
				5318
				5319	PyTypeObject PyUnicode_Type = {
				5320	PyObject_HEAD_INIT(&PyType_Type)
				5321	0, /* ob_size */
				5322	"unicode", /* tp_name */
				5323	sizeof(PyUnicodeObject), /* tp_size */
				5324	0, /* tp_itemsize */
				5325	/* Slots */
				5326	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5327	0, /* tp_print */
				5328	(getattrfunc)unicode_getattr, /* tp_getattr */
				5329	0, /* tp_setattr */
				5330	(cmpfunc) unicode_compare, /* tp_compare */
				5331	(reprfunc) unicode_repr, /* tp_repr */
				5332	0, /* tp_as_number */
				5333	&unicode_as_sequence, /* tp_as_sequence */
				5334	0, /* tp_as_mapping */
				5335	(hashfunc) unicode_hash, /* tp_hash*/
				5336	0, /* tp_call*/
				5337	(reprfunc) unicode_str, /* tp_str */
				5338	(getattrofunc) NULL, /* tp_getattro */
				5339	(setattrofunc) NULL, /* tp_setattro */
				5340	&unicode_as_buffer, /* tp_as_buffer */
				5341	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5342	};
				5343
				5344	/* Initialize the Unicode implementation */
				5345
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5346	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5347	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5348	int i;
				5349
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5350	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5351	unicode_freelist = NULL;
				5352	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5353	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5354	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5355	for (i = 0; i < 256; i++)
				5356	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5357	}
				5358
				5359	/* Finalize the Unicode implementation */
				5360
				5361	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5362	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5363	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5364	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5365	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5366
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5367	Py_XDECREF(unicode_empty);
				5368	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5369
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5370	for (i = 0; i < 256; i++) {
				5371	if (unicode_latin1[i]) {
				5372	Py_DECREF(unicode_latin1[i]);
				5373	unicode_latin1[i] = NULL;
				5374	}
				5375	}
				5376
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5377	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5378	PyUnicodeObject *v = u;
				5379	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5380	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5381	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5382	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5383	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5384	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5385	unicode_freelist = NULL;
				5386	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5387	}