Blame - Objects/unicodeobject.c - platform/external/python/cpython2

blob: 1319c7c52ad3ab13dc6659de5ee605e34f1ff36a [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Upper bound on the number of deallocated Unicode objects that are
   kept on the free list for later reuse. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length (in Py_UNICODE
   elements) less than this limit. This reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   selected by whether WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	401	const char *s;
				402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	405
				406	if (obj == NULL) {
				407	PyErr_BadInternalCall();
				408	return NULL;
				409	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410
				411	/* Coerce object */
				412	if (PyInstance_Check(obj)) {
				413	PyObject *func;
				414	func = PyObject_GetAttrString(obj, "__str__");
				415	if (func == NULL) {
				416	PyErr_SetString(PyExc_TypeError,
				417	"coercing to Unicode: instance doesn't define __str__");
				418	return NULL;
				419	}
				420	obj = PyEval_CallObject(func, NULL);
				421	Py_DECREF(func);
				422	if (obj == NULL)
				423	return NULL;
				424	owned = 1;
				425	}
				426	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	428	v = obj;
				429	if (encoding) {
				430	PyErr_SetString(PyExc_TypeError,
				431	"decoding Unicode is not supported");
				432	return NULL;
				433	}
				434	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	435	}
				436	else if (PyString_Check(obj)) {
				437	s = PyString_AS_STRING(obj);
				438	len = PyString_GET_SIZE(obj);
				439	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	440	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				441	/* Overwrite the error message with something more useful in
				442	case of a TypeError. */
				443	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	444	PyErr_Format(PyExc_TypeError,
				445	"coercing to Unicode: need string or buffer, "
				446	"%.80s found",
				447	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	448	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	449	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	450
				451	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452	if (len == 0) {
				453	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	454	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	455	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	456	else
				457	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	458
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	459	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	460	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	461	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	462	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	463	return v;
				464
				465	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	466	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	467	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	468	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
				471
				472	PyObject PyUnicode_Decode(const char s,
				473	int size,
				474	const char *encoding,
				475	const char *errors)
				476	{
				477	PyObject buffer = NULL, unicode;
				478
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	479	if (encoding == NULL)
				480	encoding = PyUnicode_GetDefaultEncoding();
				481
				482	/* Shortcuts for common default encodings */
				483	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	485	else if (strcmp(encoding, "latin-1") == 0)
				486	return PyUnicode_DecodeLatin1(s, size, errors);
				487	else if (strcmp(encoding, "ascii") == 0)
				488	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	489
				490	/* Decode via the codec registry */
				491	buffer = PyBuffer_FromMemory((void *)s, size);
				492	if (buffer == NULL)
				493	goto onError;
				494	unicode = PyCodec_Decode(buffer, encoding, errors);
				495	if (unicode == NULL)
				496	goto onError;
				497	if (!PyUnicode_Check(unicode)) {
				498	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	499	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	500	unicode->ob_type->tp_name);
				501	Py_DECREF(unicode);
				502	goto onError;
				503	}
				504	Py_DECREF(buffer);
				505	return unicode;
				506
				507	onError:
				508	Py_XDECREF(buffer);
				509	return NULL;
				510	}
				511
				512	PyObject PyUnicode_Encode(const Py_UNICODE s,
				513	int size,
				514	const char *encoding,
				515	const char *errors)
				516	{
				517	PyObject v, unicode;
				518
				519	unicode = PyUnicode_FromUnicode(s, size);
				520	if (unicode == NULL)
				521	return NULL;
				522	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				523	Py_DECREF(unicode);
				524	return v;
				525	}
				526
				527	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject *v;
				532
				533	if (!PyUnicode_Check(unicode)) {
				534	PyErr_BadArgument();
				535	goto onError;
				536	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	537
				538	if (encoding == NULL)
				539	encoding = PyUnicode_GetDefaultEncoding();
				540
				541	/* Shortcuts for common default encodings */
				542	if (errors == NULL) {
				543	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	544	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	545	else if (strcmp(encoding, "latin-1") == 0)
				546	return PyUnicode_AsLatin1String(unicode);
				547	else if (strcmp(encoding, "ascii") == 0)
				548	return PyUnicode_AsASCIIString(unicode);
				549	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	550
				551	/* Encode via the codec registry */
				552	v = PyCodec_Encode(unicode, encoding, errors);
				553	if (v == NULL)
				554	goto onError;
				555	/* XXX Should we really enforce this ? */
				556	if (!PyString_Check(v)) {
				557	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	558	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	v->ob_type->tp_name);
				560	Py_DECREF(v);
				561	goto onError;
				562	}
				563	return v;
				564
				565	onError:
				566	return NULL;
				567	}
				568
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	569	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				570	const char *errors)
				571	{
				572	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				573
				574	if (v)
				575	return v;
				576	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				577	if (v && errors == NULL)
				578	((PyUnicodeObject *)unicode)->defenc = v;
				579	return v;
				580	}
				581
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	582	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				583	{
				584	if (!PyUnicode_Check(unicode)) {
				585	PyErr_BadArgument();
				586	goto onError;
				587	}
				588	return PyUnicode_AS_UNICODE(unicode);
				589
				590	onError:
				591	return NULL;
				592	}
				593
				594	int PyUnicode_GetSize(PyObject *unicode)
				595	{
				596	if (!PyUnicode_Check(unicode)) {
				597	PyErr_BadArgument();
				598	goto onError;
				599	}
				600	return PyUnicode_GET_SIZE(unicode);
				601
				602	onError:
				603	return -1;
				604	}
				605
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	606	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	607	{
				608	return unicode_default_encoding;
				609	}
				610
				611	int PyUnicode_SetDefaultEncoding(const char *encoding)
				612	{
				613	PyObject *v;
				614
				615	/* Make sure the encoding is valid. As side effect, this also
				616	loads the encoding into the codec registry cache. */
				617	v = _PyCodec_Lookup(encoding);
				618	if (v == NULL)
				619	goto onError;
				620	Py_DECREF(v);
				621	strncpy(unicode_default_encoding,
				622	encoding,
				623	sizeof(unicode_default_encoding));
				624	return 0;
				625
				626	onError:
				627	return -1;
				628	}
				629
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	630	/* --- UTF-8 Codec -------------------------------------------------------- */
				631
				632	static
				633	char utf8_code_length[256] = {
				634	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				635	illegal prefix. see RFC 2279 for details */
				636	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				637	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				638	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				639	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				640	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				641	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				642	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				643	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				644	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				645	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				646	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				647	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				648	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				649	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				650	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				651	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				652	};
				653
				654	static
				655	int utf8_decoding_error(const char **source,
				656	Py_UNICODE **dest,
				657	const char *errors,
				658	const char *details)
				659	{
				660	if ((errors == NULL) \|\|
				661	(strcmp(errors,"strict") == 0)) {
				662	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	663	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	664	details);
				665	return -1;
				666	}
				667	else if (strcmp(errors,"ignore") == 0) {
				668	(*source)++;
				669	return 0;
				670	}
				671	else if (strcmp(errors,"replace") == 0) {
				672	(*source)++;
				673	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				674	(*dest)++;
				675	return 0;
				676	}
				677	else {
				678	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	679	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	680	errors);
				681	return -1;
				682	}
				683	}
				684
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	685	PyObject PyUnicode_DecodeUTF8(const char s,
				686	int size,
				687	const char *errors)
				688	{
				689	int n;
				690	const char *e;
				691	PyUnicodeObject *unicode;
				692	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	693	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	694
				695	/* Note: size will always be longer than the resulting Unicode
				696	character count */
				697	unicode = _PyUnicode_New(size);
				698	if (!unicode)
				699	return NULL;
				700	if (size == 0)
				701	return (PyObject *)unicode;
				702
				703	/* Unpack UTF-8 encoded data */
				704	p = unicode->str;
				705	e = s + size;
				706
				707	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	708	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	709
				710	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	711	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	712	s++;
				713	continue;
				714	}
				715
				716	n = utf8_code_length[ch];
				717
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	718	if (s + n > e) {
				719	errmsg = "unexpected end of data";
				720	goto utf8Error;
				721	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	722
				723	switch (n) {
				724
				725	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	726	errmsg = "unexpected code byte";
				727	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	728
				729	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	730	errmsg = "internal error";
				731	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	732
				733	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	734	if ((s[1] & 0xc0) != 0x80) {
				735	errmsg = "invalid data";
				736	goto utf8Error;
				737	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	738	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	739	if (ch < 0x80) {
				740	errmsg = "illegal encoding";
				741	goto utf8Error;
				742	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	743	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	745	break;
				746
				747	case 3:
				748	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	749	(s[2] & 0xc0) != 0x80) {
				750	errmsg = "invalid data";
				751	goto utf8Error;
				752	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	753	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	754	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				755	errmsg = "illegal encoding";
				756	goto utf8Error;
				757	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	758	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	759	*p++ = (Py_UNICODE)ch;
				760	break;
				761
				762	case 4:
				763	if ((s[1] & 0xc0) != 0x80 \|\|
				764	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	765	(s[3] & 0xc0) != 0x80) {
				766	errmsg = "invalid data";
				767	goto utf8Error;
				768	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	769	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				770	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				771	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	772	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	773	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	774	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	775	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	776	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	777	errmsg = "illegal encoding";
				778	goto utf8Error;
				779	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	780	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	781	*p++ = (Py_UNICODE)ch;
				782	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	783	/* compute and append the two surrogates: */
				784
				785	/* translate from 10000..10FFFF to 0..FFFF */
				786	ch -= 0x10000;
				787
				788	/* high surrogate = top 10 bits added to D800 */
				789	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				790
				791	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	792	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	793	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	794	break;
				795
				796	default:
				797	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	798	errmsg = "unsupported Unicode code range";
				799	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	800	}
				801	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	802	continue;
				803
				804	utf8Error:
				805	if (utf8_decoding_error(&s, &p, errors, errmsg))
				806	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	807	}
				808
				809	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	810	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	811	goto onError;
				812
				813	return (PyObject *)unicode;
				814
				815	onError:
				816	Py_DECREF(unicode);
				817	return NULL;
				818	}
				819
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme named by errors to a UTF-8 encoding
   problem described by details.

   Returns 0 if encoding may continue ("ignore", or "replace" after a
   '?' has been appended through *dest) and -1 with an exception set
   otherwise.  source is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* A missing scheme is treated like "strict" */
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        /* write the substitute and step the output cursor */
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	853
				854	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				855	int size,
				856	const char *errors)
				857	{
				858	PyObject *v;
				859	char *p;
				860	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	861	Py_UCS4 ch2;
				862	unsigned int cbAllocated = 3 * size;
				863	unsigned int cbWritten = 0;
				864	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	865
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	866	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	867	if (v == NULL)
				868	return NULL;
				869	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	870	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	871
				872	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	873	while (i < size) {
				874	Py_UCS4 ch = s[i++];
				875	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	876	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	877	cbWritten++;
				878	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	879	else if (ch < 0x0800) {
				880	*p++ = 0xc0 \| (ch >> 6);
				881	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	882	cbWritten += 2;
				883	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	884	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	885	/* Check for high surrogate */
				886	if (0xD800 <= ch && ch <= 0xDBFF) {
				887	if (i != size) {
				888	ch2 = s[i];
				889	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				890
				891	if (cbWritten >= (cbAllocated - 4)) {
				892	/* Provide enough room for some more
				893	surrogates */
				894	cbAllocated += 4*10;
				895	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	896	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	897	}
				898
				899	/* combine the two values */
				900	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				901
				902	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	903	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	904	i++;
				905	cbWritten += 4;
				906	}
				907	}
				908	}
				909	else {
				910	*p++ = (char)(0xe0 \| (ch >> 12));
				911	cbWritten += 3;
				912	}
				913	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				914	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	915	} else {
				916	*p++ = 0xf0 \| (ch>>18);
				917	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				918	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				919	*p++ = 0x80 \| (ch & 0x3f);
				920	cbWritten += 4;
				921	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	922	}
				923	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	924	if (_PyString_Resize(&v, p - q))
				925	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	926	return v;
				927
				928	onError:
				929	Py_DECREF(v);
				930	return NULL;
				931	}
				932
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	933	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				934	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	935	if (!PyUnicode_Check(unicode)) {
				936	PyErr_BadArgument();
				937	return NULL;
				938	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	939	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				940	PyUnicode_GET_SIZE(unicode),
				941	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	942	}
				943
				944	/* --- UTF-16 Codec ------------------------------------------------------- */
				945
				946	static
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	947	int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	948	Py_UNICODE **dest,
				949	const char *errors,
				950	const char *details)
				951	{
				952	if ((errors == NULL) \|\|
				953	(strcmp(errors,"strict") == 0)) {
				954	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	955	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	956	details);
				957	return -1;
				958	}
				959	else if (strcmp(errors,"ignore") == 0) {
				960	return 0;
				961	}
				962	else if (strcmp(errors,"replace") == 0) {
				963	if (dest) {
				964	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				965	(*dest)++;
				966	}
				967	return 0;
				968	}
				969	else {
				970	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	971	"UTF-16 decoding error; "
				972	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	973	errors);
				974	return -1;
				975	}
				976	}
				977
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	978	PyObject PyUnicode_DecodeUTF16(const char s,
				979	int size,
				980	const char *errors,
				981	int *byteorder)
				982	{
				983	PyUnicodeObject *unicode;
				984	Py_UNICODE *p;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	985	const Py_UCS2 q, e;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	986	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	987	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	988
				989	/* size should be an even number */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	990	if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	991	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				992	return NULL;
				993	/* The remaining input chars are ignored if we fall through
				994	here... */
				995	}
				996
				997	/* Note: size will always be longer than the resulting Unicode
				998	character count */
				999	unicode = _PyUnicode_New(size);
				1000	if (!unicode)
				1001	return NULL;
				1002	if (size == 0)
				1003	return (PyObject *)unicode;
				1004
				1005	/* Unpack UTF-16 encoded data */
				1006	p = unicode->str;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1007	q = (Py_UCS2 *)s;
				1008	e = q + (size / sizeof(Py_UCS2));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1009
				1010	if (byteorder)
				1011	bo = *byteorder;
				1012
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1013	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1014	byte order setting accordingly. In native mode, the leading BOM
				1015	mark is skipped, in all other modes, it is copied to the output
				1016	stream as-is (giving a ZWNBSP character). */
				1017	if (bo == 0) {
				1018	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1019	if (*q == 0xFEFF) {
				1020	q++;
				1021	bo = -1;
				1022	} else if (*q == 0xFFFE) {
				1023	q++;
				1024	bo = 1;
				1025	}
				1026	#else
				1027	if (*q == 0xFEFF) {
				1028	q++;
				1029	bo = 1;
				1030	} else if (*q == 0xFFFE) {
				1031	q++;
				1032	bo = -1;
				1033	}
				1034	#endif
				1035	}
				1036
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1037	while (q < e) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1038	register Py_UCS2 ch = *q++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1039
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1040	/* Swap input bytes if needed. (This assumes
				1041	sizeof(Py_UNICODE) == 2 !) */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1042	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1043	if (bo == 1)
				1044	ch = (ch >> 8) \| (ch << 8);
				1045	#else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1046	if (bo == -1)
				1047	ch = (ch >> 8) \| (ch << 8);
				1048	#endif
				1049	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1050	*p++ = ch;
				1051	continue;
				1052	}
				1053
				1054	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1055	if (q >= e) {
				1056	errmsg = "unexpected end of data";
				1057	goto utf16Error;
				1058	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1059	if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1060	Py_UCS2 ch2 = *q++;
				1061	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1062	if (bo == 1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1063	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1064	#else
				1065	if (bo == -1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1066	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1067	#endif
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1068	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1069	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1070	*p++ = ch;
				1071	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1072	#else
				1073	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1074	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1075	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1076	}
				1077	else {
				1078	errmsg = "illegal UTF-16 surrogate";
				1079	goto utf16Error;
				1080	}
				1081
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1082	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1083	errmsg = "illegal encoding";
				1084	/* Fall through to report the error */
				1085
				1086	utf16Error:
				1087	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1088	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1089	}
				1090
				1091	if (byteorder)
				1092	*byteorder = bo;
				1093
				1094	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1095	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1096	goto onError;
				1097
				1098	return (PyObject *)unicode;
				1099
				1100	onError:
				1101	Py_DECREF(unicode);
				1102	return NULL;
				1103	}
				1104
				1105	#undef UTF16_ERROR
				1106
				1107	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1108	int size,
				1109	const char *errors,
				1110	int byteorder)
				1111	{
				1112	PyObject *v;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1113	Py_UCS2 *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1114	char *q;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1115	int i, pairs, doswap = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1116
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1117	for (i = pairs = 0; i < size; i++)
				1118	if (s[i] >= 0x10000)
				1119	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1120	v = PyString_FromStringAndSize(NULL,
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1121	sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1122	if (v == NULL)
				1123	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1124
				1125	q = PyString_AS_STRING(v);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1126	p = (Py_UCS2 *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1127	if (byteorder == 0)
				1128	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1129	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1130	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1131	if (byteorder == 0 \|\|
				1132	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1133	byteorder == -1
				1134	#else
				1135	byteorder == 1
				1136	#endif
				1137	)
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1138	doswap = 0;
				1139	while (size-- > 0) {
				1140	Py_UNICODE ch = *s++;
				1141	Py_UNICODE ch2 = 0;
				1142	if (ch >= 0x10000) {
				1143	ch2 = 0xDC00\|((ch-0x10000) & 0x3FF);
				1144	ch = 0xD800\|((ch-0x10000)>>10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1145	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1146	if (doswap){
				1147	*p++ = (ch >> 8) \| (ch << 8);
				1148	if (ch2)
				1149	*p++ = (ch2 >> 8) \| (ch2 << 8);
				1150	}else{
				1151	*p++ = ch;
				1152	if(ch2)
				1153	*p++ = ch2;
				1154	}
				1155	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1156	return v;
				1157	}
				1158
				1159	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1160	{
				1161	if (!PyUnicode_Check(unicode)) {
				1162	PyErr_BadArgument();
				1163	return NULL;
				1164	}
				1165	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1166	PyUnicode_GET_SIZE(unicode),
				1167	NULL,
				1168	0);
				1169	}
				1170
				1171	/* --- Unicode Escape Codec ----------------------------------------------- */
				1172
				1173	static
				1174	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1175	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1176	const char *errors,
				1177	const char *details)
				1178	{
				1179	if ((errors == NULL) \|\|
				1180	(strcmp(errors,"strict") == 0)) {
				1181	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1182	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1183	details);
				1184	return -1;
				1185	}
				1186	else if (strcmp(errors,"ignore") == 0) {
				1187	return 0;
				1188	}
				1189	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1190	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1191	return 0;
				1192	}
				1193	else {
				1194	PyErr_Format(PyExc_ValueError,
				1195	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1196	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1197	errors);
				1198	return -1;
				1199	}
				1200	}
				1201
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1202	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1203
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1204	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1205	int size,
				1206	const char *errors)
				1207	{
				1208	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1209	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1211	char* message;
				1212	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1213
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1214	/* Escaped strings will always be longer than the resulting
				1215	Unicode string, so we start with size here and then reduce the
				1216	length after conversion to the true value. */
				1217	v = _PyUnicode_New(size);
				1218	if (v == NULL)
				1219	goto onError;
				1220	if (size == 0)
				1221	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1222
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1223	p = buf = PyUnicode_AS_UNICODE(v);
				1224	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1225
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1226	while (s < end) {
				1227	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1228	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1229	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1230
				1231	/* Non-escape characters are interpreted as Unicode ordinals */
				1232	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1233	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1234	continue;
				1235	}
				1236
				1237	/* \ - Escapes */
				1238	s++;
				1239	switch (*s++) {
				1240
				1241	/* \x escapes */
				1242	case '\n': break;
				1243	case '\\': *p++ = '\\'; break;
				1244	case '\'': *p++ = '\''; break;
				1245	case '\"': *p++ = '\"'; break;
				1246	case 'b': *p++ = '\b'; break;
				1247	case 'f': p++ = '\014'; break; / FF */
				1248	case 't': *p++ = '\t'; break;
				1249	case 'n': *p++ = '\n'; break;
				1250	case 'r': *p++ = '\r'; break;
				1251	case 'v': p++ = '\013'; break; / VT */
				1252	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1253
				1254	/* \OOO (octal) escapes */
				1255	case '0': case '1': case '2': case '3':
				1256	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1257	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1258	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1259	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1260	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1261	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1262	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1263	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1264	break;
				1265
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1266	/* hex escapes */
				1267	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1268	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1269	digits = 2;
				1270	message = "truncated \\xXX escape";
				1271	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1272
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1273	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1274	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1275	digits = 4;
				1276	message = "truncated \\uXXXX escape";
				1277	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1278
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1279	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1280	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1281	digits = 8;
				1282	message = "truncated \\UXXXXXXXX escape";
				1283	hexescape:
				1284	chr = 0;
				1285	for (i = 0; i < digits; i++) {
				1286	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1287	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1288	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1289	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1290	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1291	i++;
				1292	break;
				1293	}
				1294	chr = (chr<<4) & ~0xF;
				1295	if (c >= '0' && c <= '9')
				1296	chr += c - '0';
				1297	else if (c >= 'a' && c <= 'f')
				1298	chr += 10 + c - 'a';
				1299	else
				1300	chr += 10 + c - 'A';
				1301	}
				1302	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1303	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1304	/* when we get here, chr is a 32-bit unicode character */
				1305	if (chr <= 0xffff)
				1306	/* UCS-2 character */
				1307	*p++ = (Py_UNICODE) chr;
				1308	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1309	/* UCS-4 character. Either store directly, or as
				1310	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1311	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1312	*p++ = chr;
				1313	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1314	chr -= 0x10000L;
				1315	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1316	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1317	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1318	} else {
				1319	if (unicodeescape_decoding_error(
				1320	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1321	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1322	)
				1323	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1324	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1325	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1326	break;
				1327
				1328	/* \N{name} */
				1329	case 'N':
				1330	message = "malformed \\N character escape";
				1331	if (ucnhash_CAPI == NULL) {
				1332	/* load the unicode data module */
				1333	PyObject m, v;
				1334	m = PyImport_ImportModule("unicodedata");
				1335	if (m == NULL)
				1336	goto ucnhashError;
				1337	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1338	Py_DECREF(m);
				1339	if (v == NULL)
				1340	goto ucnhashError;
				1341	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1342	Py_DECREF(v);
				1343	if (ucnhash_CAPI == NULL)
				1344	goto ucnhashError;
				1345	}
				1346	if (*s == '{') {
				1347	const char *start = s+1;
				1348	/* look for the closing brace */
				1349	while (*s != '}' && s < end)
				1350	s++;
				1351	if (s > start && s < end && *s == '}') {
				1352	/* found a name. look it up in the unicode database */
				1353	message = "unknown Unicode character name";
				1354	s++;
				1355	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1356	goto store;
				1357	}
				1358	}
				1359	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1360	goto onError;
				1361	*p++ = x;
				1362	break;
				1363
				1364	default:
				1365	*p++ = '\\';
				1366	*p++ = (unsigned char)s[-1];
				1367	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1368	}
				1369	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1370	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1371	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1372	return (PyObject *)v;
				1373
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1374	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1375	PyErr_SetString(
				1376	PyExc_UnicodeError,
				1377	"\\N escapes not supported (can't load unicodedata module)"
				1378	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1379	return NULL;
				1380
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1381	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1382	Py_XDECREF(v);
				1383	return NULL;
				1384	}
				1385
				1386	/* Return a Unicode-Escape string version of the Unicode object.
				1387
				1388	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1389	appropriate.
				1390
				1391	*/
				1392
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1393	static const Py_UNICODE findchar(const Py_UNICODE s,
				1394	int size,
				1395	Py_UNICODE ch);
				1396
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1397	static
				1398	PyObject unicodeescape_string(const Py_UNICODE s,
				1399	int size,
				1400	int quotes)
				1401	{
				1402	PyObject *repr;
				1403	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1404
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1405	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1406
				1407	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1408	if (repr == NULL)
				1409	return NULL;
				1410
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1411	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1412
				1413	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1414	*p++ = 'u';
				1415	*p++ = (findchar(s, size, '\'') &&
				1416	!findchar(s, size, '"')) ? '"' : '\'';
				1417	}
				1418	while (size-- > 0) {
				1419	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1420
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1421	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1422	if (quotes &&
				1423	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1424	*p++ = '\\';
				1425	*p++ = (char) ch;
				1426	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1427
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1428	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1429	/* Map 21-bit characters to '\U00xxxxxx' */
				1430	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1431	int offset = p - PyString_AS_STRING(repr);
				1432
				1433	/* Resize the string if necessary */
				1434	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1435	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1436	goto onError;
				1437	p = PyString_AS_STRING(repr) + offset;
				1438	}
				1439
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1440	*p++ = '\\';
				1441	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1442	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1443	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1444	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1445	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1446	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1447	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1448	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1449	*p++ = hexdigit[ch & 0x0000000F];
				1450	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1451	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1452	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1453	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1454	else if (ch >= 0xD800 && ch < 0xDC00) {
				1455	Py_UNICODE ch2;
				1456	Py_UCS4 ucs;
				1457
				1458	ch2 = *s++;
				1459	size--;
				1460	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1461	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1462	*p++ = '\\';
				1463	*p++ = 'U';
				1464	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1465	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1466	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1467	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1468	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1469	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1470	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1471	*p++ = hexdigit[ucs & 0x0000000F];
				1472	continue;
				1473	}
				1474	/* Fall through: isolated surrogates are copied as-is */
				1475	s--;
				1476	size++;
				1477	}
				1478
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1479	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1480	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1481	*p++ = '\\';
				1482	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1483	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1484	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1485	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1486	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1487	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1488
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1489	/* Map special whitespace to '\t', \n', '\r' */
				1490	else if (ch == '\t') {
				1491	*p++ = '\\';
				1492	*p++ = 't';
				1493	}
				1494	else if (ch == '\n') {
				1495	*p++ = '\\';
				1496	*p++ = 'n';
				1497	}
				1498	else if (ch == '\r') {
				1499	*p++ = '\\';
				1500	*p++ = 'r';
				1501	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1502
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1503	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1504	else if (ch < ' ' \|\| ch >= 128) {
				1505	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1506	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1507	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1508	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1509	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1510
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1511	/* Copy everything else as-is */
				1512	else
				1513	*p++ = (char) ch;
				1514	}
				1515	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1516	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1517
				1518	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1519	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1520	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1521
				1522	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1523
				1524	onError:
				1525	Py_DECREF(repr);
				1526	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1527	}
				1528
				1529	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1530	int size)
				1531	{
				1532	return unicodeescape_string(s, size, 0);
				1533	}
				1534
				1535	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1536	{
				1537	if (!PyUnicode_Check(unicode)) {
				1538	PyErr_BadArgument();
				1539	return NULL;
				1540	}
				1541	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1542	PyUnicode_GET_SIZE(unicode));
				1543	}
				1544
				1545	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1546
				1547	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1548	int size,
				1549	const char *errors)
				1550	{
				1551	PyUnicodeObject *v;
				1552	Py_UNICODE p, buf;
				1553	const char *end;
				1554	const char *bs;
				1555
				1556	/* Escaped strings will always be longer than the resulting
				1557	Unicode string, so we start with size here and then reduce the
				1558	length after conversion to the true value. */
				1559	v = _PyUnicode_New(size);
				1560	if (v == NULL)
				1561	goto onError;
				1562	if (size == 0)
				1563	return (PyObject *)v;
				1564	p = buf = PyUnicode_AS_UNICODE(v);
				1565	end = s + size;
				1566	while (s < end) {
				1567	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1568	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1569	int i;
				1570
				1571	/* Non-escape characters are interpreted as Unicode ordinals */
				1572	if (*s != '\\') {
				1573	p++ = (unsigned char)s++;
				1574	continue;
				1575	}
				1576
				1577	/* \u-escapes are only interpreted iff the number of leading
				1578	backslashes if odd */
				1579	bs = s;
				1580	for (;s < end;) {
				1581	if (*s != '\\')
				1582	break;
				1583	p++ = (unsigned char)s++;
				1584	}
				1585	if (((s - bs) & 1) == 0 \|\|
				1586	s >= end \|\|
				1587	*s != 'u') {
				1588	continue;
				1589	}
				1590	p--;
				1591	s++;
				1592
				1593	/* \uXXXX with 4 hex digits */
				1594	for (x = 0, i = 0; i < 4; i++) {
				1595	c = (unsigned char)s[i];
				1596	if (!isxdigit(c)) {
				1597	if (unicodeescape_decoding_error(&s, &x, errors,
				1598	"truncated \\uXXXX"))
				1599	goto onError;
				1600	i++;
				1601	break;
				1602	}
				1603	x = (x<<4) & ~0xF;
				1604	if (c >= '0' && c <= '9')
				1605	x += c - '0';
				1606	else if (c >= 'a' && c <= 'f')
				1607	x += 10 + c - 'a';
				1608	else
				1609	x += 10 + c - 'A';
				1610	}
				1611	s += i;
				1612	*p++ = x;
				1613	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1614	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1615	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1616	return (PyObject *)v;
				1617
				1618	onError:
				1619	Py_XDECREF(v);
				1620	return NULL;
				1621	}
				1622
				1623	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1624	int size)
				1625	{
				1626	PyObject *repr;
				1627	char *p;
				1628	char *q;
				1629
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1630	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1631
				1632	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1633	if (repr == NULL)
				1634	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1635	if (size == 0)
				1636	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1637
				1638	p = q = PyString_AS_STRING(repr);
				1639	while (size-- > 0) {
				1640	Py_UNICODE ch = *s++;
				1641	/* Map 16-bit characters to '\uxxxx' */
				1642	if (ch >= 256) {
				1643	*p++ = '\\';
				1644	*p++ = 'u';
				1645	*p++ = hexdigit[(ch >> 12) & 0xf];
				1646	*p++ = hexdigit[(ch >> 8) & 0xf];
				1647	*p++ = hexdigit[(ch >> 4) & 0xf];
				1648	*p++ = hexdigit[ch & 15];
				1649	}
				1650	/* Copy everything else as-is */
				1651	else
				1652	*p++ = (char) ch;
				1653	}
				1654	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1655	if (_PyString_Resize(&repr, p - q))
				1656	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1657
				1658	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1659
				1660	onError:
				1661	Py_DECREF(repr);
				1662	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1663	}
				1664
				1665	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1666	{
				1667	if (!PyUnicode_Check(unicode)) {
				1668	PyErr_BadArgument();
				1669	return NULL;
				1670	}
				1671	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1672	PyUnicode_GET_SIZE(unicode));
				1673	}
				1674
				1675	/* --- Latin-1 Codec ------------------------------------------------------ */
				1676
				1677	PyObject PyUnicode_DecodeLatin1(const char s,
				1678	int size,
				1679	const char *errors)
				1680	{
				1681	PyUnicodeObject *v;
				1682	Py_UNICODE *p;
				1683
				1684	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1685	if (size == 1 && (unsigned char)s < 256) {
				1686	Py_UNICODE r = (unsigned char)s;
				1687	return PyUnicode_FromUnicode(&r, 1);
				1688	}
				1689
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1690	v = _PyUnicode_New(size);
				1691	if (v == NULL)
				1692	goto onError;
				1693	if (size == 0)
				1694	return (PyObject *)v;
				1695	p = PyUnicode_AS_UNICODE(v);
				1696	while (size-- > 0)
				1697	p++ = (unsigned char)s++;
				1698	return (PyObject *)v;
				1699
				1700	onError:
				1701	Py_XDECREF(v);
				1702	return NULL;
				1703	}
				1704
				1705	static
				1706	int latin1_encoding_error(const Py_UNICODE **source,
				1707	char **dest,
				1708	const char *errors,
				1709	const char *details)
				1710	{
				1711	if ((errors == NULL) \|\|
				1712	(strcmp(errors,"strict") == 0)) {
				1713	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1714	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1715	details);
				1716	return -1;
				1717	}
				1718	else if (strcmp(errors,"ignore") == 0) {
				1719	return 0;
				1720	}
				1721	else if (strcmp(errors,"replace") == 0) {
				1722	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1723	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1724	return 0;
				1725	}
				1726	else {
				1727	PyErr_Format(PyExc_ValueError,
				1728	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1729	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1730	errors);
				1731	return -1;
				1732	}
				1733	}
				1734
				1735	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1736	int size,
				1737	const char *errors)
				1738	{
				1739	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1740	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1741
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1742	repr = PyString_FromStringAndSize(NULL, size);
				1743	if (repr == NULL)
				1744	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1745	if (size == 0)
				1746	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1747
				1748	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1749	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1750	while (size-- > 0) {
				1751	Py_UNICODE ch = *p++;
				1752	if (ch >= 256) {
				1753	if (latin1_encoding_error(&p, &s, errors,
				1754	"ordinal not in range(256)"))
				1755	goto onError;
				1756	}
				1757	else
				1758	*s++ = (char)ch;
				1759	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1760	/* Resize if error handling skipped some characters */
				1761	if (s - start < PyString_GET_SIZE(repr))
				1762	if (_PyString_Resize(&repr, s - start))
				1763	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1764	return repr;
				1765
				1766	onError:
				1767	Py_DECREF(repr);
				1768	return NULL;
				1769	}
				1770
				1771	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1772	{
				1773	if (!PyUnicode_Check(unicode)) {
				1774	PyErr_BadArgument();
				1775	return NULL;
				1776	}
				1777	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1778	PyUnicode_GET_SIZE(unicode),
				1779	NULL);
				1780	}
				1781
				1782	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1783
				1784	static
				1785	int ascii_decoding_error(const char **source,
				1786	Py_UNICODE **dest,
				1787	const char *errors,
				1788	const char *details)
				1789	{
				1790	if ((errors == NULL) \|\|
				1791	(strcmp(errors,"strict") == 0)) {
				1792	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1793	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1794	details);
				1795	return -1;
				1796	}
				1797	else if (strcmp(errors,"ignore") == 0) {
				1798	return 0;
				1799	}
				1800	else if (strcmp(errors,"replace") == 0) {
				1801	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1802	(*dest)++;
				1803	return 0;
				1804	}
				1805	else {
				1806	PyErr_Format(PyExc_ValueError,
				1807	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1808	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1809	errors);
				1810	return -1;
				1811	}
				1812	}
				1813
				1814	PyObject PyUnicode_DecodeASCII(const char s,
				1815	int size,
				1816	const char *errors)
				1817	{
				1818	PyUnicodeObject *v;
				1819	Py_UNICODE *p;
				1820
				1821	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1822	if (size == 1 && (unsigned char)s < 128) {
				1823	Py_UNICODE r = (unsigned char)s;
				1824	return PyUnicode_FromUnicode(&r, 1);
				1825	}
				1826
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1827	v = _PyUnicode_New(size);
				1828	if (v == NULL)
				1829	goto onError;
				1830	if (size == 0)
				1831	return (PyObject *)v;
				1832	p = PyUnicode_AS_UNICODE(v);
				1833	while (size-- > 0) {
				1834	register unsigned char c;
				1835
				1836	c = (unsigned char)*s++;
				1837	if (c < 128)
				1838	*p++ = c;
				1839	else if (ascii_decoding_error(&s, &p, errors,
				1840	"ordinal not in range(128)"))
				1841	goto onError;
				1842	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1843	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1844	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1845	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1846	return (PyObject *)v;
				1847
				1848	onError:
				1849	Py_XDECREF(v);
				1850	return NULL;
				1851	}
				1852
				1853	static
				1854	int ascii_encoding_error(const Py_UNICODE **source,
				1855	char **dest,
				1856	const char *errors,
				1857	const char *details)
				1858	{
				1859	if ((errors == NULL) \|\|
				1860	(strcmp(errors,"strict") == 0)) {
				1861	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1862	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1863	details);
				1864	return -1;
				1865	}
				1866	else if (strcmp(errors,"ignore") == 0) {
				1867	return 0;
				1868	}
				1869	else if (strcmp(errors,"replace") == 0) {
				1870	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1871	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1872	return 0;
				1873	}
				1874	else {
				1875	PyErr_Format(PyExc_ValueError,
				1876	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1877	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1878	errors);
				1879	return -1;
				1880	}
				1881	}
				1882
				1883	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1884	int size,
				1885	const char *errors)
				1886	{
				1887	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1888	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1889
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1890	repr = PyString_FromStringAndSize(NULL, size);
				1891	if (repr == NULL)
				1892	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1893	if (size == 0)
				1894	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1895
				1896	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1897	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1898	while (size-- > 0) {
				1899	Py_UNICODE ch = *p++;
				1900	if (ch >= 128) {
				1901	if (ascii_encoding_error(&p, &s, errors,
				1902	"ordinal not in range(128)"))
				1903	goto onError;
				1904	}
				1905	else
				1906	*s++ = (char)ch;
				1907	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1908	/* Resize if error handling skipped some characters */
				1909	if (s - start < PyString_GET_SIZE(repr))
				1910	if (_PyString_Resize(&repr, s - start))
				1911	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1912	return repr;
				1913
				1914	onError:
				1915	Py_DECREF(repr);
				1916	return NULL;
				1917	}
				1918
				1919	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1920	{
				1921	if (!PyUnicode_Check(unicode)) {
				1922	PyErr_BadArgument();
				1923	return NULL;
				1924	}
				1925	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1926	PyUnicode_GET_SIZE(unicode),
				1927	NULL);
				1928	}
				1929
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1930	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1931
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1932	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1933
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1934	PyObject PyUnicode_DecodeMBCS(const char s,
				1935	int size,
				1936	const char *errors)
				1937	{
				1938	PyUnicodeObject *v;
				1939	Py_UNICODE *p;
				1940
				1941	/* First get the size of the result */
				1942	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1943	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1944	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1945
				1946	v = _PyUnicode_New(usize);
				1947	if (v == NULL)
				1948	return NULL;
				1949	if (usize == 0)
				1950	return (PyObject *)v;
				1951	p = PyUnicode_AS_UNICODE(v);
				1952	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1953	Py_DECREF(v);
				1954	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1955	}
				1956
				1957	return (PyObject *)v;
				1958	}
				1959
				1960	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1961	int size,
				1962	const char *errors)
				1963	{
				1964	PyObject *repr;
				1965	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1966	DWORD mbcssize;
				1967
				1968	/* If there are no characters, bail now! */
				1969	if (size==0)
				1970	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1971
				1972	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1973	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1974	if (mbcssize==0)
				1975	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1976
				1977	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1978	if (repr == NULL)
				1979	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1980	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1981	return repr;
				1982
				1983	/* Do the conversion */
				1984	s = PyString_AS_STRING(repr);
				1985	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1986	Py_DECREF(repr);
				1987	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1988	}
				1989	return repr;
				1990	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1991
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1992	#endif /* MS_WIN32 */
				1993
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1994	/* --- Character Mapping Codec -------------------------------------------- */
				1995
				1996	static
				1997	int charmap_decoding_error(const char **source,
				1998	Py_UNICODE **dest,
				1999	const char *errors,
				2000	const char *details)
				2001	{
				2002	if ((errors == NULL) \|\|
				2003	(strcmp(errors,"strict") == 0)) {
				2004	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2005	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2006	details);
				2007	return -1;
				2008	}
				2009	else if (strcmp(errors,"ignore") == 0) {
				2010	return 0;
				2011	}
				2012	else if (strcmp(errors,"replace") == 0) {
				2013	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2014	(*dest)++;
				2015	return 0;
				2016	}
				2017	else {
				2018	PyErr_Format(PyExc_ValueError,
				2019	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2020	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2021	errors);
				2022	return -1;
				2023	}
				2024	}
				2025
				2026	PyObject PyUnicode_DecodeCharmap(const char s,
				2027	int size,
				2028	PyObject *mapping,
				2029	const char *errors)
				2030	{
				2031	PyUnicodeObject *v;
				2032	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2033	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2034
				2035	/* Default to Latin-1 */
				2036	if (mapping == NULL)
				2037	return PyUnicode_DecodeLatin1(s, size, errors);
				2038
				2039	v = _PyUnicode_New(size);
				2040	if (v == NULL)
				2041	goto onError;
				2042	if (size == 0)
				2043	return (PyObject *)v;
				2044	p = PyUnicode_AS_UNICODE(v);
				2045	while (size-- > 0) {
				2046	unsigned char ch = *s++;
				2047	PyObject w, x;
				2048
				2049	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2050	w = PyInt_FromLong((long)ch);
				2051	if (w == NULL)
				2052	goto onError;
				2053	x = PyObject_GetItem(mapping, w);
				2054	Py_DECREF(w);
				2055	if (x == NULL) {
				2056	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2057	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2058	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2059	x = Py_None;
				2060	Py_INCREF(x);
				2061	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2062	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2063	}
				2064
				2065	/* Apply mapping */
				2066	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2067	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2068	if (value < 0 \|\| value > 65535) {
				2069	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2070	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2071	Py_DECREF(x);
				2072	goto onError;
				2073	}
				2074	*p++ = (Py_UNICODE)value;
				2075	}
				2076	else if (x == Py_None) {
				2077	/* undefined mapping */
				2078	if (charmap_decoding_error(&s, &p, errors,
				2079	"character maps to <undefined>")) {
				2080	Py_DECREF(x);
				2081	goto onError;
				2082	}
				2083	}
				2084	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2085	int targetsize = PyUnicode_GET_SIZE(x);
				2086
				2087	if (targetsize == 1)
				2088	/* 1-1 mapping */
				2089	p++ = PyUnicode_AS_UNICODE(x);
				2090
				2091	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2092	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2093	if (targetsize > extrachars) {
				2094	/* resize first */
				2095	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2096	int needed = (targetsize - extrachars) + \
				2097	(targetsize << 2);
				2098	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2099	if (_PyUnicode_Resize(&v,
				2100	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2101	Py_DECREF(x);
				2102	goto onError;
				2103	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2104	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2105	}
				2106	Py_UNICODE_COPY(p,
				2107	PyUnicode_AS_UNICODE(x),
				2108	targetsize);
				2109	p += targetsize;
				2110	extrachars -= targetsize;
				2111	}
				2112	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2113	}
				2114	else {
				2115	/* wrong return value */
				2116	PyErr_SetString(PyExc_TypeError,
				2117	"character mapping must return integer, None or unicode");
				2118	Py_DECREF(x);
				2119	goto onError;
				2120	}
				2121	Py_DECREF(x);
				2122	}
				2123	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2124	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2125	goto onError;
				2126	return (PyObject *)v;
				2127
				2128	onError:
				2129	Py_XDECREF(v);
				2130	return NULL;
				2131	}
				2132
				2133	static
				2134	int charmap_encoding_error(const Py_UNICODE **source,
				2135	char **dest,
				2136	const char *errors,
				2137	const char *details)
				2138	{
				2139	if ((errors == NULL) \|\|
				2140	(strcmp(errors,"strict") == 0)) {
				2141	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2142	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2143	details);
				2144	return -1;
				2145	}
				2146	else if (strcmp(errors,"ignore") == 0) {
				2147	return 0;
				2148	}
				2149	else if (strcmp(errors,"replace") == 0) {
				2150	**dest = '?';
				2151	(*dest)++;
				2152	return 0;
				2153	}
				2154	else {
				2155	PyErr_Format(PyExc_ValueError,
				2156	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2157	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2158	errors);
				2159	return -1;
				2160	}
				2161	}
				2162
				2163	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2164	int size,
				2165	PyObject *mapping,
				2166	const char *errors)
				2167	{
				2168	PyObject *v;
				2169	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2170	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2171
				2172	/* Default to Latin-1 */
				2173	if (mapping == NULL)
				2174	return PyUnicode_EncodeLatin1(p, size, errors);
				2175
				2176	v = PyString_FromStringAndSize(NULL, size);
				2177	if (v == NULL)
				2178	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2179	if (size == 0)
				2180	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2181	s = PyString_AS_STRING(v);
				2182	while (size-- > 0) {
				2183	Py_UNICODE ch = *p++;
				2184	PyObject w, x;
				2185
				2186	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2187	w = PyInt_FromLong((long)ch);
				2188	if (w == NULL)
				2189	goto onError;
				2190	x = PyObject_GetItem(mapping, w);
				2191	Py_DECREF(w);
				2192	if (x == NULL) {
				2193	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2194	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2195	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2196	x = Py_None;
				2197	Py_INCREF(x);
				2198	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2199	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2200	}
				2201
				2202	/* Apply mapping */
				2203	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2204	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2205	if (value < 0 \|\| value > 255) {
				2206	PyErr_SetString(PyExc_TypeError,
				2207	"character mapping must be in range(256)");
				2208	Py_DECREF(x);
				2209	goto onError;
				2210	}
				2211	*s++ = (char)value;
				2212	}
				2213	else if (x == Py_None) {
				2214	/* undefined mapping */
				2215	if (charmap_encoding_error(&p, &s, errors,
				2216	"character maps to <undefined>")) {
				2217	Py_DECREF(x);
				2218	goto onError;
				2219	}
				2220	}
				2221	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2222	int targetsize = PyString_GET_SIZE(x);
				2223
				2224	if (targetsize == 1)
				2225	/* 1-1 mapping */
				2226	s++ = PyString_AS_STRING(x);
				2227
				2228	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2229	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2230	if (targetsize > extrachars) {
				2231	/* resize first */
				2232	int oldpos = (int)(s - PyString_AS_STRING(v));
				2233	int needed = (targetsize - extrachars) + \
				2234	(targetsize << 2);
				2235	extrachars += needed;
				2236	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2237	Py_DECREF(x);
				2238	goto onError;
				2239	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2240	s = PyString_AS_STRING(v) + oldpos;
				2241	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2242	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2243	s += targetsize;
				2244	extrachars -= targetsize;
				2245	}
				2246	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2247	}
				2248	else {
				2249	/* wrong return value */
				2250	PyErr_SetString(PyExc_TypeError,
				2251	"character mapping must return integer, None or unicode");
				2252	Py_DECREF(x);
				2253	goto onError;
				2254	}
				2255	Py_DECREF(x);
				2256	}
				2257	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2258	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2259	goto onError;
				2260	return v;
				2261
				2262	onError:
				2263	Py_DECREF(v);
				2264	return NULL;
				2265	}
				2266
				2267	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2268	PyObject *mapping)
				2269	{
				2270	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2271	PyErr_BadArgument();
				2272	return NULL;
				2273	}
				2274	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2275	PyUnicode_GET_SIZE(unicode),
				2276	mapping,
				2277	NULL);
				2278	}
				2279
				2280	static
				2281	int translate_error(const Py_UNICODE **source,
				2282	Py_UNICODE **dest,
				2283	const char *errors,
				2284	const char *details)
				2285	{
				2286	if ((errors == NULL) \|\|
				2287	(strcmp(errors,"strict") == 0)) {
				2288	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2289	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2290	details);
				2291	return -1;
				2292	}
				2293	else if (strcmp(errors,"ignore") == 0) {
				2294	return 0;
				2295	}
				2296	else if (strcmp(errors,"replace") == 0) {
				2297	**dest = '?';
				2298	(*dest)++;
				2299	return 0;
				2300	}
				2301	else {
				2302	PyErr_Format(PyExc_ValueError,
				2303	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2304	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2305	errors);
				2306	return -1;
				2307	}
				2308	}
				2309
				2310	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2311	int size,
				2312	PyObject *mapping,
				2313	const char *errors)
				2314	{
				2315	PyUnicodeObject *v;
				2316	Py_UNICODE *p;
				2317
				2318	if (mapping == NULL) {
				2319	PyErr_BadArgument();
				2320	return NULL;
				2321	}
				2322
				2323	/* Output will never be longer than input */
				2324	v = _PyUnicode_New(size);
				2325	if (v == NULL)
				2326	goto onError;
				2327	if (size == 0)
				2328	goto done;
				2329	p = PyUnicode_AS_UNICODE(v);
				2330	while (size-- > 0) {
				2331	Py_UNICODE ch = *s++;
				2332	PyObject w, x;
				2333
				2334	/* Get mapping */
				2335	w = PyInt_FromLong(ch);
				2336	if (w == NULL)
				2337	goto onError;
				2338	x = PyObject_GetItem(mapping, w);
				2339	Py_DECREF(w);
				2340	if (x == NULL) {
				2341	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2342	/* No mapping found: default to 1-1 mapping */
				2343	PyErr_Clear();
				2344	*p++ = ch;
				2345	continue;
				2346	}
				2347	goto onError;
				2348	}
				2349
				2350	/* Apply mapping */
				2351	if (PyInt_Check(x))
				2352	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2353	else if (x == Py_None) {
				2354	/* undefined mapping */
				2355	if (translate_error(&s, &p, errors,
				2356	"character maps to <undefined>")) {
				2357	Py_DECREF(x);
				2358	goto onError;
				2359	}
				2360	}
				2361	else if (PyUnicode_Check(x)) {
				2362	if (PyUnicode_GET_SIZE(x) != 1) {
				2363	/* 1-n mapping */
				2364	PyErr_SetString(PyExc_NotImplementedError,
				2365	"1-n mappings are currently not implemented");
				2366	Py_DECREF(x);
				2367	goto onError;
				2368	}
				2369	p++ = PyUnicode_AS_UNICODE(x);
				2370	}
				2371	else {
				2372	/* wrong return value */
				2373	PyErr_SetString(PyExc_TypeError,
				2374	"translate mapping must return integer, None or unicode");
				2375	Py_DECREF(x);
				2376	goto onError;
				2377	}
				2378	Py_DECREF(x);
				2379	}
				2380	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2381	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2382	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2383
				2384	done:
				2385	return (PyObject *)v;
				2386
				2387	onError:
				2388	Py_XDECREF(v);
				2389	return NULL;
				2390	}
				2391
				2392	PyObject PyUnicode_Translate(PyObject str,
				2393	PyObject *mapping,
				2394	const char *errors)
				2395	{
				2396	PyObject *result;
				2397
				2398	str = PyUnicode_FromObject(str);
				2399	if (str == NULL)
				2400	goto onError;
				2401	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2402	PyUnicode_GET_SIZE(str),
				2403	mapping,
				2404	errors);
				2405	Py_DECREF(str);
				2406	return result;
				2407
				2408	onError:
				2409	Py_XDECREF(str);
				2410	return NULL;
				2411	}
				2412
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2413	/* --- Decimal Encoder ---------------------------------------------------- */
				2414
				2415	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2416	int length,
				2417	char *output,
				2418	const char *errors)
				2419	{
				2420	Py_UNICODE p, end;
				2421
				2422	if (output == NULL) {
				2423	PyErr_BadArgument();
				2424	return -1;
				2425	}
				2426
				2427	p = s;
				2428	end = s + length;
				2429	while (p < end) {
				2430	register Py_UNICODE ch = *p++;
				2431	int decimal;
				2432
				2433	if (Py_UNICODE_ISSPACE(ch)) {
				2434	*output++ = ' ';
				2435	continue;
				2436	}
				2437	decimal = Py_UNICODE_TODECIMAL(ch);
				2438	if (decimal >= 0) {
				2439	*output++ = '0' + decimal;
				2440	continue;
				2441	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2442	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2443	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2444	continue;
				2445	}
				2446	/* All other characters are considered invalid */
				2447	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2448	PyErr_SetString(PyExc_ValueError,
				2449	"invalid decimal Unicode string");
				2450	goto onError;
				2451	}
				2452	else if (strcmp(errors, "ignore") == 0)
				2453	continue;
				2454	else if (strcmp(errors, "replace") == 0) {
				2455	*output++ = '?';
				2456	continue;
				2457	}
				2458	}
				2459	/* 0-terminate the output string */
				2460	*output++ = '\0';
				2461	return 0;
				2462
				2463	onError:
				2464	return -1;
				2465	}
				2466
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2467	/* --- Helpers ------------------------------------------------------------ */
				2468
				2469	static
				2470	int count(PyUnicodeObject *self,
				2471	int start,
				2472	int end,
				2473	PyUnicodeObject *substring)
				2474	{
				2475	int count = 0;
				2476
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2477	if (start < 0)
				2478	start += self->length;
				2479	if (start < 0)
				2480	start = 0;
				2481	if (end > self->length)
				2482	end = self->length;
				2483	if (end < 0)
				2484	end += self->length;
				2485	if (end < 0)
				2486	end = 0;
				2487
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2488	if (substring->length == 0)
				2489	return (end - start + 1);
				2490
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2491	end -= substring->length;
				2492
				2493	while (start <= end)
				2494	if (Py_UNICODE_MATCH(self, start, substring)) {
				2495	count++;
				2496	start += substring->length;
				2497	} else
				2498	start++;
				2499
				2500	return count;
				2501	}
				2502
				2503	int PyUnicode_Count(PyObject *str,
				2504	PyObject *substr,
				2505	int start,
				2506	int end)
				2507	{
				2508	int result;
				2509
				2510	str = PyUnicode_FromObject(str);
				2511	if (str == NULL)
				2512	return -1;
				2513	substr = PyUnicode_FromObject(substr);
				2514	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2515	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2516	return -1;
				2517	}
				2518
				2519	result = count((PyUnicodeObject *)str,
				2520	start, end,
				2521	(PyUnicodeObject *)substr);
				2522
				2523	Py_DECREF(str);
				2524	Py_DECREF(substr);
				2525	return result;
				2526	}
				2527
				2528	static
				2529	int findstring(PyUnicodeObject *self,
				2530	PyUnicodeObject *substring,
				2531	int start,
				2532	int end,
				2533	int direction)
				2534	{
				2535	if (start < 0)
				2536	start += self->length;
				2537	if (start < 0)
				2538	start = 0;
				2539
				2540	if (substring->length == 0)
				2541	return start;
				2542
				2543	if (end > self->length)
				2544	end = self->length;
				2545	if (end < 0)
				2546	end += self->length;
				2547	if (end < 0)
				2548	end = 0;
				2549
				2550	end -= substring->length;
				2551
				2552	if (direction < 0) {
				2553	for (; end >= start; end--)
				2554	if (Py_UNICODE_MATCH(self, end, substring))
				2555	return end;
				2556	} else {
				2557	for (; start <= end; start++)
				2558	if (Py_UNICODE_MATCH(self, start, substring))
				2559	return start;
				2560	}
				2561
				2562	return -1;
				2563	}
				2564
				2565	int PyUnicode_Find(PyObject *str,
				2566	PyObject *substr,
				2567	int start,
				2568	int end,
				2569	int direction)
				2570	{
				2571	int result;
				2572
				2573	str = PyUnicode_FromObject(str);
				2574	if (str == NULL)
				2575	return -1;
				2576	substr = PyUnicode_FromObject(substr);
				2577	if (substr == NULL) {
				2578	Py_DECREF(substr);
				2579	return -1;
				2580	}
				2581
				2582	result = findstring((PyUnicodeObject *)str,
				2583	(PyUnicodeObject *)substr,
				2584	start, end, direction);
				2585	Py_DECREF(str);
				2586	Py_DECREF(substr);
				2587	return result;
				2588	}
				2589
				2590	static
				2591	int tailmatch(PyUnicodeObject *self,
				2592	PyUnicodeObject *substring,
				2593	int start,
				2594	int end,
				2595	int direction)
				2596	{
				2597	if (start < 0)
				2598	start += self->length;
				2599	if (start < 0)
				2600	start = 0;
				2601
				2602	if (substring->length == 0)
				2603	return 1;
				2604
				2605	if (end > self->length)
				2606	end = self->length;
				2607	if (end < 0)
				2608	end += self->length;
				2609	if (end < 0)
				2610	end = 0;
				2611
				2612	end -= substring->length;
				2613	if (end < start)
				2614	return 0;
				2615
				2616	if (direction > 0) {
				2617	if (Py_UNICODE_MATCH(self, end, substring))
				2618	return 1;
				2619	} else {
				2620	if (Py_UNICODE_MATCH(self, start, substring))
				2621	return 1;
				2622	}
				2623
				2624	return 0;
				2625	}
				2626
				2627	int PyUnicode_Tailmatch(PyObject *str,
				2628	PyObject *substr,
				2629	int start,
				2630	int end,
				2631	int direction)
				2632	{
				2633	int result;
				2634
				2635	str = PyUnicode_FromObject(str);
				2636	if (str == NULL)
				2637	return -1;
				2638	substr = PyUnicode_FromObject(substr);
				2639	if (substr == NULL) {
				2640	Py_DECREF(substr);
				2641	return -1;
				2642	}
				2643
				2644	result = tailmatch((PyUnicodeObject *)str,
				2645	(PyUnicodeObject *)substr,
				2646	start, end, direction);
				2647	Py_DECREF(str);
				2648	Py_DECREF(substr);
				2649	return result;
				2650	}
				2651
				2652	static
				2653	const Py_UNICODE findchar(const Py_UNICODE s,
				2654	int size,
				2655	Py_UNICODE ch)
				2656	{
				2657	/* like wcschr, but doesn't stop at NULL characters */
				2658
				2659	while (size-- > 0) {
				2660	if (*s == ch)
				2661	return s;
				2662	s++;
				2663	}
				2664
				2665	return NULL;
				2666	}
				2667
				2668	/* Apply fixfct filter to the Unicode object self and return a
				2669	reference to the modified object */
				2670
				2671	static
				2672	PyObject fixup(PyUnicodeObject self,
				2673	int (fixfct)(PyUnicodeObject s))
				2674	{
				2675
				2676	PyUnicodeObject *u;
				2677
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2678	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2679	if (u == NULL)
				2680	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2681
				2682	Py_UNICODE_COPY(u->str, self->str, self->length);
				2683
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2684	if (!fixfct(u)) {
				2685	/* fixfct should return TRUE if it modified the buffer. If
				2686	FALSE, return a reference to the original buffer instead
				2687	(to save space, not time) */
				2688	Py_INCREF(self);
				2689	Py_DECREF(u);
				2690	return (PyObject*) self;
				2691	}
				2692	return (PyObject*) u;
				2693	}
				2694
				2695	static
				2696	int fixupper(PyUnicodeObject *self)
				2697	{
				2698	int len = self->length;
				2699	Py_UNICODE *s = self->str;
				2700	int status = 0;
				2701
				2702	while (len-- > 0) {
				2703	register Py_UNICODE ch;
				2704
				2705	ch = Py_UNICODE_TOUPPER(*s);
				2706	if (ch != *s) {
				2707	status = 1;
				2708	*s = ch;
				2709	}
				2710	s++;
				2711	}
				2712
				2713	return status;
				2714	}
				2715
				2716	static
				2717	int fixlower(PyUnicodeObject *self)
				2718	{
				2719	int len = self->length;
				2720	Py_UNICODE *s = self->str;
				2721	int status = 0;
				2722
				2723	while (len-- > 0) {
				2724	register Py_UNICODE ch;
				2725
				2726	ch = Py_UNICODE_TOLOWER(*s);
				2727	if (ch != *s) {
				2728	status = 1;
				2729	*s = ch;
				2730	}
				2731	s++;
				2732	}
				2733
				2734	return status;
				2735	}
				2736
				2737	static
				2738	int fixswapcase(PyUnicodeObject *self)
				2739	{
				2740	int len = self->length;
				2741	Py_UNICODE *s = self->str;
				2742	int status = 0;
				2743
				2744	while (len-- > 0) {
				2745	if (Py_UNICODE_ISUPPER(*s)) {
				2746	s = Py_UNICODE_TOLOWER(s);
				2747	status = 1;
				2748	} else if (Py_UNICODE_ISLOWER(*s)) {
				2749	s = Py_UNICODE_TOUPPER(s);
				2750	status = 1;
				2751	}
				2752	s++;
				2753	}
				2754
				2755	return status;
				2756	}
				2757
				2758	static
				2759	int fixcapitalize(PyUnicodeObject *self)
				2760	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2761	int len = self->length;
				2762	Py_UNICODE *s = self->str;
				2763	int status = 0;
				2764
				2765	if (len == 0)
				2766	return 0;
				2767	if (Py_UNICODE_ISLOWER(*s)) {
				2768	s = Py_UNICODE_TOUPPER(s);
				2769	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2770	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2771	s++;
				2772	while (--len > 0) {
				2773	if (Py_UNICODE_ISUPPER(*s)) {
				2774	s = Py_UNICODE_TOLOWER(s);
				2775	status = 1;
				2776	}
				2777	s++;
				2778	}
				2779	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2780	}
				2781
				2782	static
				2783	int fixtitle(PyUnicodeObject *self)
				2784	{
				2785	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2786	register Py_UNICODE *e;
				2787	int previous_is_cased;
				2788
				2789	/* Shortcut for single character strings */
				2790	if (PyUnicode_GET_SIZE(self) == 1) {
				2791	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2792	if (*p != ch) {
				2793	*p = ch;
				2794	return 1;
				2795	}
				2796	else
				2797	return 0;
				2798	}
				2799
				2800	e = p + PyUnicode_GET_SIZE(self);
				2801	previous_is_cased = 0;
				2802	for (; p < e; p++) {
				2803	register const Py_UNICODE ch = *p;
				2804
				2805	if (previous_is_cased)
				2806	*p = Py_UNICODE_TOLOWER(ch);
				2807	else
				2808	*p = Py_UNICODE_TOTITLE(ch);
				2809
				2810	if (Py_UNICODE_ISLOWER(ch) \|\|
				2811	Py_UNICODE_ISUPPER(ch) \|\|
				2812	Py_UNICODE_ISTITLE(ch))
				2813	previous_is_cased = 1;
				2814	else
				2815	previous_is_cased = 0;
				2816	}
				2817	return 1;
				2818	}
				2819
				2820	PyObject PyUnicode_Join(PyObject separator,
				2821	PyObject *seq)
				2822	{
				2823	Py_UNICODE *sep;
				2824	int seplen;
				2825	PyUnicodeObject *res = NULL;
				2826	int reslen = 0;
				2827	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2828	int sz = 100;
				2829	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2830	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2831
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2832	it = PyObject_GetIter(seq);
				2833	if (it == NULL)
				2834	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2835
				2836	if (separator == NULL) {
				2837	Py_UNICODE blank = ' ';
				2838	sep = &blank;
				2839	seplen = 1;
				2840	}
				2841	else {
				2842	separator = PyUnicode_FromObject(separator);
				2843	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2844	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2845	sep = PyUnicode_AS_UNICODE(separator);
				2846	seplen = PyUnicode_GET_SIZE(separator);
				2847	}
				2848
				2849	res = _PyUnicode_New(sz);
				2850	if (res == NULL)
				2851	goto onError;
				2852	p = PyUnicode_AS_UNICODE(res);
				2853	reslen = 0;
				2854
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2855	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2856	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2857	PyObject *item = PyIter_Next(it);
				2858	if (item == NULL) {
				2859	if (PyErr_Occurred())
				2860	goto onError;
				2861	break;
				2862	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2863	if (!PyUnicode_Check(item)) {
				2864	PyObject *v;
				2865	v = PyUnicode_FromObject(item);
				2866	Py_DECREF(item);
				2867	item = v;
				2868	if (item == NULL)
				2869	goto onError;
				2870	}
				2871	itemlen = PyUnicode_GET_SIZE(item);
				2872	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2873	if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2874	goto onError;
				2875	sz *= 2;
				2876	p = PyUnicode_AS_UNICODE(res) + reslen;
				2877	}
				2878	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2879	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2880	p += seplen;
				2881	reslen += seplen;
				2882	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2883	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2884	p += itemlen;
				2885	reslen += itemlen;
				2886	Py_DECREF(item);
				2887	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2888	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2889	goto onError;
				2890
				2891	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2892	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2893	return (PyObject *)res;
				2894
				2895	onError:
				2896	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2897	Py_XDECREF(res);
				2898	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2899	return NULL;
				2900	}
				2901
				2902	static
				2903	PyUnicodeObject pad(PyUnicodeObject self,
				2904	int left,
				2905	int right,
				2906	Py_UNICODE fill)
				2907	{
				2908	PyUnicodeObject *u;
				2909
				2910	if (left < 0)
				2911	left = 0;
				2912	if (right < 0)
				2913	right = 0;
				2914
				2915	if (left == 0 && right == 0) {
				2916	Py_INCREF(self);
				2917	return self;
				2918	}
				2919
				2920	u = _PyUnicode_New(left + self->length + right);
				2921	if (u) {
				2922	if (left)
				2923	Py_UNICODE_FILL(u->str, fill, left);
				2924	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2925	if (right)
				2926	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2927	}
				2928
				2929	return u;
				2930	}
				2931
				2932	#define SPLIT_APPEND(data, left, right) \
				2933	str = PyUnicode_FromUnicode(data + left, right - left); \
				2934	if (!str) \
				2935	goto onError; \
				2936	if (PyList_Append(list, str)) { \
				2937	Py_DECREF(str); \
				2938	goto onError; \
				2939	} \
				2940	else \
				2941	Py_DECREF(str);
				2942
				2943	static
				2944	PyObject split_whitespace(PyUnicodeObject self,
				2945	PyObject *list,
				2946	int maxcount)
				2947	{
				2948	register int i;
				2949	register int j;
				2950	int len = self->length;
				2951	PyObject *str;
				2952
				2953	for (i = j = 0; i < len; ) {
				2954	/* find a token */
				2955	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2956	i++;
				2957	j = i;
				2958	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2959	i++;
				2960	if (j < i) {
				2961	if (maxcount-- <= 0)
				2962	break;
				2963	SPLIT_APPEND(self->str, j, i);
				2964	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2965	i++;
				2966	j = i;
				2967	}
				2968	}
				2969	if (j < len) {
				2970	SPLIT_APPEND(self->str, j, len);
				2971	}
				2972	return list;
				2973
				2974	onError:
				2975	Py_DECREF(list);
				2976	return NULL;
				2977	}
				2978
				2979	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2980	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2981	{
				2982	register int i;
				2983	register int j;
				2984	int len;
				2985	PyObject *list;
				2986	PyObject *str;
				2987	Py_UNICODE *data;
				2988
				2989	string = PyUnicode_FromObject(string);
				2990	if (string == NULL)
				2991	return NULL;
				2992	data = PyUnicode_AS_UNICODE(string);
				2993	len = PyUnicode_GET_SIZE(string);
				2994
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2995	list = PyList_New(0);
				2996	if (!list)
				2997	goto onError;
				2998
				2999	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3000	int eol;
				3001
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3002	/* Find a line and append it */
				3003	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3004	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3005
				3006	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3007	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3008	if (i < len) {
				3009	if (data[i] == '\r' && i + 1 < len &&
				3010	data[i+1] == '\n')
				3011	i += 2;
				3012	else
				3013	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3014	if (keepends)
				3015	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3016	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3017	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3018	j = i;
				3019	}
				3020	if (j < len) {
				3021	SPLIT_APPEND(data, j, len);
				3022	}
				3023
				3024	Py_DECREF(string);
				3025	return list;
				3026
				3027	onError:
				3028	Py_DECREF(list);
				3029	Py_DECREF(string);
				3030	return NULL;
				3031	}
				3032
				3033	static
				3034	PyObject split_char(PyUnicodeObject self,
				3035	PyObject *list,
				3036	Py_UNICODE ch,
				3037	int maxcount)
				3038	{
				3039	register int i;
				3040	register int j;
				3041	int len = self->length;
				3042	PyObject *str;
				3043
				3044	for (i = j = 0; i < len; ) {
				3045	if (self->str[i] == ch) {
				3046	if (maxcount-- <= 0)
				3047	break;
				3048	SPLIT_APPEND(self->str, j, i);
				3049	i = j = i + 1;
				3050	} else
				3051	i++;
				3052	}
				3053	if (j <= len) {
				3054	SPLIT_APPEND(self->str, j, len);
				3055	}
				3056	return list;
				3057
				3058	onError:
				3059	Py_DECREF(list);
				3060	return NULL;
				3061	}
				3062
				3063	static
				3064	PyObject split_substring(PyUnicodeObject self,
				3065	PyObject *list,
				3066	PyUnicodeObject *substring,
				3067	int maxcount)
				3068	{
				3069	register int i;
				3070	register int j;
				3071	int len = self->length;
				3072	int sublen = substring->length;
				3073	PyObject *str;
				3074
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3075	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3076	if (Py_UNICODE_MATCH(self, i, substring)) {
				3077	if (maxcount-- <= 0)
				3078	break;
				3079	SPLIT_APPEND(self->str, j, i);
				3080	i = j = i + sublen;
				3081	} else
				3082	i++;
				3083	}
				3084	if (j <= len) {
				3085	SPLIT_APPEND(self->str, j, len);
				3086	}
				3087	return list;
				3088
				3089	onError:
				3090	Py_DECREF(list);
				3091	return NULL;
				3092	}
				3093
				3094	#undef SPLIT_APPEND
				3095
				3096	static
				3097	PyObject split(PyUnicodeObject self,
				3098	PyUnicodeObject *substring,
				3099	int maxcount)
				3100	{
				3101	PyObject *list;
				3102
				3103	if (maxcount < 0)
				3104	maxcount = INT_MAX;
				3105
				3106	list = PyList_New(0);
				3107	if (!list)
				3108	return NULL;
				3109
				3110	if (substring == NULL)
				3111	return split_whitespace(self,list,maxcount);
				3112
				3113	else if (substring->length == 1)
				3114	return split_char(self,list,substring->str[0],maxcount);
				3115
				3116	else if (substring->length == 0) {
				3117	Py_DECREF(list);
				3118	PyErr_SetString(PyExc_ValueError, "empty separator");
				3119	return NULL;
				3120	}
				3121	else
				3122	return split_substring(self,list,substring,maxcount);
				3123	}
				3124
				3125	static
				3126	PyObject strip(PyUnicodeObject self,
				3127	int left,
				3128	int right)
				3129	{
				3130	Py_UNICODE *p = self->str;
				3131	int start = 0;
				3132	int end = self->length;
				3133
				3134	if (left)
				3135	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3136	start++;
				3137
				3138	if (right)
				3139	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3140	end--;
				3141
				3142	if (start == 0 && end == self->length) {
				3143	/* couldn't strip anything off, return original string */
				3144	Py_INCREF(self);
				3145	return (PyObject*) self;
				3146	}
				3147
				3148	return (PyObject*) PyUnicode_FromUnicode(
				3149	self->str + start,
				3150	end - start
				3151	);
				3152	}
				3153
				3154	static
				3155	PyObject replace(PyUnicodeObject self,
				3156	PyUnicodeObject *str1,
				3157	PyUnicodeObject *str2,
				3158	int maxcount)
				3159	{
				3160	PyUnicodeObject *u;
				3161
				3162	if (maxcount < 0)
				3163	maxcount = INT_MAX;
				3164
				3165	if (str1->length == 1 && str2->length == 1) {
				3166	int i;
				3167
				3168	/* replace characters */
				3169	if (!findchar(self->str, self->length, str1->str[0])) {
				3170	/* nothing to replace, return original string */
				3171	Py_INCREF(self);
				3172	u = self;
				3173	} else {
				3174	Py_UNICODE u1 = str1->str[0];
				3175	Py_UNICODE u2 = str2->str[0];
				3176
				3177	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3178	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3179	self->length
				3180	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3181	if (u != NULL) {
				3182	Py_UNICODE_COPY(u->str, self->str,
				3183	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3184	for (i = 0; i < u->length; i++)
				3185	if (u->str[i] == u1) {
				3186	if (--maxcount < 0)
				3187	break;
				3188	u->str[i] = u2;
				3189	}
				3190	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3191	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3192
				3193	} else {
				3194	int n, i;
				3195	Py_UNICODE *p;
				3196
				3197	/* replace strings */
				3198	n = count(self, 0, self->length, str1);
				3199	if (n > maxcount)
				3200	n = maxcount;
				3201	if (n == 0) {
				3202	/* nothing to replace, return original string */
				3203	Py_INCREF(self);
				3204	u = self;
				3205	} else {
				3206	u = _PyUnicode_New(
				3207	self->length + n * (str2->length - str1->length));
				3208	if (u) {
				3209	i = 0;
				3210	p = u->str;
				3211	while (i <= self->length - str1->length)
				3212	if (Py_UNICODE_MATCH(self, i, str1)) {
				3213	/* replace string segment */
				3214	Py_UNICODE_COPY(p, str2->str, str2->length);
				3215	p += str2->length;
				3216	i += str1->length;
				3217	if (--n <= 0) {
				3218	/* copy remaining part */
				3219	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3220	break;
				3221	}
				3222	} else
				3223	*p++ = self->str[i++];
				3224	}
				3225	}
				3226	}
				3227
				3228	return (PyObject *) u;
				3229	}
				3230
				3231	/* --- Unicode Object Methods --------------------------------------------- */
				3232
				3233	static char title__doc__[] =
				3234	"S.title() -> unicode\n\
				3235	\n\
				3236	Return a titlecased version of S, i.e. words start with title case\n\
				3237	characters, all remaining cased characters have lower case.";
				3238
				3239	static PyObject*
				3240	unicode_title(PyUnicodeObject self, PyObject args)
				3241	{
				3242	if (!PyArg_NoArgs(args))
				3243	return NULL;
				3244	return fixup(self, fixtitle);
				3245	}
				3246
				3247	static char capitalize__doc__[] =
				3248	"S.capitalize() -> unicode\n\
				3249	\n\
				3250	Return a capitalized version of S, i.e. make the first character\n\
				3251	have upper case.";
				3252
				3253	static PyObject*
				3254	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3255	{
				3256	if (!PyArg_NoArgs(args))
				3257	return NULL;
				3258	return fixup(self, fixcapitalize);
				3259	}
				3260
				3261	#if 0
				3262	static char capwords__doc__[] =
				3263	"S.capwords() -> unicode\n\
				3264	\n\
				3265	Apply .capitalize() to all words in S and return the result with\n\
				3266	normalized whitespace (all whitespace strings are replaced by ' ').";
				3267
				3268	static PyObject*
				3269	unicode_capwords(PyUnicodeObject self, PyObject args)
				3270	{
				3271	PyObject *list;
				3272	PyObject *item;
				3273	int i;
				3274
				3275	if (!PyArg_NoArgs(args))
				3276	return NULL;
				3277
				3278	/* Split into words */
				3279	list = split(self, NULL, -1);
				3280	if (!list)
				3281	return NULL;
				3282
				3283	/* Capitalize each word */
				3284	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3285	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3286	fixcapitalize);
				3287	if (item == NULL)
				3288	goto onError;
				3289	Py_DECREF(PyList_GET_ITEM(list, i));
				3290	PyList_SET_ITEM(list, i, item);
				3291	}
				3292
				3293	/* Join the words to form a new string */
				3294	item = PyUnicode_Join(NULL, list);
				3295
				3296	onError:
				3297	Py_DECREF(list);
				3298	return (PyObject *)item;
				3299	}
				3300	#endif
				3301
				3302	static char center__doc__[] =
				3303	"S.center(width) -> unicode\n\
				3304	\n\
				3305	Return S centered in a Unicode string of length width. Padding is done\n\
				3306	using spaces.";
				3307
				3308	static PyObject *
				3309	unicode_center(PyUnicodeObject self, PyObject args)
				3310	{
				3311	int marg, left;
				3312	int width;
				3313
				3314	if (!PyArg_ParseTuple(args, "i:center", &width))
				3315	return NULL;
				3316
				3317	if (self->length >= width) {
				3318	Py_INCREF(self);
				3319	return (PyObject*) self;
				3320	}
				3321
				3322	marg = width - self->length;
				3323	left = marg / 2 + (marg & width & 1);
				3324
				3325	return (PyObject*) pad(self, left, marg - left, ' ');
				3326	}
				3327
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3328	#if 0
				3329
				3330	/* This code should go into some future Unicode collation support
				3331	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3332	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3333
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3334	/* speedy UTF-16 code point order comparison */
				3335	/* gleaned from: */
				3336	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3337
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3338	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3339	{
				3340	0, 0, 0, 0, 0, 0, 0, 0,
				3341	0, 0, 0, 0, 0, 0, 0, 0,
				3342	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3343	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3344	};
				3345
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3346	static int
				3347	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3348	{
				3349	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3350
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3351	Py_UNICODE *s1 = str1->str;
				3352	Py_UNICODE *s2 = str2->str;
				3353
				3354	len1 = str1->length;
				3355	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3356
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3357	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3358	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3359
				3360	c1 = *s1++;
				3361	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3362
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3363	if (c1 > (1<<11) * 26)
				3364	c1 += utf16Fixup[c1>>11];
				3365	if (c2 > (1<<11) * 26)
				3366	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3367	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3368
				3369	if (c1 != c2)
				3370	return (c1 < c2) ? -1 : 1;
				3371
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3372	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3373	}
				3374
				3375	return (len1 < len2) ? -1 : (len1 != len2);
				3376	}
				3377
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3378	#else
				3379
				3380	static int
				3381	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3382	{
				3383	register int len1, len2;
				3384
				3385	Py_UNICODE *s1 = str1->str;
				3386	Py_UNICODE *s2 = str2->str;
				3387
				3388	len1 = str1->length;
				3389	len2 = str2->length;
				3390
				3391	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3392	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3393
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3394	c1 = *s1++;
				3395	c2 = *s2++;
				3396
				3397	if (c1 != c2)
				3398	return (c1 < c2) ? -1 : 1;
				3399
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3400	len1--; len2--;
				3401	}
				3402
				3403	return (len1 < len2) ? -1 : (len1 != len2);
				3404	}
				3405
				3406	#endif
				3407
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3408	int PyUnicode_Compare(PyObject *left,
				3409	PyObject *right)
				3410	{
				3411	PyUnicodeObject u = NULL, v = NULL;
				3412	int result;
				3413
				3414	/* Coerce the two arguments */
				3415	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3416	if (u == NULL)
				3417	goto onError;
				3418	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3419	if (v == NULL)
				3420	goto onError;
				3421
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3422	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3423	if (v == u) {
				3424	Py_DECREF(u);
				3425	Py_DECREF(v);
				3426	return 0;
				3427	}
				3428
				3429	result = unicode_compare(u, v);
				3430
				3431	Py_DECREF(u);
				3432	Py_DECREF(v);
				3433	return result;
				3434
				3435	onError:
				3436	Py_XDECREF(u);
				3437	Py_XDECREF(v);
				3438	return -1;
				3439	}
				3440
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3441	int PyUnicode_Contains(PyObject *container,
				3442	PyObject *element)
				3443	{
				3444	PyUnicodeObject u = NULL, v = NULL;
				3445	int result;
				3446	register const Py_UNICODE p, e;
				3447	register Py_UNICODE ch;
				3448
				3449	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3450	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3451	if (v == NULL) {
				3452	PyErr_SetString(PyExc_TypeError,
				3453	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3454	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3455	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3456	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3457	if (u == NULL) {
				3458	Py_DECREF(v);
				3459	goto onError;
				3460	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3461
				3462	/* Check v in u */
				3463	if (PyUnicode_GET_SIZE(v) != 1) {
				3464	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3465	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3466	goto onError;
				3467	}
				3468	ch = *PyUnicode_AS_UNICODE(v);
				3469	p = PyUnicode_AS_UNICODE(u);
				3470	e = p + PyUnicode_GET_SIZE(u);
				3471	result = 0;
				3472	while (p < e) {
				3473	if (*p++ == ch) {
				3474	result = 1;
				3475	break;
				3476	}
				3477	}
				3478
				3479	Py_DECREF(u);
				3480	Py_DECREF(v);
				3481	return result;
				3482
				3483	onError:
				3484	Py_XDECREF(u);
				3485	Py_XDECREF(v);
				3486	return -1;
				3487	}
				3488
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3489	/* Concat to string or Unicode object giving a new Unicode object. */
				3490
				3491	PyObject PyUnicode_Concat(PyObject left,
				3492	PyObject *right)
				3493	{
				3494	PyUnicodeObject u = NULL, v = NULL, *w;
				3495
				3496	/* Coerce the two arguments */
				3497	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3498	if (u == NULL)
				3499	goto onError;
				3500	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3501	if (v == NULL)
				3502	goto onError;
				3503
				3504	/* Shortcuts */
				3505	if (v == unicode_empty) {
				3506	Py_DECREF(v);
				3507	return (PyObject *)u;
				3508	}
				3509	if (u == unicode_empty) {
				3510	Py_DECREF(u);
				3511	return (PyObject *)v;
				3512	}
				3513
				3514	/* Concat the two Unicode strings */
				3515	w = _PyUnicode_New(u->length + v->length);
				3516	if (w == NULL)
				3517	goto onError;
				3518	Py_UNICODE_COPY(w->str, u->str, u->length);
				3519	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3520
				3521	Py_DECREF(u);
				3522	Py_DECREF(v);
				3523	return (PyObject *)w;
				3524
				3525	onError:
				3526	Py_XDECREF(u);
				3527	Py_XDECREF(v);
				3528	return NULL;
				3529	}
				3530
				3531	static char count__doc__[] =
				3532	"S.count(sub[, start[, end]]) -> int\n\
				3533	\n\
				3534	Return the number of occurrences of substring sub in Unicode string\n\
				3535	S[start:end]. Optional arguments start and end are\n\
				3536	interpreted as in slice notation.";
				3537
				3538	static PyObject *
				3539	unicode_count(PyUnicodeObject self, PyObject args)
				3540	{
				3541	PyUnicodeObject *substring;
				3542	int start = 0;
				3543	int end = INT_MAX;
				3544	PyObject *result;
				3545
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3546	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3547	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3548	return NULL;
				3549
				3550	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3551	(PyObject *)substring);
				3552	if (substring == NULL)
				3553	return NULL;
				3554
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3555	if (start < 0)
				3556	start += self->length;
				3557	if (start < 0)
				3558	start = 0;
				3559	if (end > self->length)
				3560	end = self->length;
				3561	if (end < 0)
				3562	end += self->length;
				3563	if (end < 0)
				3564	end = 0;
				3565
				3566	result = PyInt_FromLong((long) count(self, start, end, substring));
				3567
				3568	Py_DECREF(substring);
				3569	return result;
				3570	}
				3571
				3572	static char encode__doc__[] =
				3573	"S.encode([encoding[,errors]]) -> string\n\
				3574	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3575	Return an encoded string version of S. Default encoding is the current\n\
				3576	default string encoding. errors may be given to set a different error\n\
				3577	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3578	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3579
				3580	static PyObject *
				3581	unicode_encode(PyUnicodeObject self, PyObject args)
				3582	{
				3583	char *encoding = NULL;
				3584	char *errors = NULL;
				3585	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3586	return NULL;
				3587	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3588	}
				3589
				3590	static char expandtabs__doc__[] =
				3591	"S.expandtabs([tabsize]) -> unicode\n\
				3592	\n\
				3593	Return a copy of S where all tab characters are expanded using spaces.\n\
				3594	If tabsize is not given, a tab size of 8 characters is assumed.";
				3595
				3596	static PyObject*
				3597	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3598	{
				3599	Py_UNICODE *e;
				3600	Py_UNICODE *p;
				3601	Py_UNICODE *q;
				3602	int i, j;
				3603	PyUnicodeObject *u;
				3604	int tabsize = 8;
				3605
				3606	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3607	return NULL;
				3608
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3609	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3610	i = j = 0;
				3611	e = self->str + self->length;
				3612	for (p = self->str; p < e; p++)
				3613	if (*p == '\t') {
				3614	if (tabsize > 0)
				3615	j += tabsize - (j % tabsize);
				3616	}
				3617	else {
				3618	j++;
				3619	if (p == '\n' \|\| p == '\r') {
				3620	i += j;
				3621	j = 0;
				3622	}
				3623	}
				3624
				3625	/* Second pass: create output string and fill it */
				3626	u = _PyUnicode_New(i + j);
				3627	if (!u)
				3628	return NULL;
				3629
				3630	j = 0;
				3631	q = u->str;
				3632
				3633	for (p = self->str; p < e; p++)
				3634	if (*p == '\t') {
				3635	if (tabsize > 0) {
				3636	i = tabsize - (j % tabsize);
				3637	j += i;
				3638	while (i--)
				3639	*q++ = ' ';
				3640	}
				3641	}
				3642	else {
				3643	j++;
				3644	q++ = p;
				3645	if (p == '\n' \|\| p == '\r')
				3646	j = 0;
				3647	}
				3648
				3649	return (PyObject*) u;
				3650	}
				3651
				3652	static char find__doc__[] =
				3653	"S.find(sub [,start [,end]]) -> int\n\
				3654	\n\
				3655	Return the lowest index in S where substring sub is found,\n\
				3656	such that sub is contained within s[start,end]. Optional\n\
				3657	arguments start and end are interpreted as in slice notation.\n\
				3658	\n\
				3659	Return -1 on failure.";
				3660
				3661	static PyObject *
				3662	unicode_find(PyUnicodeObject self, PyObject args)
				3663	{
				3664	PyUnicodeObject *substring;
				3665	int start = 0;
				3666	int end = INT_MAX;
				3667	PyObject *result;
				3668
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3669	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3670	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3671	return NULL;
				3672	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3673	(PyObject *)substring);
				3674	if (substring == NULL)
				3675	return NULL;
				3676
				3677	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3678
				3679	Py_DECREF(substring);
				3680	return result;
				3681	}
				3682
				3683	static PyObject *
				3684	unicode_getitem(PyUnicodeObject *self, int index)
				3685	{
				3686	if (index < 0 \|\| index >= self->length) {
				3687	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3688	return NULL;
				3689	}
				3690
				3691	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3692	}
				3693
				3694	static long
				3695	unicode_hash(PyUnicodeObject *self)
				3696	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3697	/* Since Unicode objects compare equal to their ASCII string
				3698	counterparts, they should use the individual character values
				3699	as basis for their hash value. This is needed to assure that
				3700	strings and Unicode objects behave in the same way as
				3701	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3702
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3703	register int len;
				3704	register Py_UNICODE *p;
				3705	register long x;
				3706
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3707	if (self->hash != -1)
				3708	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3709	len = PyUnicode_GET_SIZE(self);
				3710	p = PyUnicode_AS_UNICODE(self);
				3711	x = *p << 7;
				3712	while (--len >= 0)
				3713	x = (1000003x) ^ p++;
				3714	x ^= PyUnicode_GET_SIZE(self);
				3715	if (x == -1)
				3716	x = -2;
				3717	self->hash = x;
				3718	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3719	}
				3720
				3721	static char index__doc__[] =
				3722	"S.index(sub [,start [,end]]) -> int\n\
				3723	\n\
				3724	Like S.find() but raise ValueError when the substring is not found.";
				3725
				3726	static PyObject *
				3727	unicode_index(PyUnicodeObject self, PyObject args)
				3728	{
				3729	int result;
				3730	PyUnicodeObject *substring;
				3731	int start = 0;
				3732	int end = INT_MAX;
				3733
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3734	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3735	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3736	return NULL;
				3737
				3738	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3739	(PyObject *)substring);
				3740	if (substring == NULL)
				3741	return NULL;
				3742
				3743	result = findstring(self, substring, start, end, 1);
				3744
				3745	Py_DECREF(substring);
				3746	if (result < 0) {
				3747	PyErr_SetString(PyExc_ValueError, "substring not found");
				3748	return NULL;
				3749	}
				3750	return PyInt_FromLong(result);
				3751	}
				3752
				3753	static char islower__doc__[] =
				3754	"S.islower() -> int\n\
				3755	\n\
				3756	Return 1 if all cased characters in S are lowercase and there is\n\
				3757	at least one cased character in S, 0 otherwise.";
				3758
				3759	static PyObject*
				3760	unicode_islower(PyUnicodeObject self, PyObject args)
				3761	{
				3762	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3763	register const Py_UNICODE *e;
				3764	int cased;
				3765
				3766	if (!PyArg_NoArgs(args))
				3767	return NULL;
				3768
				3769	/* Shortcut for single character strings */
				3770	if (PyUnicode_GET_SIZE(self) == 1)
				3771	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3772
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3773	/* Special case for empty strings */
				3774	if (PyString_GET_SIZE(self) == 0)
				3775	return PyInt_FromLong(0);
				3776
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3777	e = p + PyUnicode_GET_SIZE(self);
				3778	cased = 0;
				3779	for (; p < e; p++) {
				3780	register const Py_UNICODE ch = *p;
				3781
				3782	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3783	return PyInt_FromLong(0);
				3784	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3785	cased = 1;
				3786	}
				3787	return PyInt_FromLong(cased);
				3788	}
				3789
				3790	static char isupper__doc__[] =
				3791	"S.isupper() -> int\n\
				3792	\n\
				3793	Return 1 if all cased characters in S are uppercase and there is\n\
				3794	at least one cased character in S, 0 otherwise.";
				3795
				3796	static PyObject*
				3797	unicode_isupper(PyUnicodeObject self, PyObject args)
				3798	{
				3799	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3800	register const Py_UNICODE *e;
				3801	int cased;
				3802
				3803	if (!PyArg_NoArgs(args))
				3804	return NULL;
				3805
				3806	/* Shortcut for single character strings */
				3807	if (PyUnicode_GET_SIZE(self) == 1)
				3808	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3809
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3810	/* Special case for empty strings */
				3811	if (PyString_GET_SIZE(self) == 0)
				3812	return PyInt_FromLong(0);
				3813
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3814	e = p + PyUnicode_GET_SIZE(self);
				3815	cased = 0;
				3816	for (; p < e; p++) {
				3817	register const Py_UNICODE ch = *p;
				3818
				3819	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3820	return PyInt_FromLong(0);
				3821	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3822	cased = 1;
				3823	}
				3824	return PyInt_FromLong(cased);
				3825	}
				3826
				3827	static char istitle__doc__[] =
				3828	"S.istitle() -> int\n\
				3829	\n\
				3830	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3831	may only follow uncased characters and lowercase characters only cased\n\
				3832	ones. Return 0 otherwise.";
				3833
				3834	static PyObject*
				3835	unicode_istitle(PyUnicodeObject self, PyObject args)
				3836	{
				3837	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3838	register const Py_UNICODE *e;
				3839	int cased, previous_is_cased;
				3840
				3841	if (!PyArg_NoArgs(args))
				3842	return NULL;
				3843
				3844	/* Shortcut for single character strings */
				3845	if (PyUnicode_GET_SIZE(self) == 1)
				3846	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3847	(Py_UNICODE_ISUPPER(*p) != 0));
				3848
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3849	/* Special case for empty strings */
				3850	if (PyString_GET_SIZE(self) == 0)
				3851	return PyInt_FromLong(0);
				3852
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3853	e = p + PyUnicode_GET_SIZE(self);
				3854	cased = 0;
				3855	previous_is_cased = 0;
				3856	for (; p < e; p++) {
				3857	register const Py_UNICODE ch = *p;
				3858
				3859	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3860	if (previous_is_cased)
				3861	return PyInt_FromLong(0);
				3862	previous_is_cased = 1;
				3863	cased = 1;
				3864	}
				3865	else if (Py_UNICODE_ISLOWER(ch)) {
				3866	if (!previous_is_cased)
				3867	return PyInt_FromLong(0);
				3868	previous_is_cased = 1;
				3869	cased = 1;
				3870	}
				3871	else
				3872	previous_is_cased = 0;
				3873	}
				3874	return PyInt_FromLong(cased);
				3875	}
				3876
				3877	static char isspace__doc__[] =
				3878	"S.isspace() -> int\n\
				3879	\n\
				3880	Return 1 if there are only whitespace characters in S,\n\
				3881	0 otherwise.";
				3882
				3883	static PyObject*
				3884	unicode_isspace(PyUnicodeObject self, PyObject args)
				3885	{
				3886	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3887	register const Py_UNICODE *e;
				3888
				3889	if (!PyArg_NoArgs(args))
				3890	return NULL;
				3891
				3892	/* Shortcut for single character strings */
				3893	if (PyUnicode_GET_SIZE(self) == 1 &&
				3894	Py_UNICODE_ISSPACE(*p))
				3895	return PyInt_FromLong(1);
				3896
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3897	/* Special case for empty strings */
				3898	if (PyString_GET_SIZE(self) == 0)
				3899	return PyInt_FromLong(0);
				3900
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3901	e = p + PyUnicode_GET_SIZE(self);
				3902	for (; p < e; p++) {
				3903	if (!Py_UNICODE_ISSPACE(*p))
				3904	return PyInt_FromLong(0);
				3905	}
				3906	return PyInt_FromLong(1);
				3907	}
				3908
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3909	static char isalpha__doc__[] =
				3910	"S.isalpha() -> int\n\
				3911	\n\
				3912	Return 1 if all characters in S are alphabetic\n\
				3913	and there is at least one character in S, 0 otherwise.";
				3914
				3915	static PyObject*
				3916	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3917	{
				3918	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3919	register const Py_UNICODE *e;
				3920
				3921	if (!PyArg_NoArgs(args))
				3922	return NULL;
				3923
				3924	/* Shortcut for single character strings */
				3925	if (PyUnicode_GET_SIZE(self) == 1 &&
				3926	Py_UNICODE_ISALPHA(*p))
				3927	return PyInt_FromLong(1);
				3928
				3929	/* Special case for empty strings */
				3930	if (PyString_GET_SIZE(self) == 0)
				3931	return PyInt_FromLong(0);
				3932
				3933	e = p + PyUnicode_GET_SIZE(self);
				3934	for (; p < e; p++) {
				3935	if (!Py_UNICODE_ISALPHA(*p))
				3936	return PyInt_FromLong(0);
				3937	}
				3938	return PyInt_FromLong(1);
				3939	}
				3940
				3941	static char isalnum__doc__[] =
				3942	"S.isalnum() -> int\n\
				3943	\n\
				3944	Return 1 if all characters in S are alphanumeric\n\
				3945	and there is at least one character in S, 0 otherwise.";
				3946
				3947	static PyObject*
				3948	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3949	{
				3950	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3951	register const Py_UNICODE *e;
				3952
				3953	if (!PyArg_NoArgs(args))
				3954	return NULL;
				3955
				3956	/* Shortcut for single character strings */
				3957	if (PyUnicode_GET_SIZE(self) == 1 &&
				3958	Py_UNICODE_ISALNUM(*p))
				3959	return PyInt_FromLong(1);
				3960
				3961	/* Special case for empty strings */
				3962	if (PyString_GET_SIZE(self) == 0)
				3963	return PyInt_FromLong(0);
				3964
				3965	e = p + PyUnicode_GET_SIZE(self);
				3966	for (; p < e; p++) {
				3967	if (!Py_UNICODE_ISALNUM(*p))
				3968	return PyInt_FromLong(0);
				3969	}
				3970	return PyInt_FromLong(1);
				3971	}
				3972
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3973	static char isdecimal__doc__[] =
				3974	"S.isdecimal() -> int\n\
				3975	\n\
				3976	Return 1 if there are only decimal characters in S,\n\
				3977	0 otherwise.";
				3978
				3979	static PyObject*
				3980	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3981	{
				3982	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3983	register const Py_UNICODE *e;
				3984
				3985	if (!PyArg_NoArgs(args))
				3986	return NULL;
				3987
				3988	/* Shortcut for single character strings */
				3989	if (PyUnicode_GET_SIZE(self) == 1 &&
				3990	Py_UNICODE_ISDECIMAL(*p))
				3991	return PyInt_FromLong(1);
				3992
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3993	/* Special case for empty strings */
				3994	if (PyString_GET_SIZE(self) == 0)
				3995	return PyInt_FromLong(0);
				3996
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3997	e = p + PyUnicode_GET_SIZE(self);
				3998	for (; p < e; p++) {
				3999	if (!Py_UNICODE_ISDECIMAL(*p))
				4000	return PyInt_FromLong(0);
				4001	}
				4002	return PyInt_FromLong(1);
				4003	}
				4004
				4005	static char isdigit__doc__[] =
				4006	"S.isdigit() -> int\n\
				4007	\n\
				4008	Return 1 if there are only digit characters in S,\n\
				4009	0 otherwise.";
				4010
				4011	static PyObject*
				4012	unicode_isdigit(PyUnicodeObject self, PyObject args)
				4013	{
				4014	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4015	register const Py_UNICODE *e;
				4016
				4017	if (!PyArg_NoArgs(args))
				4018	return NULL;
				4019
				4020	/* Shortcut for single character strings */
				4021	if (PyUnicode_GET_SIZE(self) == 1 &&
				4022	Py_UNICODE_ISDIGIT(*p))
				4023	return PyInt_FromLong(1);
				4024
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4025	/* Special case for empty strings */
				4026	if (PyString_GET_SIZE(self) == 0)
				4027	return PyInt_FromLong(0);
				4028
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4029	e = p + PyUnicode_GET_SIZE(self);
				4030	for (; p < e; p++) {
				4031	if (!Py_UNICODE_ISDIGIT(*p))
				4032	return PyInt_FromLong(0);
				4033	}
				4034	return PyInt_FromLong(1);
				4035	}
				4036
				4037	static char isnumeric__doc__[] =
				4038	"S.isnumeric() -> int\n\
				4039	\n\
				4040	Return 1 if there are only numeric characters in S,\n\
				4041	0 otherwise.";
				4042
				4043	static PyObject*
				4044	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				4045	{
				4046	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4047	register const Py_UNICODE *e;
				4048
				4049	if (!PyArg_NoArgs(args))
				4050	return NULL;
				4051
				4052	/* Shortcut for single character strings */
				4053	if (PyUnicode_GET_SIZE(self) == 1 &&
				4054	Py_UNICODE_ISNUMERIC(*p))
				4055	return PyInt_FromLong(1);
				4056
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4057	/* Special case for empty strings */
				4058	if (PyString_GET_SIZE(self) == 0)
				4059	return PyInt_FromLong(0);
				4060
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4061	e = p + PyUnicode_GET_SIZE(self);
				4062	for (; p < e; p++) {
				4063	if (!Py_UNICODE_ISNUMERIC(*p))
				4064	return PyInt_FromLong(0);
				4065	}
				4066	return PyInt_FromLong(1);
				4067	}
				4068
				4069	static char join__doc__[] =
				4070	"S.join(sequence) -> unicode\n\
				4071	\n\
				4072	Return a string which is the concatenation of the strings in the\n\
				4073	sequence. The separator between elements is S.";
				4074
				4075	static PyObject*
				4076	unicode_join(PyUnicodeObject self, PyObject args)
				4077	{
				4078	PyObject *data;
				4079	if (!PyArg_ParseTuple(args, "O:join", &data))
				4080	return NULL;
				4081
				4082	return PyUnicode_Join((PyObject *)self, data);
				4083	}
				4084
				4085	static int
				4086	unicode_length(PyUnicodeObject *self)
				4087	{
				4088	return self->length;
				4089	}
				4090
				4091	static char ljust__doc__[] =
				4092	"S.ljust(width) -> unicode\n\
				4093	\n\
				4094	Return S left justified in a Unicode string of length width. Padding is\n\
				4095	done using spaces.";
				4096
				4097	static PyObject *
				4098	unicode_ljust(PyUnicodeObject self, PyObject args)
				4099	{
				4100	int width;
				4101	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4102	return NULL;
				4103
				4104	if (self->length >= width) {
				4105	Py_INCREF(self);
				4106	return (PyObject*) self;
				4107	}
				4108
				4109	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4110	}
				4111
				4112	static char lower__doc__[] =
				4113	"S.lower() -> unicode\n\
				4114	\n\
				4115	Return a copy of the string S converted to lowercase.";
				4116
				4117	static PyObject*
				4118	unicode_lower(PyUnicodeObject self, PyObject args)
				4119	{
				4120	if (!PyArg_NoArgs(args))
				4121	return NULL;
				4122	return fixup(self, fixlower);
				4123	}
				4124
				4125	static char lstrip__doc__[] =
				4126	"S.lstrip() -> unicode\n\
				4127	\n\
				4128	Return a copy of the string S with leading whitespace removed.";
				4129
				4130	static PyObject *
				4131	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4132	{
				4133	if (!PyArg_NoArgs(args))
				4134	return NULL;
				4135	return strip(self, 1, 0);
				4136	}
				4137
				4138	static PyObject*
				4139	unicode_repeat(PyUnicodeObject *str, int len)
				4140	{
				4141	PyUnicodeObject *u;
				4142	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4143	int nchars;
				4144	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4145
				4146	if (len < 0)
				4147	len = 0;
				4148
				4149	if (len == 1) {
				4150	/* no repeat, return original string */
				4151	Py_INCREF(str);
				4152	return (PyObject*) str;
				4153	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4154
				4155	/* ensure # of chars needed doesn't overflow int and # of bytes
				4156	* needed doesn't overflow size_t
				4157	*/
				4158	nchars = len * str->length;
				4159	if (len && nchars / len != str->length) {
				4160	PyErr_SetString(PyExc_OverflowError,
				4161	"repeated string is too long");
				4162	return NULL;
				4163	}
				4164	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4165	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4166	PyErr_SetString(PyExc_OverflowError,
				4167	"repeated string is too long");
				4168	return NULL;
				4169	}
				4170	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4171	if (!u)
				4172	return NULL;
				4173
				4174	p = u->str;
				4175
				4176	while (len-- > 0) {
				4177	Py_UNICODE_COPY(p, str->str, str->length);
				4178	p += str->length;
				4179	}
				4180
				4181	return (PyObject*) u;
				4182	}
				4183
				4184	PyObject PyUnicode_Replace(PyObject obj,
				4185	PyObject *subobj,
				4186	PyObject *replobj,
				4187	int maxcount)
				4188	{
				4189	PyObject *self;
				4190	PyObject *str1;
				4191	PyObject *str2;
				4192	PyObject *result;
				4193
				4194	self = PyUnicode_FromObject(obj);
				4195	if (self == NULL)
				4196	return NULL;
				4197	str1 = PyUnicode_FromObject(subobj);
				4198	if (str1 == NULL) {
				4199	Py_DECREF(self);
				4200	return NULL;
				4201	}
				4202	str2 = PyUnicode_FromObject(replobj);
				4203	if (str2 == NULL) {
				4204	Py_DECREF(self);
				4205	Py_DECREF(str1);
				4206	return NULL;
				4207	}
				4208	result = replace((PyUnicodeObject *)self,
				4209	(PyUnicodeObject *)str1,
				4210	(PyUnicodeObject *)str2,
				4211	maxcount);
				4212	Py_DECREF(self);
				4213	Py_DECREF(str1);
				4214	Py_DECREF(str2);
				4215	return result;
				4216	}
				4217
				4218	static char replace__doc__[] =
				4219	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4220	\n\
				4221	Return a copy of S with all occurrences of substring\n\
				4222	old replaced by new. If the optional argument maxsplit is\n\
				4223	given, only the first maxsplit occurrences are replaced.";
				4224
				4225	static PyObject*
				4226	unicode_replace(PyUnicodeObject self, PyObject args)
				4227	{
				4228	PyUnicodeObject *str1;
				4229	PyUnicodeObject *str2;
				4230	int maxcount = -1;
				4231	PyObject *result;
				4232
				4233	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4234	return NULL;
				4235	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4236	if (str1 == NULL)
				4237	return NULL;
				4238	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4239	if (str2 == NULL)
				4240	return NULL;
				4241
				4242	result = replace(self, str1, str2, maxcount);
				4243
				4244	Py_DECREF(str1);
				4245	Py_DECREF(str2);
				4246	return result;
				4247	}
				4248
				4249	static
				4250	PyObject unicode_repr(PyObject unicode)
				4251	{
				4252	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4253	PyUnicode_GET_SIZE(unicode),
				4254	1);
				4255	}
				4256
				4257	static char rfind__doc__[] =
				4258	"S.rfind(sub [,start [,end]]) -> int\n\
				4259	\n\
				4260	Return the highest index in S where substring sub is found,\n\
				4261	such that sub is contained within s[start,end]. Optional\n\
				4262	arguments start and end are interpreted as in slice notation.\n\
				4263	\n\
				4264	Return -1 on failure.";
				4265
				4266	static PyObject *
				4267	unicode_rfind(PyUnicodeObject self, PyObject args)
				4268	{
				4269	PyUnicodeObject *substring;
				4270	int start = 0;
				4271	int end = INT_MAX;
				4272	PyObject *result;
				4273
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4274	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4275	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4276	return NULL;
				4277	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4278	(PyObject *)substring);
				4279	if (substring == NULL)
				4280	return NULL;
				4281
				4282	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4283
				4284	Py_DECREF(substring);
				4285	return result;
				4286	}
				4287
				4288	static char rindex__doc__[] =
				4289	"S.rindex(sub [,start [,end]]) -> int\n\
				4290	\n\
				4291	Like S.rfind() but raise ValueError when the substring is not found.";
				4292
				4293	static PyObject *
				4294	unicode_rindex(PyUnicodeObject self, PyObject args)
				4295	{
				4296	int result;
				4297	PyUnicodeObject *substring;
				4298	int start = 0;
				4299	int end = INT_MAX;
				4300
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4301	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4302	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4303	return NULL;
				4304	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4305	(PyObject *)substring);
				4306	if (substring == NULL)
				4307	return NULL;
				4308
				4309	result = findstring(self, substring, start, end, -1);
				4310
				4311	Py_DECREF(substring);
				4312	if (result < 0) {
				4313	PyErr_SetString(PyExc_ValueError, "substring not found");
				4314	return NULL;
				4315	}
				4316	return PyInt_FromLong(result);
				4317	}
				4318
				4319	static char rjust__doc__[] =
				4320	"S.rjust(width) -> unicode\n\
				4321	\n\
				4322	Return S right justified in a Unicode string of length width. Padding is\n\
				4323	done using spaces.";
				4324
				4325	static PyObject *
				4326	unicode_rjust(PyUnicodeObject self, PyObject args)
				4327	{
				4328	int width;
				4329	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4330	return NULL;
				4331
				4332	if (self->length >= width) {
				4333	Py_INCREF(self);
				4334	return (PyObject*) self;
				4335	}
				4336
				4337	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4338	}
				4339
				4340	static char rstrip__doc__[] =
				4341	"S.rstrip() -> unicode\n\
				4342	\n\
				4343	Return a copy of the string S with trailing whitespace removed.";
				4344
				4345	static PyObject *
				4346	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4347	{
				4348	if (!PyArg_NoArgs(args))
				4349	return NULL;
				4350	return strip(self, 0, 1);
				4351	}
				4352
				4353	static PyObject*
				4354	unicode_slice(PyUnicodeObject *self, int start, int end)
				4355	{
				4356	/* standard clamping */
				4357	if (start < 0)
				4358	start = 0;
				4359	if (end < 0)
				4360	end = 0;
				4361	if (end > self->length)
				4362	end = self->length;
				4363	if (start == 0 && end == self->length) {
				4364	/* full slice, return original string */
				4365	Py_INCREF(self);
				4366	return (PyObject*) self;
				4367	}
				4368	if (start > end)
				4369	start = end;
				4370	/* copy slice */
				4371	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4372	end - start);
				4373	}
				4374
				4375	PyObject PyUnicode_Split(PyObject s,
				4376	PyObject *sep,
				4377	int maxsplit)
				4378	{
				4379	PyObject *result;
				4380
				4381	s = PyUnicode_FromObject(s);
				4382	if (s == NULL)
				4383	return NULL;
				4384	if (sep != NULL) {
				4385	sep = PyUnicode_FromObject(sep);
				4386	if (sep == NULL) {
				4387	Py_DECREF(s);
				4388	return NULL;
				4389	}
				4390	}
				4391
				4392	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4393
				4394	Py_DECREF(s);
				4395	Py_XDECREF(sep);
				4396	return result;
				4397	}
				4398
				4399	static char split__doc__[] =
				4400	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4401	\n\
				4402	Return a list of the words in S, using sep as the\n\
				4403	delimiter string. If maxsplit is given, at most maxsplit\n\
				4404	splits are done. If sep is not specified, any whitespace string\n\
				4405	is a separator.";
				4406
				4407	static PyObject*
				4408	unicode_split(PyUnicodeObject self, PyObject args)
				4409	{
				4410	PyObject *substring = Py_None;
				4411	int maxcount = -1;
				4412
				4413	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4414	return NULL;
				4415
				4416	if (substring == Py_None)
				4417	return split(self, NULL, maxcount);
				4418	else if (PyUnicode_Check(substring))
				4419	return split(self, (PyUnicodeObject *)substring, maxcount);
				4420	else
				4421	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4422	}
				4423
				4424	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4425	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4426	\n\
				4427	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4428	Line breaks are not included in the resulting list unless keepends\n\
				4429	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4430
				4431	static PyObject*
				4432	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4433	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4434	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4435
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4436	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4437	return NULL;
				4438
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4439	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4440	}
				4441
				4442	static
				4443	PyObject unicode_str(PyUnicodeObject self)
				4444	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4445	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4446	}
				4447
				4448	static char strip__doc__[] =
				4449	"S.strip() -> unicode\n\
				4450	\n\
				4451	Return a copy of S with leading and trailing whitespace removed.";
				4452
				4453	static PyObject *
				4454	unicode_strip(PyUnicodeObject self, PyObject args)
				4455	{
				4456	if (!PyArg_NoArgs(args))
				4457	return NULL;
				4458	return strip(self, 1, 1);
				4459	}
				4460
				4461	static char swapcase__doc__[] =
				4462	"S.swapcase() -> unicode\n\
				4463	\n\
				4464	Return a copy of S with uppercase characters converted to lowercase\n\
				4465	and vice versa.";
				4466
				4467	static PyObject*
				4468	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4469	{
				4470	if (!PyArg_NoArgs(args))
				4471	return NULL;
				4472	return fixup(self, fixswapcase);
				4473	}
				4474
				4475	static char translate__doc__[] =
				4476	"S.translate(table) -> unicode\n\
				4477	\n\
				4478	Return a copy of the string S, where all characters have been mapped\n\
				4479	through the given translation table, which must be a mapping of\n\
				4480	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4481	are left untouched. Characters mapped to None are deleted.";
				4482
				4483	static PyObject*
				4484	unicode_translate(PyUnicodeObject self, PyObject args)
				4485	{
				4486	PyObject *table;
				4487
				4488	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4489	return NULL;
				4490	return PyUnicode_TranslateCharmap(self->str,
				4491	self->length,
				4492	table,
				4493	"ignore");
				4494	}
				4495
				4496	static char upper__doc__[] =
				4497	"S.upper() -> unicode\n\
				4498	\n\
				4499	Return a copy of S converted to uppercase.";
				4500
				4501	static PyObject*
				4502	unicode_upper(PyUnicodeObject self, PyObject args)
				4503	{
				4504	if (!PyArg_NoArgs(args))
				4505	return NULL;
				4506	return fixup(self, fixupper);
				4507	}
				4508
				4509	#if 0
				4510	static char zfill__doc__[] =
				4511	"S.zfill(width) -> unicode\n\
				4512	\n\
				4513	Pad a numeric string x with zeros on the left, to fill a field\n\
				4514	of the specified width. The string x is never truncated.";
				4515
				4516	static PyObject *
				4517	unicode_zfill(PyUnicodeObject self, PyObject args)
				4518	{
				4519	int fill;
				4520	PyUnicodeObject *u;
				4521
				4522	int width;
				4523	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4524	return NULL;
				4525
				4526	if (self->length >= width) {
				4527	Py_INCREF(self);
				4528	return (PyObject*) self;
				4529	}
				4530
				4531	fill = width - self->length;
				4532
				4533	u = pad(self, fill, 0, '0');
				4534
				4535	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4536	/* move sign to beginning of string */
				4537	u->str[0] = u->str[fill];
				4538	u->str[fill] = '0';
				4539	}
				4540
				4541	return (PyObject*) u;
				4542	}
				4543	#endif
				4544
				4545	#if 0
				4546	static PyObject*
				4547	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4548	{
				4549	if (!PyArg_NoArgs(args))
				4550	return NULL;
				4551	return PyInt_FromLong(unicode_freelist_size);
				4552	}
				4553	#endif
				4554
				4555	static char startswith__doc__[] =
				4556	"S.startswith(prefix[, start[, end]]) -> int\n\
				4557	\n\
				4558	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4559	optional start, test S beginning at that position. With optional end, stop\n\
				4560	comparing S at that position.";
				4561
				4562	static PyObject *
				4563	unicode_startswith(PyUnicodeObject *self,
				4564	PyObject *args)
				4565	{
				4566	PyUnicodeObject *substring;
				4567	int start = 0;
				4568	int end = INT_MAX;
				4569	PyObject *result;
				4570
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4571	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4572	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4573	return NULL;
				4574	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4575	(PyObject *)substring);
				4576	if (substring == NULL)
				4577	return NULL;
				4578
				4579	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4580
				4581	Py_DECREF(substring);
				4582	return result;
				4583	}
				4584
				4585
				4586	static char endswith__doc__[] =
				4587	"S.endswith(suffix[, start[, end]]) -> int\n\
				4588	\n\
				4589	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4590	optional start, test S beginning at that position. With optional end, stop\n\
				4591	comparing S at that position.";
				4592
				4593	static PyObject *
				4594	unicode_endswith(PyUnicodeObject *self,
				4595	PyObject *args)
				4596	{
				4597	PyUnicodeObject *substring;
				4598	int start = 0;
				4599	int end = INT_MAX;
				4600	PyObject *result;
				4601
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4602	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4603	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4604	return NULL;
				4605	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4606	(PyObject *)substring);
				4607	if (substring == NULL)
				4608	return NULL;
				4609
				4610	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4611
				4612	Py_DECREF(substring);
				4613	return result;
				4614	}
				4615
				4616
/* Method table of the unicode type.  Each entry gives the Python-level
   method name, the C handler, a calling-convention flag and the
   docstring array.  For the handlers visible in this part of the file,
   flag 1 goes with handlers that unpack their arguments with
   PyArg_ParseTuple() and flag 0 with handlers that call
   PyArg_NoArgs().  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
    /* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* zfill and capwords are compiled out together with their handlers */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
				4669
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4670	static PySequenceMethods unicode_as_sequence = {
				4671	(inquiry) unicode_length, /* sq_length */
				4672	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4673	(intargfunc) unicode_repeat, /* sq_repeat */
				4674	(intargfunc) unicode_getitem, /* sq_item */
				4675	(intintargfunc) unicode_slice, /* sq_slice */
				4676	0, /* sq_ass_item */
				4677	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4678	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4679	};
				4680
				4681	static int
				4682	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4683	int index,
				4684	const void **ptr)
				4685	{
				4686	if (index != 0) {
				4687	PyErr_SetString(PyExc_SystemError,
				4688	"accessing non-existent unicode segment");
				4689	return -1;
				4690	}
				4691	ptr = (void ) self->str;
				4692	return PyUnicode_GET_DATA_SIZE(self);
				4693	}
				4694
				4695	static int
				4696	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4697	const void **ptr)
				4698	{
				4699	PyErr_SetString(PyExc_TypeError,
				4700	"cannot use unicode as modifyable buffer");
				4701	return -1;
				4702	}
				4703
				4704	static int
				4705	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4706	int *lenp)
				4707	{
				4708	if (lenp)
				4709	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4710	return 1;
				4711	}
				4712
				4713	static int
				4714	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4715	int index,
				4716	const void **ptr)
				4717	{
				4718	PyObject *str;
				4719
				4720	if (index != 0) {
				4721	PyErr_SetString(PyExc_SystemError,
				4722	"accessing non-existent unicode segment");
				4723	return -1;
				4724	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4725	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4726	if (str == NULL)
				4727	return -1;
				4728	ptr = (void ) PyString_AS_STRING(str);
				4729	return PyString_GET_SIZE(str);
				4730	}
				4731
				4732	/* Helpers for PyUnicode_Format() */
				4733
				4734	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4735	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4736	{
				4737	int argidx = *p_argidx;
				4738	if (argidx < arglen) {
				4739	(*p_argidx)++;
				4740	if (arglen < 0)
				4741	return args;
				4742	else
				4743	return PyTuple_GetItem(args, argidx);
				4744	}
				4745	PyErr_SetString(PyExc_TypeError,
				4746	"not enough arguments for format string");
				4747	return NULL;
				4748	}
				4749
				4750	#define F_LJUST (1<<0)
				4751	#define F_SIGN (1<<1)
				4752	#define F_BLANK (1<<2)
				4753	#define F_ALT (1<<3)
				4754	#define F_ZERO (1<<4)
				4755
				4756	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4757	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4758	{
				4759	register int i;
				4760	int len;
				4761	va_list va;
				4762	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4763	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4764
				4765	/* First, format the string as char array, then expand to Py_UNICODE
				4766	array. */
				4767	charbuffer = (char *)buffer;
				4768	len = vsprintf(charbuffer, format, va);
				4769	for (i = len - 1; i >= 0; i--)
				4770	buffer[i] = (Py_UNICODE) charbuffer[i];
				4771
				4772	va_end(va);
				4773	return len;
				4774	}
				4775
				4776	static int
				4777	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4778	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4779	int flags,
				4780	int prec,
				4781	int type,
				4782	PyObject *v)
				4783	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4784	/* fmt = '%#.' + `prec` + `type`
				4785	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4786	char fmt[20];
				4787	double x;
				4788
				4789	x = PyFloat_AsDouble(v);
				4790	if (x == -1.0 && PyErr_Occurred())
				4791	return -1;
				4792	if (prec < 0)
				4793	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4794	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4795	type = 'g';
				4796	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4797	/* worst case length calc to ensure no buffer overrun:
				4798	fmt = %#.<prec>g
				4799	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4800	for any double rep.)
				4801	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4802	If prec=0 the effective precision is 1 (the leading digit is
				4803	always given), therefore increase by one to 10+prec. */
				4804	if (buflen <= (size_t)10 + (size_t)prec) {
				4805	PyErr_SetString(PyExc_OverflowError,
				4806	"formatted float is too long (precision too long?)");
				4807	return -1;
				4808	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4809	return usprintf(buf, fmt, x);
				4810	}
				4811
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4812	static PyObject*
				4813	formatlong(PyObject *val, int flags, int prec, int type)
				4814	{
				4815	char *buf;
				4816	int i, len;
				4817	PyObject str; / temporary string object. */
				4818	PyUnicodeObject *result;
				4819
				4820	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4821	if (!str)
				4822	return NULL;
				4823	result = _PyUnicode_New(len);
				4824	for (i = 0; i < len; i++)
				4825	result->str[i] = buf[i];
				4826	result->str[len] = 0;
				4827	Py_DECREF(str);
				4828	return (PyObject*)result;
				4829	}
				4830
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4831	static int
				4832	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4833	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4834	int flags,
				4835	int prec,
				4836	int type,
				4837	PyObject *v)
				4838	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4839	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4840	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4841	+ 1 + 1 = 24*/
				4842	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4843	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4844	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4845
				4846	x = PyInt_AsLong(v);
				4847	if (x == -1 && PyErr_Occurred())
				4848	return -1;
				4849	if (prec < 0)
				4850	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4851	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4852	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4853	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4854	PyErr_SetString(PyExc_OverflowError,
				4855	"formatted integer is too long (precision too long?)");
				4856	return -1;
				4857	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4858	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				4859	* but we want it (for consistency with other %#x conversions, and
				4860	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4861	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				4862	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				4863	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4864	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4865	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				4866	/* Only way to know what the platform does is to try it. */
				4867	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				4868	if (fmt[1] != (char)type) {
				4869	/* Supply our own leading 0x/0X -- needed under std C */
				4870	use_native_c_format = 0;
				4871	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				4872	}
				4873	}
				4874	if (use_native_c_format)
				4875	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4876	return usprintf(buf, fmt, x);
				4877	}
				4878
				4879	static int
				4880	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4881	size_t buflen,
				4882	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4883	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4884	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4885	if (PyUnicode_Check(v)) {
				4886	if (PyUnicode_GET_SIZE(v) != 1)
				4887	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4888	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4889	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4890
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4891	else if (PyString_Check(v)) {
				4892	if (PyString_GET_SIZE(v) != 1)
				4893	goto onError;
				4894	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4895	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4896
				4897	else {
				4898	/* Integer input truncated to a character */
				4899	long x;
				4900	x = PyInt_AsLong(v);
				4901	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4902	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4903	buf[0] = (char) x;
				4904	}
				4905	buf[1] = '\0';
				4906	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4907
				4908	onError:
				4909	PyErr_SetString(PyExc_TypeError,
				4910	"%c requires int or char");
				4911	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4912	}
				4913
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size, in Py_UNICODE elements, of the scratch
   buffer that the float, int and char formatters write into.
   XXX It is a magic number.  Every formatting routine checks its
   worst-case output length against it, so nothing overflows, but
   allocating a buffer sized for each individual format might be the
   better design.  The fixed size is adequate for now.
*/
#define FORMATBUFLEN ((size_t)120)
				4923
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4924	PyObject PyUnicode_Format(PyObject format,
				4925	PyObject *args)
				4926	{
				4927	Py_UNICODE fmt, res;
				4928	int fmtcnt, rescnt, reslen, arglen, argidx;
				4929	int args_owned = 0;
				4930	PyUnicodeObject *result = NULL;
				4931	PyObject *dict = NULL;
				4932	PyObject *uformat;
				4933
				4934	if (format == NULL \|\| args == NULL) {
				4935	PyErr_BadInternalCall();
				4936	return NULL;
				4937	}
				4938	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4939	if (uformat == NULL)
				4940	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4941	fmt = PyUnicode_AS_UNICODE(uformat);
				4942	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4943
				4944	reslen = rescnt = fmtcnt + 100;
				4945	result = _PyUnicode_New(reslen);
				4946	if (result == NULL)
				4947	goto onError;
				4948	res = PyUnicode_AS_UNICODE(result);
				4949
				4950	if (PyTuple_Check(args)) {
				4951	arglen = PyTuple_Size(args);
				4952	argidx = 0;
				4953	}
				4954	else {
				4955	arglen = -1;
				4956	argidx = -2;
				4957	}
				4958	if (args->ob_type->tp_as_mapping)
				4959	dict = args;
				4960
				4961	while (--fmtcnt >= 0) {
				4962	if (*fmt != '%') {
				4963	if (--rescnt < 0) {
				4964	rescnt = fmtcnt + 100;
				4965	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	4966	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4967	return NULL;
				4968	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4969	--rescnt;
				4970	}
				4971	res++ = fmt++;
				4972	}
				4973	else {
				4974	/* Got a format specifier */
				4975	int flags = 0;
				4976	int width = -1;
				4977	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4978	Py_UNICODE c = '\0';
				4979	Py_UNICODE fill;
				4980	PyObject *v = NULL;
				4981	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4982	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4983	Py_UNICODE sign;
				4984	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4985	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4986
				4987	fmt++;
				4988	if (*fmt == '(') {
				4989	Py_UNICODE *keystart;
				4990	int keylen;
				4991	PyObject *key;
				4992	int pcount = 1;
				4993
				4994	if (dict == NULL) {
				4995	PyErr_SetString(PyExc_TypeError,
				4996	"format requires a mapping");
				4997	goto onError;
				4998	}
				4999	++fmt;
				5000	--fmtcnt;
				5001	keystart = fmt;
				5002	/* Skip over balanced parentheses */
				5003	while (pcount > 0 && --fmtcnt >= 0) {
				5004	if (*fmt == ')')
				5005	--pcount;
				5006	else if (*fmt == '(')
				5007	++pcount;
				5008	fmt++;
				5009	}
				5010	keylen = fmt - keystart - 1;
				5011	if (fmtcnt < 0 \|\| pcount > 0) {
				5012	PyErr_SetString(PyExc_ValueError,
				5013	"incomplete format key");
				5014	goto onError;
				5015	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5016	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5017	then looked up since Python uses strings to hold
				5018	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5019	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5020	key = PyUnicode_EncodeUTF8(keystart,
				5021	keylen,
				5022	NULL);
				5023	if (key == NULL)
				5024	goto onError;
				5025	if (args_owned) {
				5026	Py_DECREF(args);
				5027	args_owned = 0;
				5028	}
				5029	args = PyObject_GetItem(dict, key);
				5030	Py_DECREF(key);
				5031	if (args == NULL) {
				5032	goto onError;
				5033	}
				5034	args_owned = 1;
				5035	arglen = -1;
				5036	argidx = -2;
				5037	}
				5038	while (--fmtcnt >= 0) {
				5039	switch (c = *fmt++) {
				5040	case '-': flags \|= F_LJUST; continue;
				5041	case '+': flags \|= F_SIGN; continue;
				5042	case ' ': flags \|= F_BLANK; continue;
				5043	case '#': flags \|= F_ALT; continue;
				5044	case '0': flags \|= F_ZERO; continue;
				5045	}
				5046	break;
				5047	}
				5048	if (c == '*') {
				5049	v = getnextarg(args, arglen, &argidx);
				5050	if (v == NULL)
				5051	goto onError;
				5052	if (!PyInt_Check(v)) {
				5053	PyErr_SetString(PyExc_TypeError,
				5054	"* wants int");
				5055	goto onError;
				5056	}
				5057	width = PyInt_AsLong(v);
				5058	if (width < 0) {
				5059	flags \|= F_LJUST;
				5060	width = -width;
				5061	}
				5062	if (--fmtcnt >= 0)
				5063	c = *fmt++;
				5064	}
				5065	else if (c >= '0' && c <= '9') {
				5066	width = c - '0';
				5067	while (--fmtcnt >= 0) {
				5068	c = *fmt++;
				5069	if (c < '0' \|\| c > '9')
				5070	break;
				5071	if ((width*10) / 10 != width) {
				5072	PyErr_SetString(PyExc_ValueError,
				5073	"width too big");
				5074	goto onError;
				5075	}
				5076	width = width*10 + (c - '0');
				5077	}
				5078	}
				5079	if (c == '.') {
				5080	prec = 0;
				5081	if (--fmtcnt >= 0)
				5082	c = *fmt++;
				5083	if (c == '*') {
				5084	v = getnextarg(args, arglen, &argidx);
				5085	if (v == NULL)
				5086	goto onError;
				5087	if (!PyInt_Check(v)) {
				5088	PyErr_SetString(PyExc_TypeError,
				5089	"* wants int");
				5090	goto onError;
				5091	}
				5092	prec = PyInt_AsLong(v);
				5093	if (prec < 0)
				5094	prec = 0;
				5095	if (--fmtcnt >= 0)
				5096	c = *fmt++;
				5097	}
				5098	else if (c >= '0' && c <= '9') {
				5099	prec = c - '0';
				5100	while (--fmtcnt >= 0) {
				5101	c = Py_CHARMASK(*fmt++);
				5102	if (c < '0' \|\| c > '9')
				5103	break;
				5104	if ((prec*10) / 10 != prec) {
				5105	PyErr_SetString(PyExc_ValueError,
				5106	"prec too big");
				5107	goto onError;
				5108	}
				5109	prec = prec*10 + (c - '0');
				5110	}
				5111	}
				5112	} /* prec */
				5113	if (fmtcnt >= 0) {
				5114	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5115	if (--fmtcnt >= 0)
				5116	c = *fmt++;
				5117	}
				5118	}
				5119	if (fmtcnt < 0) {
				5120	PyErr_SetString(PyExc_ValueError,
				5121	"incomplete format");
				5122	goto onError;
				5123	}
				5124	if (c != '%') {
				5125	v = getnextarg(args, arglen, &argidx);
				5126	if (v == NULL)
				5127	goto onError;
				5128	}
				5129	sign = 0;
				5130	fill = ' ';
				5131	switch (c) {
				5132
				5133	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5134	pbuf = formatbuf;
				5135	/* presume that buffer length is at least 1 */
				5136	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5137	len = 1;
				5138	break;
				5139
				5140	case 's':
				5141	case 'r':
				5142	if (PyUnicode_Check(v) && c == 's') {
				5143	temp = v;
				5144	Py_INCREF(temp);
				5145	}
				5146	else {
				5147	PyObject *unicode;
				5148	if (c == 's')
				5149	temp = PyObject_Str(v);
				5150	else
				5151	temp = PyObject_Repr(v);
				5152	if (temp == NULL)
				5153	goto onError;
				5154	if (!PyString_Check(temp)) {
				5155	/* XXX Note: this should never happen, since
				5156	PyObject_Repr() and PyObject_Str() assure
				5157	this */
				5158	Py_DECREF(temp);
				5159	PyErr_SetString(PyExc_TypeError,
				5160	"%s argument has non-string str()");
				5161	goto onError;
				5162	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5163	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5164	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5165	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5166	"strict");
				5167	Py_DECREF(temp);
				5168	temp = unicode;
				5169	if (temp == NULL)
				5170	goto onError;
				5171	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5172	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5173	len = PyUnicode_GET_SIZE(temp);
				5174	if (prec >= 0 && len > prec)
				5175	len = prec;
				5176	break;
				5177
				5178	case 'i':
				5179	case 'd':
				5180	case 'u':
				5181	case 'o':
				5182	case 'x':
				5183	case 'X':
				5184	if (c == 'i')
				5185	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5186	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5187	temp = formatlong(v, flags, prec, c);
				5188	if (!temp)
				5189	goto onError;
				5190	pbuf = PyUnicode_AS_UNICODE(temp);
				5191	len = PyUnicode_GET_SIZE(temp);
				5192	/* unbounded ints can always produce
				5193	a sign character! */
				5194	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5195	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5196	else {
				5197	pbuf = formatbuf;
				5198	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5199	flags, prec, c, v);
				5200	if (len < 0)
				5201	goto onError;
				5202	/* only d conversion is signed */
				5203	sign = c == 'd';
				5204	}
				5205	if (flags & F_ZERO)
				5206	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5207	break;
				5208
				5209	case 'e':
				5210	case 'E':
				5211	case 'f':
				5212	case 'g':
				5213	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5214	pbuf = formatbuf;
				5215	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5216	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5217	if (len < 0)
				5218	goto onError;
				5219	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5220	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5221	fill = '0';
				5222	break;
				5223
				5224	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5225	pbuf = formatbuf;
				5226	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5227	if (len < 0)
				5228	goto onError;
				5229	break;
				5230
				5231	default:
				5232	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5233	"unsupported format character '%c' (0x%x) "
				5234	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5235	(31<=c && c<=126) ? c : '?',
				5236	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5237	goto onError;
				5238	}
				5239	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5240	if (pbuf == '-' \|\| pbuf == '+') {
				5241	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5242	len--;
				5243	}
				5244	else if (flags & F_SIGN)
				5245	sign = '+';
				5246	else if (flags & F_BLANK)
				5247	sign = ' ';
				5248	else
				5249	sign = 0;
				5250	}
				5251	if (width < len)
				5252	width = len;
				5253	if (rescnt < width + (sign != 0)) {
				5254	reslen -= rescnt;
				5255	rescnt = width + fmtcnt + 100;
				5256	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5257	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5258	return NULL;
				5259	res = PyUnicode_AS_UNICODE(result)
				5260	+ reslen - rescnt;
				5261	}
				5262	if (sign) {
				5263	if (fill != ' ')
				5264	*res++ = sign;
				5265	rescnt--;
				5266	if (width > len)
				5267	width--;
				5268	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5269	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5270	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5271	assert(pbuf[1] == c);
				5272	if (fill != ' ') {
				5273	res++ = pbuf++;
				5274	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5275	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5276	rescnt -= 2;
				5277	width -= 2;
				5278	if (width < 0)
				5279	width = 0;
				5280	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5281	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5282	if (width > len && !(flags & F_LJUST)) {
				5283	do {
				5284	--rescnt;
				5285	*res++ = fill;
				5286	} while (--width > len);
				5287	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5288	if (fill == ' ') {
				5289	if (sign)
				5290	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5291	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5292	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5293	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5294	res++ = pbuf++;
				5295	res++ = pbuf++;
				5296	}
				5297	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5298	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5299	res += len;
				5300	rescnt -= len;
				5301	while (--width >= len) {
				5302	--rescnt;
				5303	*res++ = ' ';
				5304	}
				5305	if (dict && (argidx < arglen) && c != '%') {
				5306	PyErr_SetString(PyExc_TypeError,
				5307	"not all arguments converted");
				5308	goto onError;
				5309	}
				5310	Py_XDECREF(temp);
				5311	} /* '%' */
				5312	} /* until end */
				5313	if (argidx < arglen && !dict) {
				5314	PyErr_SetString(PyExc_TypeError,
				5315	"not all arguments converted");
				5316	goto onError;
				5317	}
				5318
				5319	if (args_owned) {
				5320	Py_DECREF(args);
				5321	}
				5322	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5323	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5324	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5325	return (PyObject *)result;
				5326
				5327	onError:
				5328	Py_XDECREF(result);
				5329	Py_DECREF(uformat);
				5330	if (args_owned) {
				5331	Py_DECREF(args);
				5332	}
				5333	return NULL;
				5334	}
				5335
/* Buffer-procedure table for unicode objects.  It is referenced from the
   tp_as_buffer slot of PyUnicode_Type.  The initializer is positional, so
   the order of the four entries must not be changed; each handler is cast
   to the function-pointer type of the slot it fills. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,    /* read-buffer handler */
    (getwritebufferproc) unicode_buffer_getwritebuf,  /* write-buffer handler */
    (getsegcountproc) unicode_buffer_getsegcount,     /* segment-count handler */
    (getcharbufferproc) unicode_buffer_getcharbuf,    /* char-buffer handler */
};
				5342
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5343	static PyObject *
				5344	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5345	{
				5346	PyObject *x = NULL;
				5347	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5348	char *encoding = NULL;
				5349	char *errors = NULL;
				5350
				5351	assert(type == &PyUnicode_Type);
				5352	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5353	kwlist, &x, &encoding, &errors))
				5354	return NULL;
				5355	if (x == NULL)
				5356	return (PyObject *)_PyUnicode_New(0);
				5357	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5358	}
				5359
				5360	static char unicode_doc[] =
				5361	"unicode(string [, encoding[, errors]]) -> object\n\
				5362	\n\
				5363	Create a new Unicode object from the given encoded string.\n\
				5364	encoding defaults to the current default string encoding and \n\
				5365	errors, defining the error handling, to 'strict'.";
				5366
/* Type object for unicode strings.

   The initializer is positional: every entry must stay in exactly this
   order, and a 0 means the slot is left unset.  The trailing comment on
   each line names the slot being filled. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    0,                                  /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new: constructor for unicode(...) */
};
				5409
				5410	/* Initialize the Unicode implementation */
				5411
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5412	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5413	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5414	int i;
				5415
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5416	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5417	unicode_freelist = NULL;
				5418	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5419	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5420	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5421	for (i = 0; i < 256; i++)
				5422	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5423	}
				5424
				5425	/* Finalize the Unicode implementation */
				5426
				5427	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5428	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5429	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5430	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5431	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5432
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5433	Py_XDECREF(unicode_empty);
				5434	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5435
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5436	for (i = 0; i < 256; i++) {
				5437	if (unicode_latin1[i]) {
				5438	Py_DECREF(unicode_latin1[i]);
				5439	unicode_latin1[i] = NULL;
				5440	}
				5441	}
				5442
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5443	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5444	PyUnicodeObject *v = u;
				5445	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5446	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5447	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5448	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5449	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5450	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5451	unicode_freelist = NULL;
				5452	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5453	}