Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 50f2f5c627349cd521cab1716686f24d64a4cd03 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Maximum number of Unicode objects that may be parked on the free
   list at any one time. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object placed on the free list keeps its character buffer as
   long as its length is below this limit; longer buffers are released
   immediately.  Reusing the retained buffers saves malloc() calls for
   small Unicode objects.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory lying around.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If Unicode objects cause
   core dumps, switch it off first.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is set */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	401	const char *s;
				402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	405
				406	if (obj == NULL) {
				407	PyErr_BadInternalCall();
				408	return NULL;
				409	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410
				411	/* Coerce object */
				412	if (PyInstance_Check(obj)) {
				413	PyObject *func;
				414	func = PyObject_GetAttrString(obj, "__str__");
				415	if (func == NULL) {
				416	PyErr_SetString(PyExc_TypeError,
				417	"coercing to Unicode: instance doesn't define __str__");
				418	return NULL;
				419	}
				420	obj = PyEval_CallObject(func, NULL);
				421	Py_DECREF(func);
				422	if (obj == NULL)
				423	return NULL;
				424	owned = 1;
				425	}
				426	if (PyUnicode_Check(obj)) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	427	if (encoding) {
Tim Peters	78e0fc7	2001-09-11 03:07:38 +0000	[diff] [blame]	428	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	429	"decoding Unicode is not supported");
Tim Peters	78e0fc7	2001-09-11 03:07:38 +0000	[diff] [blame]	430	return NULL;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	431	}
Tim Peters	78e0fc7	2001-09-11 03:07:38 +0000	[diff] [blame]	432	if (PyUnicode_CheckExact(obj)) {
				433	Py_INCREF(obj);
				434	v = obj;
				435	}
				436	else {
				437	/* For a subclass of unicode, return a true unicode object
				438	with the same string value. */
				439	v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				440	PyUnicode_GET_SIZE(obj));
				441	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	442	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	443	}
				444	else if (PyString_Check(obj)) {
				445	s = PyString_AS_STRING(obj);
				446	len = PyString_GET_SIZE(obj);
				447	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	448	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				449	/* Overwrite the error message with something more useful in
				450	case of a TypeError. */
				451	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	452	PyErr_Format(PyExc_TypeError,
				453	"coercing to Unicode: need string or buffer, "
				454	"%.80s found",
				455	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	456	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	457	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	458
				459	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	460	if (len == 0) {
				461	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	462	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	463	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	464	else
				465	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	466
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	467	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	468	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	470	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	471	return v;
				472
				473	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	474	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	475	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	476	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	477	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	478	}
				479
				480	PyObject PyUnicode_Decode(const char s,
				481	int size,
				482	const char *encoding,
				483	const char *errors)
				484	{
				485	PyObject buffer = NULL, unicode;
				486
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	487	if (encoding == NULL)
				488	encoding = PyUnicode_GetDefaultEncoding();
				489
				490	/* Shortcuts for common default encodings */
				491	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	492	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	493	else if (strcmp(encoding, "latin-1") == 0)
				494	return PyUnicode_DecodeLatin1(s, size, errors);
				495	else if (strcmp(encoding, "ascii") == 0)
				496	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	497
				498	/* Decode via the codec registry */
				499	buffer = PyBuffer_FromMemory((void *)s, size);
				500	if (buffer == NULL)
				501	goto onError;
				502	unicode = PyCodec_Decode(buffer, encoding, errors);
				503	if (unicode == NULL)
				504	goto onError;
				505	if (!PyUnicode_Check(unicode)) {
				506	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	507	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	508	unicode->ob_type->tp_name);
				509	Py_DECREF(unicode);
				510	goto onError;
				511	}
				512	Py_DECREF(buffer);
				513	return unicode;
				514
				515	onError:
				516	Py_XDECREF(buffer);
				517	return NULL;
				518	}
				519
				520	PyObject PyUnicode_Encode(const Py_UNICODE s,
				521	int size,
				522	const char *encoding,
				523	const char *errors)
				524	{
				525	PyObject v, unicode;
				526
				527	unicode = PyUnicode_FromUnicode(s, size);
				528	if (unicode == NULL)
				529	return NULL;
				530	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				531	Py_DECREF(unicode);
				532	return v;
				533	}
				534
				535	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				536	const char *encoding,
				537	const char *errors)
				538	{
				539	PyObject *v;
				540
				541	if (!PyUnicode_Check(unicode)) {
				542	PyErr_BadArgument();
				543	goto onError;
				544	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	545
				546	if (encoding == NULL)
				547	encoding = PyUnicode_GetDefaultEncoding();
				548
				549	/* Shortcuts for common default encodings */
				550	if (errors == NULL) {
				551	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	552	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	553	else if (strcmp(encoding, "latin-1") == 0)
				554	return PyUnicode_AsLatin1String(unicode);
				555	else if (strcmp(encoding, "ascii") == 0)
				556	return PyUnicode_AsASCIIString(unicode);
				557	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	558
				559	/* Encode via the codec registry */
				560	v = PyCodec_Encode(unicode, encoding, errors);
				561	if (v == NULL)
				562	goto onError;
				563	/* XXX Should we really enforce this ? */
				564	if (!PyString_Check(v)) {
				565	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	566	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	567	v->ob_type->tp_name);
				568	Py_DECREF(v);
				569	goto onError;
				570	}
				571	return v;
				572
				573	onError:
				574	return NULL;
				575	}
				576
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	577	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				578	const char *errors)
				579	{
				580	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				581
				582	if (v)
				583	return v;
				584	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				585	if (v && errors == NULL)
				586	((PyUnicodeObject *)unicode)->defenc = v;
				587	return v;
				588	}
				589
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	590	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				591	{
				592	if (!PyUnicode_Check(unicode)) {
				593	PyErr_BadArgument();
				594	goto onError;
				595	}
				596	return PyUnicode_AS_UNICODE(unicode);
				597
				598	onError:
				599	return NULL;
				600	}
				601
				602	int PyUnicode_GetSize(PyObject *unicode)
				603	{
				604	if (!PyUnicode_Check(unicode)) {
				605	PyErr_BadArgument();
				606	goto onError;
				607	}
				608	return PyUnicode_GET_SIZE(unicode);
				609
				610	onError:
				611	return -1;
				612	}
				613
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	614	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	615	{
				616	return unicode_default_encoding;
				617	}
				618
				619	int PyUnicode_SetDefaultEncoding(const char *encoding)
				620	{
				621	PyObject *v;
				622
				623	/* Make sure the encoding is valid. As side effect, this also
				624	loads the encoding into the codec registry cache. */
				625	v = _PyCodec_Lookup(encoding);
				626	if (v == NULL)
				627	goto onError;
				628	Py_DECREF(v);
				629	strncpy(unicode_default_encoding,
				630	encoding,
				631	sizeof(unicode_default_encoding));
				632	return 0;
				633
				634	onError:
				635	return -1;
				636	}
				637
Marc-André Lemburg	c60e6f7	2001-09-20 10:35:46 +0000	[diff] [blame^]	638	/* --- UTF-7 Codec -------------------------------------------------------- */
				639
				640	/* see RFC2152 for details */
				641
				642	static
				643	char utf7_special[128] = {
				644	/* indicate whether a UTF-7 character is special i.e. cannot be directly
				645	encoded:
				646	0 - not special
				647	1 - special
				648	2 - whitespace (optional)
				649	3 - RFC2152 Set O (optional) */
				650	1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
				651	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				652	2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
				653	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
				654	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				655	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
				656	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				657	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
				658
				659	};
				660
				661	#define SPECIAL(c, encodeO, encodeWS) \
				662	(((c)>127 \|\| utf7_special[(c)] == 1) \|\| \
				663	(encodeWS && (utf7_special[(c)] == 2)) \|\| \
				664	(encodeO && (utf7_special[(c)] == 3)))
				665
				666	#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
				667	#define B64CHAR(c) (isalnum(c) \|\| (c) == '+' \|\| (c) == '/')
				668	#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
				669	(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
				670
				671	#define ENCODE(out, ch, bits) \
				672	while (bits >= 6) { \
				673	*out++ = B64(ch >> (bits-6)); \
				674	bits -= 6; \
				675	}
				676
				677	#define DECODE(out, ch, bits, surrogate) \
				678	while (bits >= 16) { \
				679	Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
				680	bits -= 16; \
				681	if (surrogate) { \
				682	/* We have already generated an error for the high surrogate
				683	so let's not bother seeing if the low surrogate is correct or not */\
				684	surrogate = 0; \
				685	} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
				686	/* This is a surrogate pair. Unfortunately we can't represent \
				687	it in a 16-bit character */ \
				688	surrogate = 1; \
				689	errmsg = "code pairs are not supported"; \
				690	goto utf7Error; \
				691	} else { \
				692	*out++ = outCh; \
				693	} \
				694	} \
				695
				696	static
				697	int utf7_decoding_error(Py_UNICODE **dest,
				698	const char *errors,
				699	const char *details)
				700	{
				701	if ((errors == NULL) \|\|
				702	(strcmp(errors,"strict") == 0)) {
				703	PyErr_Format(PyExc_UnicodeError,
				704	"UTF-7 decoding error: %.400s",
				705	details);
				706	return -1;
				707	}
				708	else if (strcmp(errors,"ignore") == 0) {
				709	return 0;
				710	}
				711	else if (strcmp(errors,"replace") == 0) {
				712	if (dest != NULL) {
				713	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				714	(*dest)++;
				715	}
				716	return 0;
				717	}
				718	else {
				719	PyErr_Format(PyExc_ValueError,
				720	"UTF-7 decoding error; unknown error handling code: %.400s",
				721	errors);
				722	return -1;
				723	}
				724	}
				725
				726	PyObject PyUnicode_DecodeUTF7(const char s,
				727	int size,
				728	const char *errors)
				729	{
				730	const char *e;
				731	PyUnicodeObject *unicode;
				732	Py_UNICODE *p;
				733	const char *errmsg = "";
				734	int inShift = 0;
				735	unsigned int bitsleft = 0;
				736	unsigned long charsleft = 0;
				737	int surrogate = 0;
				738
				739	unicode = _PyUnicode_New(size);
				740	if (!unicode)
				741	return NULL;
				742	if (size == 0)
				743	return (PyObject *)unicode;
				744
				745	p = unicode->str;
				746	e = s + size;
				747
				748	while (s < e) {
				749	Py_UNICODE ch = *s;
				750
				751	if (inShift) {
				752	if ((ch == '-') \|\| !B64CHAR(ch)) {
				753	inShift = 0;
				754	s++;
				755
				756	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				757	if (bitsleft >= 6) {
				758	/* The shift sequence has a partial character in it. If
				759	bitsleft < 6 then we could just classify it as padding
				760	but that is not the case here */
				761
				762	errmsg = "partial character in shift sequence";
				763	goto utf7Error;
				764	}
				765	/* According to RFC2152 the remaining bits should be zero. We
				766	choose to signal an error/insert a replacement character
				767	here so indicate the potential of a misencoded character. */
				768
				769	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				770	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				771	errmsg = "non-zero padding bits in shift sequence";
				772	goto utf7Error;
				773	}
				774
				775	if (ch == '-') {
				776	if ((s < e) && (*(s) == '-')) {
				777	*p++ = '-';
				778	inShift = 1;
				779	}
				780	} else if (SPECIAL(ch,0,0)) {
				781	errmsg = "unexpected special character";
				782	goto utf7Error;
				783	} else {
				784	*p++ = ch;
				785	}
				786	} else {
				787	charsleft = (charsleft << 6) \| UB64(ch);
				788	bitsleft += 6;
				789	s++;
				790	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				791	}
				792	}
				793	else if ( ch == '+' ) {
				794	s++;
				795	if (s < e && *s == '-') {
				796	s++;
				797	*p++ = '+';
				798	} else
				799	{
				800	inShift = 1;
				801	bitsleft = 0;
				802	}
				803	}
				804	else if (SPECIAL(ch,0,0)) {
				805	errmsg = "unexpected special character";
				806	s++;
				807	goto utf7Error;
				808	}
				809	else {
				810	*p++ = ch;
				811	s++;
				812	}
				813	continue;
				814	utf7Error:
				815	if (utf7_decoding_error(&p, errors, errmsg))
				816	goto onError;
				817	}
				818
				819	if (inShift) {
				820	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				821	goto onError;
				822	}
				823
				824	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				825	goto onError;
				826
				827	return (PyObject *)unicode;
				828
				829	onError:
				830	Py_DECREF(unicode);
				831	return NULL;
				832	}
				833
				834
				835	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				836	int size,
				837	int encodeSetO,
				838	int encodeWhiteSpace,
				839	const char *errors)
				840	{
				841	PyObject *v;
				842	/* It might be possible to tighten this worst case */
				843	unsigned int cbAllocated = 5 * size;
				844	int inShift = 0;
				845	int i = 0;
				846	unsigned int bitsleft = 0;
				847	unsigned long charsleft = 0;
				848	char * out;
				849	char * start;
				850
				851	if (size == 0)
				852	return PyString_FromStringAndSize(NULL, 0);
				853
				854	v = PyString_FromStringAndSize(NULL, cbAllocated);
				855	if (v == NULL)
				856	return NULL;
				857
				858	start = out = PyString_AS_STRING(v);
				859	for (;i < size; ++i) {
				860	Py_UNICODE ch = s[i];
				861
				862	if (!inShift) {
				863	if (ch == '+') {
				864	*out++ = '+';
				865	*out++ = '-';
				866	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				867	charsleft = ch;
				868	bitsleft = 16;
				869	*out++ = '+';
				870	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				871	inShift = bitsleft > 0;
				872	} else {
				873	*out++ = (char) ch;
				874	}
				875	} else {
				876	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				877	*out++ = B64(charsleft << (6-bitsleft));
				878	charsleft = 0;
				879	bitsleft = 0;
				880	/* Characters not in the BASE64 set implicitly unshift the sequence
				881	so no '-' is required, except if the character is itself a '-' */
				882	if (B64CHAR(ch) \|\| ch == '-') {
				883	*out++ = '-';
				884	}
				885	inShift = 0;
				886	*out++ = (char) ch;
				887	} else {
				888	bitsleft += 16;
				889	charsleft = (charsleft << 16) \| ch;
				890	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				891
				892	/* If the next character is special then we dont' need to terminate
				893	the shift sequence. If the next character is not a BASE64 character
				894	or '-' then the shift sequence will be terminated implicitly and we
				895	don't have to insert a '-'. */
				896
				897	if (bitsleft == 0) {
				898	if (i + 1 < size) {
				899	Py_UNICODE ch2 = s[i+1];
				900
				901	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				902
				903	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				904	*out++ = '-';
				905	inShift = 0;
				906	} else {
				907	inShift = 0;
				908	}
				909
				910	}
				911	else {
				912	*out++ = '-';
				913	inShift = 0;
				914	}
				915	}
				916	}
				917	}
				918	}
				919	if (bitsleft) {
				920	*out++= B64(charsleft << (6-bitsleft) );
				921	*out++ = '-';
				922	}
				923
				924	if (_PyString_Resize(&v, out - start)) {
				925	Py_DECREF(v);
				926	return NULL;
				927	}
				928	return v;
				929	}
				930
/* The UTF-7 helper macros are not used past this point; undefine them so
   their short names cannot leak into the rest of the file. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
				937
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	938	/* --- UTF-8 Codec -------------------------------------------------------- */
				939
				940	static
				941	char utf8_code_length[256] = {
				942	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				943	illegal prefix. see RFC 2279 for details */
				944	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				945	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				946	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				947	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				948	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				949	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				950	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				951	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				952	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				953	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				954	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				955	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				956	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				957	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				958	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				959	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				960	};
				961
				962	static
				963	int utf8_decoding_error(const char **source,
				964	Py_UNICODE **dest,
				965	const char *errors,
				966	const char *details)
				967	{
				968	if ((errors == NULL) \|\|
				969	(strcmp(errors,"strict") == 0)) {
				970	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	971	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	972	details);
				973	return -1;
				974	}
				975	else if (strcmp(errors,"ignore") == 0) {
				976	(*source)++;
				977	return 0;
				978	}
				979	else if (strcmp(errors,"replace") == 0) {
				980	(*source)++;
				981	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				982	(*dest)++;
				983	return 0;
				984	}
				985	else {
				986	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	987	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	988	errors);
				989	return -1;
				990	}
				991	}
				992
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	993	PyObject PyUnicode_DecodeUTF8(const char s,
				994	int size,
				995	const char *errors)
				996	{
				997	int n;
				998	const char *e;
				999	PyUnicodeObject *unicode;
				1000	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1001	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1002
				1003	/* Note: size will always be longer than the resulting Unicode
				1004	character count */
				1005	unicode = _PyUnicode_New(size);
				1006	if (!unicode)
				1007	return NULL;
				1008	if (size == 0)
				1009	return (PyObject *)unicode;
				1010
				1011	/* Unpack UTF-8 encoded data */
				1012	p = unicode->str;
				1013	e = s + size;
				1014
				1015	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1016	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1017
				1018	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1019	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1020	s++;
				1021	continue;
				1022	}
				1023
				1024	n = utf8_code_length[ch];
				1025
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1026	if (s + n > e) {
				1027	errmsg = "unexpected end of data";
				1028	goto utf8Error;
				1029	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1030
				1031	switch (n) {
				1032
				1033	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1034	errmsg = "unexpected code byte";
				1035	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1036
				1037	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1038	errmsg = "internal error";
				1039	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1040
				1041	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1042	if ((s[1] & 0xc0) != 0x80) {
				1043	errmsg = "invalid data";
				1044	goto utf8Error;
				1045	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1046	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1047	if (ch < 0x80) {
				1048	errmsg = "illegal encoding";
				1049	goto utf8Error;
				1050	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1051	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1052	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053	break;
				1054
				1055	case 3:
				1056	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1057	(s[2] & 0xc0) != 0x80) {
				1058	errmsg = "invalid data";
				1059	goto utf8Error;
				1060	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1061	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1062	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				1063	errmsg = "illegal encoding";
				1064	goto utf8Error;
				1065	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1066	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1067	*p++ = (Py_UNICODE)ch;
				1068	break;
				1069
				1070	case 4:
				1071	if ((s[1] & 0xc0) != 0x80 \|\|
				1072	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1073	(s[3] & 0xc0) != 0x80) {
				1074	errmsg = "invalid data";
				1075	goto utf8Error;
				1076	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1077	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1078	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1079	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1080	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1081	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1082	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1083	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1084	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1085	errmsg = "illegal encoding";
				1086	goto utf8Error;
				1087	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1088	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1089	*p++ = (Py_UNICODE)ch;
				1090	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1091	/* compute and append the two surrogates: */
				1092
				1093	/* translate from 10000..10FFFF to 0..FFFF */
				1094	ch -= 0x10000;
				1095
				1096	/* high surrogate = top 10 bits added to D800 */
				1097	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1098
				1099	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1100	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1101	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1102	break;
				1103
				1104	default:
				1105	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1106	errmsg = "unsupported Unicode code range";
				1107	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	}
				1109	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1110	continue;
				1111
				1112	utf8Error:
				1113	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1114	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	}
				1116
				1117	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1118	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1119	goto onError;
				1120
				1121	return (PyObject *)unicode;
				1122
				1123	onError:
				1124	Py_DECREF(unicode);
				1125	return NULL;
				1126	}
				1127
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error policy named by `errors` to a UTF-8 encoding problem.

   errors == NULL or "strict": set UnicodeError and return -1.
   "ignore":                   return 0 without writing anything.
   "replace":                  store '?' at *dest, advance *dest, return 0.
   anything else:              set ValueError and return -1.

   `source` is accepted but not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if ((errors == NULL) ||
        (strcmp(errors,"strict") == 0)) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
        return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }
    else {
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1161
				1162	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				1163	int size,
				1164	const char *errors)
				1165	{
				1166	PyObject *v;
				1167	char *p;
				1168	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1169	Py_UCS4 ch2;
				1170	unsigned int cbAllocated = 3 * size;
				1171	unsigned int cbWritten = 0;
				1172	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1173
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1174	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	if (v == NULL)
				1176	return NULL;
				1177	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1178	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1179
				1180	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1181	while (i < size) {
				1182	Py_UCS4 ch = s[i++];
				1183	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1184	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1185	cbWritten++;
				1186	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1187	else if (ch < 0x0800) {
				1188	*p++ = 0xc0 \| (ch >> 6);
				1189	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1190	cbWritten += 2;
				1191	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1192	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1193	/* Check for high surrogate */
				1194	if (0xD800 <= ch && ch <= 0xDBFF) {
				1195	if (i != size) {
				1196	ch2 = s[i];
				1197	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				1198
				1199	if (cbWritten >= (cbAllocated - 4)) {
				1200	/* Provide enough room for some more
				1201	surrogates */
				1202	cbAllocated += 4*10;
				1203	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1204	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1205	}
				1206
				1207	/* combine the two values */
				1208	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				1209
				1210	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1211	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1212	i++;
				1213	cbWritten += 4;
				1214	}
				1215	}
				1216	}
				1217	else {
				1218	*p++ = (char)(0xe0 \| (ch >> 12));
				1219	cbWritten += 3;
				1220	}
				1221	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1222	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1223	} else {
				1224	*p++ = 0xf0 \| (ch>>18);
				1225	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				1226	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				1227	*p++ = 0x80 \| (ch & 0x3f);
				1228	cbWritten += 4;
				1229	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1230	}
				1231	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1232	if (_PyString_Resize(&v, p - q))
				1233	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1234	return v;
				1235
				1236	onError:
				1237	Py_DECREF(v);
				1238	return NULL;
				1239	}
				1240
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1241	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1242	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1243	if (!PyUnicode_Check(unicode)) {
				1244	PyErr_BadArgument();
				1245	return NULL;
				1246	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1247	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1248	PyUnicode_GET_SIZE(unicode),
				1249	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1250	}
				1251
				1252	/* --- UTF-16 Codec ------------------------------------------------------- */
				1253
				1254	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1255	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1256	const char *errors,
				1257	const char *details)
				1258	{
				1259	if ((errors == NULL) \|\|
				1260	(strcmp(errors,"strict") == 0)) {
				1261	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1262	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1263	details);
				1264	return -1;
				1265	}
				1266	else if (strcmp(errors,"ignore") == 0) {
				1267	return 0;
				1268	}
				1269	else if (strcmp(errors,"replace") == 0) {
				1270	if (dest) {
				1271	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1272	(*dest)++;
				1273	}
				1274	return 0;
				1275	}
				1276	else {
				1277	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1278	"UTF-16 decoding error; "
				1279	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1280	errors);
				1281	return -1;
				1282	}
				1283	}
				1284
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1285	PyObject *
				1286	PyUnicode_DecodeUTF16(const char *s,
				1287	int size,
				1288	const char *errors,
				1289	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1290	{
				1291	PyUnicodeObject *unicode;
				1292	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1293	const unsigned char q, e;
				1294	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1295	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1296	/* Offsets from q for retrieving byte pairs in the right order. */
				1297	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1298	int ihi = 1, ilo = 0;
				1299	#else
				1300	int ihi = 0, ilo = 1;
				1301	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1302
				1303	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1304	if (size & 1) {
				1305	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1306	return NULL;
				1307	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1308	}
				1309
				1310	/* Note: size will always be longer than the resulting Unicode
				1311	character count */
				1312	unicode = _PyUnicode_New(size);
				1313	if (!unicode)
				1314	return NULL;
				1315	if (size == 0)
				1316	return (PyObject *)unicode;
				1317
				1318	/* Unpack UTF-16 encoded data */
				1319	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1320	q = (unsigned char *)s;
				1321	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1322
				1323	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1324	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1325
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1326	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1327	byte order setting accordingly. In native mode, the leading BOM
				1328	mark is skipped, in all other modes, it is copied to the output
				1329	stream as-is (giving a ZWNBSP character). */
				1330	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1331	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1332	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1333	if (bom == 0xFEFF) {
				1334	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1335	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1336	}
				1337	else if (bom == 0xFFFE) {
				1338	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1339	bo = 1;
				1340	}
				1341	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1342	if (bom == 0xFEFF) {
				1343	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1344	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1345	}
				1346	else if (bom == 0xFFFE) {
				1347	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1348	bo = -1;
				1349	}
				1350	#endif
				1351	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1352
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1353	if (bo == -1) {
				1354	/* force LE */
				1355	ihi = 1;
				1356	ilo = 0;
				1357	}
				1358	else if (bo == 1) {
				1359	/* force BE */
				1360	ihi = 0;
				1361	ilo = 1;
				1362	}
				1363
				1364	while (q < e) {
				1365	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1366	q += 2;
				1367
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1368	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1369	*p++ = ch;
				1370	continue;
				1371	}
				1372
				1373	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1374	if (q >= e) {
				1375	errmsg = "unexpected end of data";
				1376	goto utf16Error;
				1377	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1378	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1379	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1380	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1381	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1382	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1383	*p++ = ch;
				1384	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1385	#else
				1386	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1387	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1388	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1389	}
				1390	else {
				1391	errmsg = "illegal UTF-16 surrogate";
				1392	goto utf16Error;
				1393	}
				1394
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1395	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1396	errmsg = "illegal encoding";
				1397	/* Fall through to report the error */
				1398
				1399	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1400	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1401	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1402	}
				1403
				1404	if (byteorder)
				1405	*byteorder = bo;
				1406
				1407	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1408	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1409	goto onError;
				1410
				1411	return (PyObject *)unicode;
				1412
				1413	onError:
				1414	Py_DECREF(unicode);
				1415	return NULL;
				1416	}
				1417
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1418	PyObject *
				1419	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1420	int size,
				1421	const char *errors,
				1422	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1423	{
				1424	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1425	unsigned char *p;
				1426	int i, pairs;
				1427	/* Offsets from p for storing byte pairs in the right order. */
				1428	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1429	int ihi = 1, ilo = 0;
				1430	#else
				1431	int ihi = 0, ilo = 1;
				1432	#endif
				1433
				1434	#define STORECHAR(CH) \
				1435	do { \
				1436	p[ihi] = ((CH) >> 8) & 0xff; \
				1437	p[ilo] = (CH) & 0xff; \
				1438	p += 2; \
				1439	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1440
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1441	for (i = pairs = 0; i < size; i++)
				1442	if (s[i] >= 0x10000)
				1443	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1444	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1445	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1446	if (v == NULL)
				1447	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1448
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1449	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1450	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1451	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1452	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1453	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1454
				1455	if (byteorder == -1) {
				1456	/* force LE */
				1457	ihi = 1;
				1458	ilo = 0;
				1459	}
				1460	else if (byteorder == 1) {
				1461	/* force BE */
				1462	ihi = 0;
				1463	ilo = 1;
				1464	}
				1465
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1466	while (size-- > 0) {
				1467	Py_UNICODE ch = *s++;
				1468	Py_UNICODE ch2 = 0;
				1469	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1470	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1471	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1472	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1473	STORECHAR(ch);
				1474	if (ch2)
				1475	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1476	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1477	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1478	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1479	}
				1480
				1481	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1482	{
				1483	if (!PyUnicode_Check(unicode)) {
				1484	PyErr_BadArgument();
				1485	return NULL;
				1486	}
				1487	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1488	PyUnicode_GET_SIZE(unicode),
				1489	NULL,
				1490	0);
				1491	}
				1492
				1493	/* --- Unicode Escape Codec ----------------------------------------------- */
				1494
				1495	static
				1496	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1497	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1498	const char *errors,
				1499	const char *details)
				1500	{
				1501	if ((errors == NULL) \|\|
				1502	(strcmp(errors,"strict") == 0)) {
				1503	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1504	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1505	details);
				1506	return -1;
				1507	}
				1508	else if (strcmp(errors,"ignore") == 0) {
				1509	return 0;
				1510	}
				1511	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1512	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1513	return 0;
				1514	}
				1515	else {
				1516	PyErr_Format(PyExc_ValueError,
				1517	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1518	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1519	errors);
				1520	return -1;
				1521	}
				1522	}
				1523
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably filled in on first use by the character-name
   escape handling further down -- confirm against the code that assigns
   it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1525
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1526	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1527	int size,
				1528	const char *errors)
				1529	{
				1530	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1531	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1532	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1533	char* message;
				1534	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1535
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1536	/* Escaped strings will always be longer than the resulting
				1537	Unicode string, so we start with size here and then reduce the
				1538	length after conversion to the true value. */
				1539	v = _PyUnicode_New(size);
				1540	if (v == NULL)
				1541	goto onError;
				1542	if (size == 0)
				1543	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1544
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1545	p = buf = PyUnicode_AS_UNICODE(v);
				1546	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1547
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1548	while (s < end) {
				1549	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1550	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1551	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1552
				1553	/* Non-escape characters are interpreted as Unicode ordinals */
				1554	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1555	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1556	continue;
				1557	}
				1558
				1559	/* \ - Escapes */
				1560	s++;
				1561	switch (*s++) {
				1562
				1563	/* \x escapes */
				1564	case '\n': break;
				1565	case '\\': *p++ = '\\'; break;
				1566	case '\'': *p++ = '\''; break;
				1567	case '\"': *p++ = '\"'; break;
				1568	case 'b': *p++ = '\b'; break;
				1569	case 'f': p++ = '\014'; break; / FF */
				1570	case 't': *p++ = '\t'; break;
				1571	case 'n': *p++ = '\n'; break;
				1572	case 'r': *p++ = '\r'; break;
				1573	case 'v': p++ = '\013'; break; / VT */
				1574	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1575
				1576	/* \OOO (octal) escapes */
				1577	case '0': case '1': case '2': case '3':
				1578	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1579	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1580	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1581	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1582	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1583	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1584	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1585	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1586	break;
				1587
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1588	/* hex escapes */
				1589	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1591	digits = 2;
				1592	message = "truncated \\xXX escape";
				1593	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1594
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1595	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1596	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1597	digits = 4;
				1598	message = "truncated \\uXXXX escape";
				1599	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1600
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1601	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1602	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1603	digits = 8;
				1604	message = "truncated \\UXXXXXXXX escape";
				1605	hexescape:
				1606	chr = 0;
				1607	for (i = 0; i < digits; i++) {
				1608	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1609	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1610	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1611	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1612	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1613	i++;
				1614	break;
				1615	}
				1616	chr = (chr<<4) & ~0xF;
				1617	if (c >= '0' && c <= '9')
				1618	chr += c - '0';
				1619	else if (c >= 'a' && c <= 'f')
				1620	chr += 10 + c - 'a';
				1621	else
				1622	chr += 10 + c - 'A';
				1623	}
				1624	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1625	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1626	/* when we get here, chr is a 32-bit unicode character */
				1627	if (chr <= 0xffff)
				1628	/* UCS-2 character */
				1629	*p++ = (Py_UNICODE) chr;
				1630	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1631	/* UCS-4 character. Either store directly, or as
				1632	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1633	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1634	*p++ = chr;
				1635	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1636	chr -= 0x10000L;
				1637	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1638	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1639	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1640	} else {
				1641	if (unicodeescape_decoding_error(
				1642	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1643	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1644	)
				1645	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1646	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1647	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1648	break;
				1649
				1650	/* \N{name} */
				1651	case 'N':
				1652	message = "malformed \\N character escape";
				1653	if (ucnhash_CAPI == NULL) {
				1654	/* load the unicode data module */
				1655	PyObject m, v;
				1656	m = PyImport_ImportModule("unicodedata");
				1657	if (m == NULL)
				1658	goto ucnhashError;
				1659	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1660	Py_DECREF(m);
				1661	if (v == NULL)
				1662	goto ucnhashError;
				1663	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1664	Py_DECREF(v);
				1665	if (ucnhash_CAPI == NULL)
				1666	goto ucnhashError;
				1667	}
				1668	if (*s == '{') {
				1669	const char *start = s+1;
				1670	/* look for the closing brace */
				1671	while (*s != '}' && s < end)
				1672	s++;
				1673	if (s > start && s < end && *s == '}') {
				1674	/* found a name. look it up in the unicode database */
				1675	message = "unknown Unicode character name";
				1676	s++;
				1677	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1678	goto store;
				1679	}
				1680	}
				1681	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1682	goto onError;
				1683	*p++ = x;
				1684	break;
				1685
				1686	default:
				1687	*p++ = '\\';
				1688	*p++ = (unsigned char)s[-1];
				1689	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1690	}
				1691	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1692	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1693	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1694	return (PyObject *)v;
				1695
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1696	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1697	PyErr_SetString(
				1698	PyExc_UnicodeError,
				1699	"\\N escapes not supported (can't load unicodedata module)"
				1700	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1701	return NULL;
				1702
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1703	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1704	Py_XDECREF(v);
				1705	return NULL;
				1706	}
				1707
				1708	/* Return a Unicode-Escape string version of the Unicode object.
				1709
				1710	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1711	appropriate.
				1712
				1713	*/
				1714
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1715	static const Py_UNICODE findchar(const Py_UNICODE s,
				1716	int size,
				1717	Py_UNICODE ch);
				1718
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1719	static
				1720	PyObject unicodeescape_string(const Py_UNICODE s,
				1721	int size,
				1722	int quotes)
				1723	{
				1724	PyObject *repr;
				1725	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1726
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1727	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1728
				1729	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1730	if (repr == NULL)
				1731	return NULL;
				1732
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1733	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1734
				1735	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1736	*p++ = 'u';
				1737	*p++ = (findchar(s, size, '\'') &&
				1738	!findchar(s, size, '"')) ? '"' : '\'';
				1739	}
				1740	while (size-- > 0) {
				1741	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1742
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1743	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1744	if (quotes &&
				1745	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1746	*p++ = '\\';
				1747	*p++ = (char) ch;
				1748	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1749
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1750	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1751	/* Map 21-bit characters to '\U00xxxxxx' */
				1752	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1753	int offset = p - PyString_AS_STRING(repr);
				1754
				1755	/* Resize the string if necessary */
				1756	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1757	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1758	goto onError;
				1759	p = PyString_AS_STRING(repr) + offset;
				1760	}
				1761
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1762	*p++ = '\\';
				1763	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1764	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1765	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1766	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1767	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1768	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1769	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1770	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1771	*p++ = hexdigit[ch & 0x0000000F];
				1772	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1773	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1774	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1775	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1776	else if (ch >= 0xD800 && ch < 0xDC00) {
				1777	Py_UNICODE ch2;
				1778	Py_UCS4 ucs;
				1779
				1780	ch2 = *s++;
				1781	size--;
				1782	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1783	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1784	*p++ = '\\';
				1785	*p++ = 'U';
				1786	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1787	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1788	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1789	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1790	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1791	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1792	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1793	*p++ = hexdigit[ucs & 0x0000000F];
				1794	continue;
				1795	}
				1796	/* Fall through: isolated surrogates are copied as-is */
				1797	s--;
				1798	size++;
				1799	}
				1800
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1801	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1802	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1803	*p++ = '\\';
				1804	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1805	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1806	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1807	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1808	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1809	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1810
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1811	/* Map special whitespace to '\t', \n', '\r' */
				1812	else if (ch == '\t') {
				1813	*p++ = '\\';
				1814	*p++ = 't';
				1815	}
				1816	else if (ch == '\n') {
				1817	*p++ = '\\';
				1818	*p++ = 'n';
				1819	}
				1820	else if (ch == '\r') {
				1821	*p++ = '\\';
				1822	*p++ = 'r';
				1823	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1824
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1825	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1826	else if (ch < ' ' \|\| ch >= 128) {
				1827	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1828	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1829	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1830	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1831	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1832
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1833	/* Copy everything else as-is */
				1834	else
				1835	*p++ = (char) ch;
				1836	}
				1837	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1838	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1839
				1840	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1841	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1842	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1843
				1844	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1845
				1846	onError:
				1847	Py_DECREF(repr);
				1848	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1849	}
				1850
				1851	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1852	int size)
				1853	{
				1854	return unicodeescape_string(s, size, 0);
				1855	}
				1856
				1857	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1858	{
				1859	if (!PyUnicode_Check(unicode)) {
				1860	PyErr_BadArgument();
				1861	return NULL;
				1862	}
				1863	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1864	PyUnicode_GET_SIZE(unicode));
				1865	}
				1866
				1867	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1868
				1869	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1870	int size,
				1871	const char *errors)
				1872	{
				1873	PyUnicodeObject *v;
				1874	Py_UNICODE p, buf;
				1875	const char *end;
				1876	const char *bs;
				1877
				1878	/* Escaped strings will always be longer than the resulting
				1879	Unicode string, so we start with size here and then reduce the
				1880	length after conversion to the true value. */
				1881	v = _PyUnicode_New(size);
				1882	if (v == NULL)
				1883	goto onError;
				1884	if (size == 0)
				1885	return (PyObject *)v;
				1886	p = buf = PyUnicode_AS_UNICODE(v);
				1887	end = s + size;
				1888	while (s < end) {
				1889	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1890	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1891	int i;
				1892
				1893	/* Non-escape characters are interpreted as Unicode ordinals */
				1894	if (*s != '\\') {
				1895	p++ = (unsigned char)s++;
				1896	continue;
				1897	}
				1898
				1899	/* \u-escapes are only interpreted iff the number of leading
				1900	backslashes if odd */
				1901	bs = s;
				1902	for (;s < end;) {
				1903	if (*s != '\\')
				1904	break;
				1905	p++ = (unsigned char)s++;
				1906	}
				1907	if (((s - bs) & 1) == 0 \|\|
				1908	s >= end \|\|
				1909	*s != 'u') {
				1910	continue;
				1911	}
				1912	p--;
				1913	s++;
				1914
				1915	/* \uXXXX with 4 hex digits */
				1916	for (x = 0, i = 0; i < 4; i++) {
				1917	c = (unsigned char)s[i];
				1918	if (!isxdigit(c)) {
				1919	if (unicodeescape_decoding_error(&s, &x, errors,
				1920	"truncated \\uXXXX"))
				1921	goto onError;
				1922	i++;
				1923	break;
				1924	}
				1925	x = (x<<4) & ~0xF;
				1926	if (c >= '0' && c <= '9')
				1927	x += c - '0';
				1928	else if (c >= 'a' && c <= 'f')
				1929	x += 10 + c - 'a';
				1930	else
				1931	x += 10 + c - 'A';
				1932	}
				1933	s += i;
				1934	*p++ = x;
				1935	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1936	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1937	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1938	return (PyObject *)v;
				1939
				1940	onError:
				1941	Py_XDECREF(v);
				1942	return NULL;
				1943	}
				1944
				1945	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1946	int size)
				1947	{
				1948	PyObject *repr;
				1949	char *p;
				1950	char *q;
				1951
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1952	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1953
				1954	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1955	if (repr == NULL)
				1956	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1957	if (size == 0)
				1958	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1959
				1960	p = q = PyString_AS_STRING(repr);
				1961	while (size-- > 0) {
				1962	Py_UNICODE ch = *s++;
				1963	/* Map 16-bit characters to '\uxxxx' */
				1964	if (ch >= 256) {
				1965	*p++ = '\\';
				1966	*p++ = 'u';
				1967	*p++ = hexdigit[(ch >> 12) & 0xf];
				1968	*p++ = hexdigit[(ch >> 8) & 0xf];
				1969	*p++ = hexdigit[(ch >> 4) & 0xf];
				1970	*p++ = hexdigit[ch & 15];
				1971	}
				1972	/* Copy everything else as-is */
				1973	else
				1974	*p++ = (char) ch;
				1975	}
				1976	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1977	if (_PyString_Resize(&repr, p - q))
				1978	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1979
				1980	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1981
				1982	onError:
				1983	Py_DECREF(repr);
				1984	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1985	}
				1986
				1987	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1988	{
				1989	if (!PyUnicode_Check(unicode)) {
				1990	PyErr_BadArgument();
				1991	return NULL;
				1992	}
				1993	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1994	PyUnicode_GET_SIZE(unicode));
				1995	}
				1996
				1997	/* --- Latin-1 Codec ------------------------------------------------------ */
				1998
				1999	PyObject PyUnicode_DecodeLatin1(const char s,
				2000	int size,
				2001	const char *errors)
				2002	{
				2003	PyUnicodeObject *v;
				2004	Py_UNICODE *p;
				2005
				2006	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2007	if (size == 1 && (unsigned char)s < 256) {
				2008	Py_UNICODE r = (unsigned char)s;
				2009	return PyUnicode_FromUnicode(&r, 1);
				2010	}
				2011
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2012	v = _PyUnicode_New(size);
				2013	if (v == NULL)
				2014	goto onError;
				2015	if (size == 0)
				2016	return (PyObject *)v;
				2017	p = PyUnicode_AS_UNICODE(v);
				2018	while (size-- > 0)
				2019	p++ = (unsigned char)s++;
				2020	return (PyObject *)v;
				2021
				2022	onError:
				2023	Py_XDECREF(v);
				2024	return NULL;
				2025	}
				2026
				2027	static
				2028	int latin1_encoding_error(const Py_UNICODE **source,
				2029	char **dest,
				2030	const char *errors,
				2031	const char *details)
				2032	{
				2033	if ((errors == NULL) \|\|
				2034	(strcmp(errors,"strict") == 0)) {
				2035	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2036	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2037	details);
				2038	return -1;
				2039	}
				2040	else if (strcmp(errors,"ignore") == 0) {
				2041	return 0;
				2042	}
				2043	else if (strcmp(errors,"replace") == 0) {
				2044	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2045	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2046	return 0;
				2047	}
				2048	else {
				2049	PyErr_Format(PyExc_ValueError,
				2050	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2051	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2052	errors);
				2053	return -1;
				2054	}
				2055	}
				2056
				2057	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2058	int size,
				2059	const char *errors)
				2060	{
				2061	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2062	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2063
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2064	repr = PyString_FromStringAndSize(NULL, size);
				2065	if (repr == NULL)
				2066	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2067	if (size == 0)
				2068	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2069
				2070	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2071	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2072	while (size-- > 0) {
				2073	Py_UNICODE ch = *p++;
				2074	if (ch >= 256) {
				2075	if (latin1_encoding_error(&p, &s, errors,
				2076	"ordinal not in range(256)"))
				2077	goto onError;
				2078	}
				2079	else
				2080	*s++ = (char)ch;
				2081	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2082	/* Resize if error handling skipped some characters */
				2083	if (s - start < PyString_GET_SIZE(repr))
				2084	if (_PyString_Resize(&repr, s - start))
				2085	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2086	return repr;
				2087
				2088	onError:
				2089	Py_DECREF(repr);
				2090	return NULL;
				2091	}
				2092
				2093	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2094	{
				2095	if (!PyUnicode_Check(unicode)) {
				2096	PyErr_BadArgument();
				2097	return NULL;
				2098	}
				2099	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2100	PyUnicode_GET_SIZE(unicode),
				2101	NULL);
				2102	}
				2103
				2104	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2105
				2106	static
				2107	int ascii_decoding_error(const char **source,
				2108	Py_UNICODE **dest,
				2109	const char *errors,
				2110	const char *details)
				2111	{
				2112	if ((errors == NULL) \|\|
				2113	(strcmp(errors,"strict") == 0)) {
				2114	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2115	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2116	details);
				2117	return -1;
				2118	}
				2119	else if (strcmp(errors,"ignore") == 0) {
				2120	return 0;
				2121	}
				2122	else if (strcmp(errors,"replace") == 0) {
				2123	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2124	(*dest)++;
				2125	return 0;
				2126	}
				2127	else {
				2128	PyErr_Format(PyExc_ValueError,
				2129	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2130	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2131	errors);
				2132	return -1;
				2133	}
				2134	}
				2135
				2136	PyObject PyUnicode_DecodeASCII(const char s,
				2137	int size,
				2138	const char *errors)
				2139	{
				2140	PyUnicodeObject *v;
				2141	Py_UNICODE *p;
				2142
				2143	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2144	if (size == 1 && (unsigned char)s < 128) {
				2145	Py_UNICODE r = (unsigned char)s;
				2146	return PyUnicode_FromUnicode(&r, 1);
				2147	}
				2148
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2149	v = _PyUnicode_New(size);
				2150	if (v == NULL)
				2151	goto onError;
				2152	if (size == 0)
				2153	return (PyObject *)v;
				2154	p = PyUnicode_AS_UNICODE(v);
				2155	while (size-- > 0) {
				2156	register unsigned char c;
				2157
				2158	c = (unsigned char)*s++;
				2159	if (c < 128)
				2160	*p++ = c;
				2161	else if (ascii_decoding_error(&s, &p, errors,
				2162	"ordinal not in range(128)"))
				2163	goto onError;
				2164	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2165	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2166	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2167	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2168	return (PyObject *)v;
				2169
				2170	onError:
				2171	Py_XDECREF(v);
				2172	return NULL;
				2173	}
				2174
				2175	static
				2176	int ascii_encoding_error(const Py_UNICODE **source,
				2177	char **dest,
				2178	const char *errors,
				2179	const char *details)
				2180	{
				2181	if ((errors == NULL) \|\|
				2182	(strcmp(errors,"strict") == 0)) {
				2183	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2184	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2185	details);
				2186	return -1;
				2187	}
				2188	else if (strcmp(errors,"ignore") == 0) {
				2189	return 0;
				2190	}
				2191	else if (strcmp(errors,"replace") == 0) {
				2192	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2193	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2194	return 0;
				2195	}
				2196	else {
				2197	PyErr_Format(PyExc_ValueError,
				2198	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2199	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2200	errors);
				2201	return -1;
				2202	}
				2203	}
				2204
				2205	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2206	int size,
				2207	const char *errors)
				2208	{
				2209	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2210	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2211
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2212	repr = PyString_FromStringAndSize(NULL, size);
				2213	if (repr == NULL)
				2214	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2215	if (size == 0)
				2216	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2217
				2218	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2219	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2220	while (size-- > 0) {
				2221	Py_UNICODE ch = *p++;
				2222	if (ch >= 128) {
				2223	if (ascii_encoding_error(&p, &s, errors,
				2224	"ordinal not in range(128)"))
				2225	goto onError;
				2226	}
				2227	else
				2228	*s++ = (char)ch;
				2229	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2230	/* Resize if error handling skipped some characters */
				2231	if (s - start < PyString_GET_SIZE(repr))
				2232	if (_PyString_Resize(&repr, s - start))
				2233	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2234	return repr;
				2235
				2236	onError:
				2237	Py_DECREF(repr);
				2238	return NULL;
				2239	}
				2240
				2241	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2242	{
				2243	if (!PyUnicode_Check(unicode)) {
				2244	PyErr_BadArgument();
				2245	return NULL;
				2246	}
				2247	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2248	PyUnicode_GET_SIZE(unicode),
				2249	NULL);
				2250	}
				2251
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2252	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2253
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2254	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2255
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2256	PyObject PyUnicode_DecodeMBCS(const char s,
				2257	int size,
				2258	const char *errors)
				2259	{
				2260	PyUnicodeObject *v;
				2261	Py_UNICODE *p;
				2262
				2263	/* First get the size of the result */
				2264	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2265	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2266	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2267
				2268	v = _PyUnicode_New(usize);
				2269	if (v == NULL)
				2270	return NULL;
				2271	if (usize == 0)
				2272	return (PyObject *)v;
				2273	p = PyUnicode_AS_UNICODE(v);
				2274	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2275	Py_DECREF(v);
				2276	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2277	}
				2278
				2279	return (PyObject *)v;
				2280	}
				2281
				2282	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2283	int size,
				2284	const char *errors)
				2285	{
				2286	PyObject *repr;
				2287	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2288	DWORD mbcssize;
				2289
				2290	/* If there are no characters, bail now! */
				2291	if (size==0)
				2292	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2293
				2294	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2295	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2296	if (mbcssize==0)
				2297	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2298
				2299	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2300	if (repr == NULL)
				2301	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2302	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2303	return repr;
				2304
				2305	/* Do the conversion */
				2306	s = PyString_AS_STRING(repr);
				2307	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2308	Py_DECREF(repr);
				2309	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2310	}
				2311	return repr;
				2312	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2313
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2314	#endif /* MS_WIN32 */
				2315
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2316	/* --- Character Mapping Codec -------------------------------------------- */
				2317
				2318	static
				2319	int charmap_decoding_error(const char **source,
				2320	Py_UNICODE **dest,
				2321	const char *errors,
				2322	const char *details)
				2323	{
				2324	if ((errors == NULL) \|\|
				2325	(strcmp(errors,"strict") == 0)) {
				2326	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2327	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2328	details);
				2329	return -1;
				2330	}
				2331	else if (strcmp(errors,"ignore") == 0) {
				2332	return 0;
				2333	}
				2334	else if (strcmp(errors,"replace") == 0) {
				2335	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2336	(*dest)++;
				2337	return 0;
				2338	}
				2339	else {
				2340	PyErr_Format(PyExc_ValueError,
				2341	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2342	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2343	errors);
				2344	return -1;
				2345	}
				2346	}
				2347
				2348	PyObject PyUnicode_DecodeCharmap(const char s,
				2349	int size,
				2350	PyObject *mapping,
				2351	const char *errors)
				2352	{
				2353	PyUnicodeObject *v;
				2354	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2355	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2356
				2357	/* Default to Latin-1 */
				2358	if (mapping == NULL)
				2359	return PyUnicode_DecodeLatin1(s, size, errors);
				2360
				2361	v = _PyUnicode_New(size);
				2362	if (v == NULL)
				2363	goto onError;
				2364	if (size == 0)
				2365	return (PyObject *)v;
				2366	p = PyUnicode_AS_UNICODE(v);
				2367	while (size-- > 0) {
				2368	unsigned char ch = *s++;
				2369	PyObject w, x;
				2370
				2371	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2372	w = PyInt_FromLong((long)ch);
				2373	if (w == NULL)
				2374	goto onError;
				2375	x = PyObject_GetItem(mapping, w);
				2376	Py_DECREF(w);
				2377	if (x == NULL) {
				2378	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2379	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2380	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2381	x = Py_None;
				2382	Py_INCREF(x);
				2383	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2384	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2385	}
				2386
				2387	/* Apply mapping */
				2388	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2389	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2390	if (value < 0 \|\| value > 65535) {
				2391	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2392	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2393	Py_DECREF(x);
				2394	goto onError;
				2395	}
				2396	*p++ = (Py_UNICODE)value;
				2397	}
				2398	else if (x == Py_None) {
				2399	/* undefined mapping */
				2400	if (charmap_decoding_error(&s, &p, errors,
				2401	"character maps to <undefined>")) {
				2402	Py_DECREF(x);
				2403	goto onError;
				2404	}
				2405	}
				2406	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2407	int targetsize = PyUnicode_GET_SIZE(x);
				2408
				2409	if (targetsize == 1)
				2410	/* 1-1 mapping */
				2411	p++ = PyUnicode_AS_UNICODE(x);
				2412
				2413	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2414	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2415	if (targetsize > extrachars) {
				2416	/* resize first */
				2417	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2418	int needed = (targetsize - extrachars) + \
				2419	(targetsize << 2);
				2420	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2421	if (_PyUnicode_Resize(&v,
				2422	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2423	Py_DECREF(x);
				2424	goto onError;
				2425	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2426	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2427	}
				2428	Py_UNICODE_COPY(p,
				2429	PyUnicode_AS_UNICODE(x),
				2430	targetsize);
				2431	p += targetsize;
				2432	extrachars -= targetsize;
				2433	}
				2434	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2435	}
				2436	else {
				2437	/* wrong return value */
				2438	PyErr_SetString(PyExc_TypeError,
				2439	"character mapping must return integer, None or unicode");
				2440	Py_DECREF(x);
				2441	goto onError;
				2442	}
				2443	Py_DECREF(x);
				2444	}
				2445	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2446	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2447	goto onError;
				2448	return (PyObject *)v;
				2449
				2450	onError:
				2451	Py_XDECREF(v);
				2452	return NULL;
				2453	}
				2454
				2455	static
				2456	int charmap_encoding_error(const Py_UNICODE **source,
				2457	char **dest,
				2458	const char *errors,
				2459	const char *details)
				2460	{
				2461	if ((errors == NULL) \|\|
				2462	(strcmp(errors,"strict") == 0)) {
				2463	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2464	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2465	details);
				2466	return -1;
				2467	}
				2468	else if (strcmp(errors,"ignore") == 0) {
				2469	return 0;
				2470	}
				2471	else if (strcmp(errors,"replace") == 0) {
				2472	**dest = '?';
				2473	(*dest)++;
				2474	return 0;
				2475	}
				2476	else {
				2477	PyErr_Format(PyExc_ValueError,
				2478	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2479	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2480	errors);
				2481	return -1;
				2482	}
				2483	}
				2484
				2485	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2486	int size,
				2487	PyObject *mapping,
				2488	const char *errors)
				2489	{
				2490	PyObject *v;
				2491	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2492	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2493
				2494	/* Default to Latin-1 */
				2495	if (mapping == NULL)
				2496	return PyUnicode_EncodeLatin1(p, size, errors);
				2497
				2498	v = PyString_FromStringAndSize(NULL, size);
				2499	if (v == NULL)
				2500	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2501	if (size == 0)
				2502	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2503	s = PyString_AS_STRING(v);
				2504	while (size-- > 0) {
				2505	Py_UNICODE ch = *p++;
				2506	PyObject w, x;
				2507
				2508	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2509	w = PyInt_FromLong((long)ch);
				2510	if (w == NULL)
				2511	goto onError;
				2512	x = PyObject_GetItem(mapping, w);
				2513	Py_DECREF(w);
				2514	if (x == NULL) {
				2515	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2516	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2517	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2518	x = Py_None;
				2519	Py_INCREF(x);
				2520	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2521	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2522	}
				2523
				2524	/* Apply mapping */
				2525	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2526	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2527	if (value < 0 \|\| value > 255) {
				2528	PyErr_SetString(PyExc_TypeError,
				2529	"character mapping must be in range(256)");
				2530	Py_DECREF(x);
				2531	goto onError;
				2532	}
				2533	*s++ = (char)value;
				2534	}
				2535	else if (x == Py_None) {
				2536	/* undefined mapping */
				2537	if (charmap_encoding_error(&p, &s, errors,
				2538	"character maps to <undefined>")) {
				2539	Py_DECREF(x);
				2540	goto onError;
				2541	}
				2542	}
				2543	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2544	int targetsize = PyString_GET_SIZE(x);
				2545
				2546	if (targetsize == 1)
				2547	/* 1-1 mapping */
				2548	s++ = PyString_AS_STRING(x);
				2549
				2550	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2551	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2552	if (targetsize > extrachars) {
				2553	/* resize first */
				2554	int oldpos = (int)(s - PyString_AS_STRING(v));
				2555	int needed = (targetsize - extrachars) + \
				2556	(targetsize << 2);
				2557	extrachars += needed;
				2558	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2559	Py_DECREF(x);
				2560	goto onError;
				2561	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2562	s = PyString_AS_STRING(v) + oldpos;
				2563	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2564	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2565	s += targetsize;
				2566	extrachars -= targetsize;
				2567	}
				2568	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2569	}
				2570	else {
				2571	/* wrong return value */
				2572	PyErr_SetString(PyExc_TypeError,
				2573	"character mapping must return integer, None or unicode");
				2574	Py_DECREF(x);
				2575	goto onError;
				2576	}
				2577	Py_DECREF(x);
				2578	}
				2579	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2580	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2581	goto onError;
				2582	return v;
				2583
				2584	onError:
				2585	Py_DECREF(v);
				2586	return NULL;
				2587	}
				2588
				2589	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2590	PyObject *mapping)
				2591	{
				2592	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2593	PyErr_BadArgument();
				2594	return NULL;
				2595	}
				2596	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2597	PyUnicode_GET_SIZE(unicode),
				2598	mapping,
				2599	NULL);
				2600	}
				2601
				2602	static
				2603	int translate_error(const Py_UNICODE **source,
				2604	Py_UNICODE **dest,
				2605	const char *errors,
				2606	const char *details)
				2607	{
				2608	if ((errors == NULL) \|\|
				2609	(strcmp(errors,"strict") == 0)) {
				2610	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2611	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2612	details);
				2613	return -1;
				2614	}
				2615	else if (strcmp(errors,"ignore") == 0) {
				2616	return 0;
				2617	}
				2618	else if (strcmp(errors,"replace") == 0) {
				2619	**dest = '?';
				2620	(*dest)++;
				2621	return 0;
				2622	}
				2623	else {
				2624	PyErr_Format(PyExc_ValueError,
				2625	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2626	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2627	errors);
				2628	return -1;
				2629	}
				2630	}
				2631
				2632	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2633	int size,
				2634	PyObject *mapping,
				2635	const char *errors)
				2636	{
				2637	PyUnicodeObject *v;
				2638	Py_UNICODE *p;
				2639
				2640	if (mapping == NULL) {
				2641	PyErr_BadArgument();
				2642	return NULL;
				2643	}
				2644
				2645	/* Output will never be longer than input */
				2646	v = _PyUnicode_New(size);
				2647	if (v == NULL)
				2648	goto onError;
				2649	if (size == 0)
				2650	goto done;
				2651	p = PyUnicode_AS_UNICODE(v);
				2652	while (size-- > 0) {
				2653	Py_UNICODE ch = *s++;
				2654	PyObject w, x;
				2655
				2656	/* Get mapping */
				2657	w = PyInt_FromLong(ch);
				2658	if (w == NULL)
				2659	goto onError;
				2660	x = PyObject_GetItem(mapping, w);
				2661	Py_DECREF(w);
				2662	if (x == NULL) {
				2663	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2664	/* No mapping found: default to 1-1 mapping */
				2665	PyErr_Clear();
				2666	*p++ = ch;
				2667	continue;
				2668	}
				2669	goto onError;
				2670	}
				2671
				2672	/* Apply mapping */
				2673	if (PyInt_Check(x))
				2674	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2675	else if (x == Py_None) {
				2676	/* undefined mapping */
				2677	if (translate_error(&s, &p, errors,
				2678	"character maps to <undefined>")) {
				2679	Py_DECREF(x);
				2680	goto onError;
				2681	}
				2682	}
				2683	else if (PyUnicode_Check(x)) {
				2684	if (PyUnicode_GET_SIZE(x) != 1) {
				2685	/* 1-n mapping */
				2686	PyErr_SetString(PyExc_NotImplementedError,
				2687	"1-n mappings are currently not implemented");
				2688	Py_DECREF(x);
				2689	goto onError;
				2690	}
				2691	p++ = PyUnicode_AS_UNICODE(x);
				2692	}
				2693	else {
				2694	/* wrong return value */
				2695	PyErr_SetString(PyExc_TypeError,
				2696	"translate mapping must return integer, None or unicode");
				2697	Py_DECREF(x);
				2698	goto onError;
				2699	}
				2700	Py_DECREF(x);
				2701	}
				2702	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2703	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2704	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2705
				2706	done:
				2707	return (PyObject *)v;
				2708
				2709	onError:
				2710	Py_XDECREF(v);
				2711	return NULL;
				2712	}
				2713
				2714	PyObject PyUnicode_Translate(PyObject str,
				2715	PyObject *mapping,
				2716	const char *errors)
				2717	{
				2718	PyObject *result;
				2719
				2720	str = PyUnicode_FromObject(str);
				2721	if (str == NULL)
				2722	goto onError;
				2723	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2724	PyUnicode_GET_SIZE(str),
				2725	mapping,
				2726	errors);
				2727	Py_DECREF(str);
				2728	return result;
				2729
				2730	onError:
				2731	Py_XDECREF(str);
				2732	return NULL;
				2733	}
				2734
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2735	/* --- Decimal Encoder ---------------------------------------------------- */
				2736
				2737	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2738	int length,
				2739	char *output,
				2740	const char *errors)
				2741	{
				2742	Py_UNICODE p, end;
				2743
				2744	if (output == NULL) {
				2745	PyErr_BadArgument();
				2746	return -1;
				2747	}
				2748
				2749	p = s;
				2750	end = s + length;
				2751	while (p < end) {
				2752	register Py_UNICODE ch = *p++;
				2753	int decimal;
				2754
				2755	if (Py_UNICODE_ISSPACE(ch)) {
				2756	*output++ = ' ';
				2757	continue;
				2758	}
				2759	decimal = Py_UNICODE_TODECIMAL(ch);
				2760	if (decimal >= 0) {
				2761	*output++ = '0' + decimal;
				2762	continue;
				2763	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2764	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2765	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2766	continue;
				2767	}
				2768	/* All other characters are considered invalid */
				2769	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2770	PyErr_SetString(PyExc_ValueError,
				2771	"invalid decimal Unicode string");
				2772	goto onError;
				2773	}
				2774	else if (strcmp(errors, "ignore") == 0)
				2775	continue;
				2776	else if (strcmp(errors, "replace") == 0) {
				2777	*output++ = '?';
				2778	continue;
				2779	}
				2780	}
				2781	/* 0-terminate the output string */
				2782	*output++ = '\0';
				2783	return 0;
				2784
				2785	onError:
				2786	return -1;
				2787	}
				2788
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2789	/* --- Helpers ------------------------------------------------------------ */
				2790
				2791	static
				2792	int count(PyUnicodeObject *self,
				2793	int start,
				2794	int end,
				2795	PyUnicodeObject *substring)
				2796	{
				2797	int count = 0;
				2798
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2799	if (start < 0)
				2800	start += self->length;
				2801	if (start < 0)
				2802	start = 0;
				2803	if (end > self->length)
				2804	end = self->length;
				2805	if (end < 0)
				2806	end += self->length;
				2807	if (end < 0)
				2808	end = 0;
				2809
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2810	if (substring->length == 0)
				2811	return (end - start + 1);
				2812
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2813	end -= substring->length;
				2814
				2815	while (start <= end)
				2816	if (Py_UNICODE_MATCH(self, start, substring)) {
				2817	count++;
				2818	start += substring->length;
				2819	} else
				2820	start++;
				2821
				2822	return count;
				2823	}
				2824
				2825	int PyUnicode_Count(PyObject *str,
				2826	PyObject *substr,
				2827	int start,
				2828	int end)
				2829	{
				2830	int result;
				2831
				2832	str = PyUnicode_FromObject(str);
				2833	if (str == NULL)
				2834	return -1;
				2835	substr = PyUnicode_FromObject(substr);
				2836	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2837	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2838	return -1;
				2839	}
				2840
				2841	result = count((PyUnicodeObject *)str,
				2842	start, end,
				2843	(PyUnicodeObject *)substr);
				2844
				2845	Py_DECREF(str);
				2846	Py_DECREF(substr);
				2847	return result;
				2848	}
				2849
				2850	static
				2851	int findstring(PyUnicodeObject *self,
				2852	PyUnicodeObject *substring,
				2853	int start,
				2854	int end,
				2855	int direction)
				2856	{
				2857	if (start < 0)
				2858	start += self->length;
				2859	if (start < 0)
				2860	start = 0;
				2861
				2862	if (substring->length == 0)
				2863	return start;
				2864
				2865	if (end > self->length)
				2866	end = self->length;
				2867	if (end < 0)
				2868	end += self->length;
				2869	if (end < 0)
				2870	end = 0;
				2871
				2872	end -= substring->length;
				2873
				2874	if (direction < 0) {
				2875	for (; end >= start; end--)
				2876	if (Py_UNICODE_MATCH(self, end, substring))
				2877	return end;
				2878	} else {
				2879	for (; start <= end; start++)
				2880	if (Py_UNICODE_MATCH(self, start, substring))
				2881	return start;
				2882	}
				2883
				2884	return -1;
				2885	}
				2886
				2887	int PyUnicode_Find(PyObject *str,
				2888	PyObject *substr,
				2889	int start,
				2890	int end,
				2891	int direction)
				2892	{
				2893	int result;
				2894
				2895	str = PyUnicode_FromObject(str);
				2896	if (str == NULL)
				2897	return -1;
				2898	substr = PyUnicode_FromObject(substr);
				2899	if (substr == NULL) {
				2900	Py_DECREF(substr);
				2901	return -1;
				2902	}
				2903
				2904	result = findstring((PyUnicodeObject *)str,
				2905	(PyUnicodeObject *)substr,
				2906	start, end, direction);
				2907	Py_DECREF(str);
				2908	Py_DECREF(substr);
				2909	return result;
				2910	}
				2911
				2912	static
				2913	int tailmatch(PyUnicodeObject *self,
				2914	PyUnicodeObject *substring,
				2915	int start,
				2916	int end,
				2917	int direction)
				2918	{
				2919	if (start < 0)
				2920	start += self->length;
				2921	if (start < 0)
				2922	start = 0;
				2923
				2924	if (substring->length == 0)
				2925	return 1;
				2926
				2927	if (end > self->length)
				2928	end = self->length;
				2929	if (end < 0)
				2930	end += self->length;
				2931	if (end < 0)
				2932	end = 0;
				2933
				2934	end -= substring->length;
				2935	if (end < start)
				2936	return 0;
				2937
				2938	if (direction > 0) {
				2939	if (Py_UNICODE_MATCH(self, end, substring))
				2940	return 1;
				2941	} else {
				2942	if (Py_UNICODE_MATCH(self, start, substring))
				2943	return 1;
				2944	}
				2945
				2946	return 0;
				2947	}
				2948
				2949	int PyUnicode_Tailmatch(PyObject *str,
				2950	PyObject *substr,
				2951	int start,
				2952	int end,
				2953	int direction)
				2954	{
				2955	int result;
				2956
				2957	str = PyUnicode_FromObject(str);
				2958	if (str == NULL)
				2959	return -1;
				2960	substr = PyUnicode_FromObject(substr);
				2961	if (substr == NULL) {
				2962	Py_DECREF(substr);
				2963	return -1;
				2964	}
				2965
				2966	result = tailmatch((PyUnicodeObject *)str,
				2967	(PyUnicodeObject *)substr,
				2968	start, end, direction);
				2969	Py_DECREF(str);
				2970	Py_DECREF(substr);
				2971	return result;
				2972	}
				2973
				2974	static
				2975	const Py_UNICODE findchar(const Py_UNICODE s,
				2976	int size,
				2977	Py_UNICODE ch)
				2978	{
				2979	/* like wcschr, but doesn't stop at NULL characters */
				2980
				2981	while (size-- > 0) {
				2982	if (*s == ch)
				2983	return s;
				2984	s++;
				2985	}
				2986
				2987	return NULL;
				2988	}
				2989
				2990	/* Apply fixfct filter to the Unicode object self and return a
				2991	reference to the modified object */
				2992
				2993	static
				2994	PyObject fixup(PyUnicodeObject self,
				2995	int (fixfct)(PyUnicodeObject s))
				2996	{
				2997
				2998	PyUnicodeObject *u;
				2999
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3000	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3001	if (u == NULL)
				3002	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3003
				3004	Py_UNICODE_COPY(u->str, self->str, self->length);
				3005
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3006	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3007	/* fixfct should return TRUE if it modified the buffer. If
				3008	FALSE, return a reference to the original buffer instead
				3009	(to save space, not time) */
				3010	Py_INCREF(self);
				3011	Py_DECREF(u);
				3012	return (PyObject*) self;
				3013	}
				3014	return (PyObject*) u;
				3015	}
				3016
				3017	static
				3018	int fixupper(PyUnicodeObject *self)
				3019	{
				3020	int len = self->length;
				3021	Py_UNICODE *s = self->str;
				3022	int status = 0;
				3023
				3024	while (len-- > 0) {
				3025	register Py_UNICODE ch;
				3026
				3027	ch = Py_UNICODE_TOUPPER(*s);
				3028	if (ch != *s) {
				3029	status = 1;
				3030	*s = ch;
				3031	}
				3032	s++;
				3033	}
				3034
				3035	return status;
				3036	}
				3037
				3038	static
				3039	int fixlower(PyUnicodeObject *self)
				3040	{
				3041	int len = self->length;
				3042	Py_UNICODE *s = self->str;
				3043	int status = 0;
				3044
				3045	while (len-- > 0) {
				3046	register Py_UNICODE ch;
				3047
				3048	ch = Py_UNICODE_TOLOWER(*s);
				3049	if (ch != *s) {
				3050	status = 1;
				3051	*s = ch;
				3052	}
				3053	s++;
				3054	}
				3055
				3056	return status;
				3057	}
				3058
				3059	static
				3060	int fixswapcase(PyUnicodeObject *self)
				3061	{
				3062	int len = self->length;
				3063	Py_UNICODE *s = self->str;
				3064	int status = 0;
				3065
				3066	while (len-- > 0) {
				3067	if (Py_UNICODE_ISUPPER(*s)) {
				3068	s = Py_UNICODE_TOLOWER(s);
				3069	status = 1;
				3070	} else if (Py_UNICODE_ISLOWER(*s)) {
				3071	s = Py_UNICODE_TOUPPER(s);
				3072	status = 1;
				3073	}
				3074	s++;
				3075	}
				3076
				3077	return status;
				3078	}
				3079
				3080	static
				3081	int fixcapitalize(PyUnicodeObject *self)
				3082	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3083	int len = self->length;
				3084	Py_UNICODE *s = self->str;
				3085	int status = 0;
				3086
				3087	if (len == 0)
				3088	return 0;
				3089	if (Py_UNICODE_ISLOWER(*s)) {
				3090	s = Py_UNICODE_TOUPPER(s);
				3091	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3092	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3093	s++;
				3094	while (--len > 0) {
				3095	if (Py_UNICODE_ISUPPER(*s)) {
				3096	s = Py_UNICODE_TOLOWER(s);
				3097	status = 1;
				3098	}
				3099	s++;
				3100	}
				3101	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3102	}
				3103
				3104	static
				3105	int fixtitle(PyUnicodeObject *self)
				3106	{
				3107	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3108	register Py_UNICODE *e;
				3109	int previous_is_cased;
				3110
				3111	/* Shortcut for single character strings */
				3112	if (PyUnicode_GET_SIZE(self) == 1) {
				3113	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3114	if (*p != ch) {
				3115	*p = ch;
				3116	return 1;
				3117	}
				3118	else
				3119	return 0;
				3120	}
				3121
				3122	e = p + PyUnicode_GET_SIZE(self);
				3123	previous_is_cased = 0;
				3124	for (; p < e; p++) {
				3125	register const Py_UNICODE ch = *p;
				3126
				3127	if (previous_is_cased)
				3128	*p = Py_UNICODE_TOLOWER(ch);
				3129	else
				3130	*p = Py_UNICODE_TOTITLE(ch);
				3131
				3132	if (Py_UNICODE_ISLOWER(ch) \|\|
				3133	Py_UNICODE_ISUPPER(ch) \|\|
				3134	Py_UNICODE_ISTITLE(ch))
				3135	previous_is_cased = 1;
				3136	else
				3137	previous_is_cased = 0;
				3138	}
				3139	return 1;
				3140	}
				3141
				3142	PyObject PyUnicode_Join(PyObject separator,
				3143	PyObject *seq)
				3144	{
				3145	Py_UNICODE *sep;
				3146	int seplen;
				3147	PyUnicodeObject *res = NULL;
				3148	int reslen = 0;
				3149	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3150	int sz = 100;
				3151	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3152	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3153
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3154	it = PyObject_GetIter(seq);
				3155	if (it == NULL)
				3156	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3157
				3158	if (separator == NULL) {
				3159	Py_UNICODE blank = ' ';
				3160	sep = &blank;
				3161	seplen = 1;
				3162	}
				3163	else {
				3164	separator = PyUnicode_FromObject(separator);
				3165	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3166	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3167	sep = PyUnicode_AS_UNICODE(separator);
				3168	seplen = PyUnicode_GET_SIZE(separator);
				3169	}
				3170
				3171	res = _PyUnicode_New(sz);
				3172	if (res == NULL)
				3173	goto onError;
				3174	p = PyUnicode_AS_UNICODE(res);
				3175	reslen = 0;
				3176
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3177	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3178	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3179	PyObject *item = PyIter_Next(it);
				3180	if (item == NULL) {
				3181	if (PyErr_Occurred())
				3182	goto onError;
				3183	break;
				3184	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3185	if (!PyUnicode_Check(item)) {
				3186	PyObject *v;
				3187	v = PyUnicode_FromObject(item);
				3188	Py_DECREF(item);
				3189	item = v;
				3190	if (item == NULL)
				3191	goto onError;
				3192	}
				3193	itemlen = PyUnicode_GET_SIZE(item);
				3194	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3195	if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3196	goto onError;
				3197	sz *= 2;
				3198	p = PyUnicode_AS_UNICODE(res) + reslen;
				3199	}
				3200	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3201	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3202	p += seplen;
				3203	reslen += seplen;
				3204	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3205	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3206	p += itemlen;
				3207	reslen += itemlen;
				3208	Py_DECREF(item);
				3209	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3210	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3211	goto onError;
				3212
				3213	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3214	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3215	return (PyObject *)res;
				3216
				3217	onError:
				3218	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3219	Py_XDECREF(res);
				3220	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3221	return NULL;
				3222	}
				3223
				3224	static
				3225	PyUnicodeObject pad(PyUnicodeObject self,
				3226	int left,
				3227	int right,
				3228	Py_UNICODE fill)
				3229	{
				3230	PyUnicodeObject *u;
				3231
				3232	if (left < 0)
				3233	left = 0;
				3234	if (right < 0)
				3235	right = 0;
				3236
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3237	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3238	Py_INCREF(self);
				3239	return self;
				3240	}
				3241
				3242	u = _PyUnicode_New(left + self->length + right);
				3243	if (u) {
				3244	if (left)
				3245	Py_UNICODE_FILL(u->str, fill, left);
				3246	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3247	if (right)
				3248	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3249	}
				3250
				3251	return u;
				3252	}
				3253
				3254	#define SPLIT_APPEND(data, left, right) \
				3255	str = PyUnicode_FromUnicode(data + left, right - left); \
				3256	if (!str) \
				3257	goto onError; \
				3258	if (PyList_Append(list, str)) { \
				3259	Py_DECREF(str); \
				3260	goto onError; \
				3261	} \
				3262	else \
				3263	Py_DECREF(str);
				3264
				3265	static
				3266	PyObject split_whitespace(PyUnicodeObject self,
				3267	PyObject *list,
				3268	int maxcount)
				3269	{
				3270	register int i;
				3271	register int j;
				3272	int len = self->length;
				3273	PyObject *str;
				3274
				3275	for (i = j = 0; i < len; ) {
				3276	/* find a token */
				3277	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3278	i++;
				3279	j = i;
				3280	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3281	i++;
				3282	if (j < i) {
				3283	if (maxcount-- <= 0)
				3284	break;
				3285	SPLIT_APPEND(self->str, j, i);
				3286	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3287	i++;
				3288	j = i;
				3289	}
				3290	}
				3291	if (j < len) {
				3292	SPLIT_APPEND(self->str, j, len);
				3293	}
				3294	return list;
				3295
				3296	onError:
				3297	Py_DECREF(list);
				3298	return NULL;
				3299	}
				3300
				3301	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3302	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3303	{
				3304	register int i;
				3305	register int j;
				3306	int len;
				3307	PyObject *list;
				3308	PyObject *str;
				3309	Py_UNICODE *data;
				3310
				3311	string = PyUnicode_FromObject(string);
				3312	if (string == NULL)
				3313	return NULL;
				3314	data = PyUnicode_AS_UNICODE(string);
				3315	len = PyUnicode_GET_SIZE(string);
				3316
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3317	list = PyList_New(0);
				3318	if (!list)
				3319	goto onError;
				3320
				3321	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3322	int eol;
				3323
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3324	/* Find a line and append it */
				3325	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3326	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3327
				3328	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3329	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3330	if (i < len) {
				3331	if (data[i] == '\r' && i + 1 < len &&
				3332	data[i+1] == '\n')
				3333	i += 2;
				3334	else
				3335	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3336	if (keepends)
				3337	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3338	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3339	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3340	j = i;
				3341	}
				3342	if (j < len) {
				3343	SPLIT_APPEND(data, j, len);
				3344	}
				3345
				3346	Py_DECREF(string);
				3347	return list;
				3348
				3349	onError:
				3350	Py_DECREF(list);
				3351	Py_DECREF(string);
				3352	return NULL;
				3353	}
				3354
				3355	static
				3356	PyObject split_char(PyUnicodeObject self,
				3357	PyObject *list,
				3358	Py_UNICODE ch,
				3359	int maxcount)
				3360	{
				3361	register int i;
				3362	register int j;
				3363	int len = self->length;
				3364	PyObject *str;
				3365
				3366	for (i = j = 0; i < len; ) {
				3367	if (self->str[i] == ch) {
				3368	if (maxcount-- <= 0)
				3369	break;
				3370	SPLIT_APPEND(self->str, j, i);
				3371	i = j = i + 1;
				3372	} else
				3373	i++;
				3374	}
				3375	if (j <= len) {
				3376	SPLIT_APPEND(self->str, j, len);
				3377	}
				3378	return list;
				3379
				3380	onError:
				3381	Py_DECREF(list);
				3382	return NULL;
				3383	}
				3384
				3385	static
				3386	PyObject split_substring(PyUnicodeObject self,
				3387	PyObject *list,
				3388	PyUnicodeObject *substring,
				3389	int maxcount)
				3390	{
				3391	register int i;
				3392	register int j;
				3393	int len = self->length;
				3394	int sublen = substring->length;
				3395	PyObject *str;
				3396
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3397	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3398	if (Py_UNICODE_MATCH(self, i, substring)) {
				3399	if (maxcount-- <= 0)
				3400	break;
				3401	SPLIT_APPEND(self->str, j, i);
				3402	i = j = i + sublen;
				3403	} else
				3404	i++;
				3405	}
				3406	if (j <= len) {
				3407	SPLIT_APPEND(self->str, j, len);
				3408	}
				3409	return list;
				3410
				3411	onError:
				3412	Py_DECREF(list);
				3413	return NULL;
				3414	}
				3415
				3416	#undef SPLIT_APPEND
				3417
				3418	static
				3419	PyObject split(PyUnicodeObject self,
				3420	PyUnicodeObject *substring,
				3421	int maxcount)
				3422	{
				3423	PyObject *list;
				3424
				3425	if (maxcount < 0)
				3426	maxcount = INT_MAX;
				3427
				3428	list = PyList_New(0);
				3429	if (!list)
				3430	return NULL;
				3431
				3432	if (substring == NULL)
				3433	return split_whitespace(self,list,maxcount);
				3434
				3435	else if (substring->length == 1)
				3436	return split_char(self,list,substring->str[0],maxcount);
				3437
				3438	else if (substring->length == 0) {
				3439	Py_DECREF(list);
				3440	PyErr_SetString(PyExc_ValueError, "empty separator");
				3441	return NULL;
				3442	}
				3443	else
				3444	return split_substring(self,list,substring,maxcount);
				3445	}
				3446
				3447	static
				3448	PyObject strip(PyUnicodeObject self,
				3449	int left,
				3450	int right)
				3451	{
				3452	Py_UNICODE *p = self->str;
				3453	int start = 0;
				3454	int end = self->length;
				3455
				3456	if (left)
				3457	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3458	start++;
				3459
				3460	if (right)
				3461	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3462	end--;
				3463
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3464	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3465	/* couldn't strip anything off, return original string */
				3466	Py_INCREF(self);
				3467	return (PyObject*) self;
				3468	}
				3469
				3470	return (PyObject*) PyUnicode_FromUnicode(
				3471	self->str + start,
				3472	end - start
				3473	);
				3474	}
				3475
				3476	static
				3477	PyObject replace(PyUnicodeObject self,
				3478	PyUnicodeObject *str1,
				3479	PyUnicodeObject *str2,
				3480	int maxcount)
				3481	{
				3482	PyUnicodeObject *u;
				3483
				3484	if (maxcount < 0)
				3485	maxcount = INT_MAX;
				3486
				3487	if (str1->length == 1 && str2->length == 1) {
				3488	int i;
				3489
				3490	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3491	if (!findchar(self->str, self->length, str1->str[0]) &&
				3492	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3493	/* nothing to replace, return original string */
				3494	Py_INCREF(self);
				3495	u = self;
				3496	} else {
				3497	Py_UNICODE u1 = str1->str[0];
				3498	Py_UNICODE u2 = str2->str[0];
				3499
				3500	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3501	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3502	self->length
				3503	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3504	if (u != NULL) {
				3505	Py_UNICODE_COPY(u->str, self->str,
				3506	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3507	for (i = 0; i < u->length; i++)
				3508	if (u->str[i] == u1) {
				3509	if (--maxcount < 0)
				3510	break;
				3511	u->str[i] = u2;
				3512	}
				3513	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3514	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3515
				3516	} else {
				3517	int n, i;
				3518	Py_UNICODE *p;
				3519
				3520	/* replace strings */
				3521	n = count(self, 0, self->length, str1);
				3522	if (n > maxcount)
				3523	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3524	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3525	/* nothing to replace, return original string */
				3526	Py_INCREF(self);
				3527	u = self;
				3528	} else {
				3529	u = _PyUnicode_New(
				3530	self->length + n * (str2->length - str1->length));
				3531	if (u) {
				3532	i = 0;
				3533	p = u->str;
				3534	while (i <= self->length - str1->length)
				3535	if (Py_UNICODE_MATCH(self, i, str1)) {
				3536	/* replace string segment */
				3537	Py_UNICODE_COPY(p, str2->str, str2->length);
				3538	p += str2->length;
				3539	i += str1->length;
				3540	if (--n <= 0) {
				3541	/* copy remaining part */
				3542	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3543	break;
				3544	}
				3545	} else
				3546	*p++ = self->str[i++];
				3547	}
				3548	}
				3549	}
				3550
				3551	return (PyObject *) u;
				3552	}
				3553
				3554	/* --- Unicode Object Methods --------------------------------------------- */
				3555
				3556	static char title__doc__[] =
				3557	"S.title() -> unicode\n\
				3558	\n\
				3559	Return a titlecased version of S, i.e. words start with title case\n\
				3560	characters, all remaining cased characters have lower case.";
				3561
				3562	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3563	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3564	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3565	return fixup(self, fixtitle);
				3566	}
				3567
				3568	static char capitalize__doc__[] =
				3569	"S.capitalize() -> unicode\n\
				3570	\n\
				3571	Return a capitalized version of S, i.e. make the first character\n\
				3572	have upper case.";
				3573
				3574	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3575	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3576	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3577	return fixup(self, fixcapitalize);
				3578	}
				3579
				3580	#if 0
				3581	static char capwords__doc__[] =
				3582	"S.capwords() -> unicode\n\
				3583	\n\
				3584	Apply .capitalize() to all words in S and return the result with\n\
				3585	normalized whitespace (all whitespace strings are replaced by ' ').";
				3586
				3587	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3588	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3589	{
				3590	PyObject *list;
				3591	PyObject *item;
				3592	int i;
				3593
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3594	/* Split into words */
				3595	list = split(self, NULL, -1);
				3596	if (!list)
				3597	return NULL;
				3598
				3599	/* Capitalize each word */
				3600	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3601	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3602	fixcapitalize);
				3603	if (item == NULL)
				3604	goto onError;
				3605	Py_DECREF(PyList_GET_ITEM(list, i));
				3606	PyList_SET_ITEM(list, i, item);
				3607	}
				3608
				3609	/* Join the words to form a new string */
				3610	item = PyUnicode_Join(NULL, list);
				3611
				3612	onError:
				3613	Py_DECREF(list);
				3614	return (PyObject *)item;
				3615	}
				3616	#endif
				3617
				3618	static char center__doc__[] =
				3619	"S.center(width) -> unicode\n\
				3620	\n\
				3621	Return S centered in a Unicode string of length width. Padding is done\n\
				3622	using spaces.";
				3623
				3624	static PyObject *
				3625	unicode_center(PyUnicodeObject self, PyObject args)
				3626	{
				3627	int marg, left;
				3628	int width;
				3629
				3630	if (!PyArg_ParseTuple(args, "i:center", &width))
				3631	return NULL;
				3632
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3633	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3634	Py_INCREF(self);
				3635	return (PyObject*) self;
				3636	}
				3637
				3638	marg = width - self->length;
				3639	left = marg / 2 + (marg & width & 1);
				3640
				3641	return (PyObject*) pad(self, left, marg - left, ' ');
				3642	}
				3643
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3644	#if 0
				3645
				3646	/* This code should go into some future Unicode collation support
				3647	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3648	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3649
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3650	/* speedy UTF-16 code point order comparison */
				3651	/* gleaned from: */
				3652	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3653
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3654	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3655	{
				3656	0, 0, 0, 0, 0, 0, 0, 0,
				3657	0, 0, 0, 0, 0, 0, 0, 0,
				3658	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3659	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3660	};
				3661
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3662	static int
				3663	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3664	{
				3665	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3666
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3667	Py_UNICODE *s1 = str1->str;
				3668	Py_UNICODE *s2 = str2->str;
				3669
				3670	len1 = str1->length;
				3671	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3672
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3673	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3674	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3675
				3676	c1 = *s1++;
				3677	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3678
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3679	if (c1 > (1<<11) * 26)
				3680	c1 += utf16Fixup[c1>>11];
				3681	if (c2 > (1<<11) * 26)
				3682	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3683	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3684
				3685	if (c1 != c2)
				3686	return (c1 < c2) ? -1 : 1;
				3687
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3688	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3689	}
				3690
				3691	return (len1 < len2) ? -1 : (len1 != len2);
				3692	}
				3693
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3694	#else
				3695
				3696	static int
				3697	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3698	{
				3699	register int len1, len2;
				3700
				3701	Py_UNICODE *s1 = str1->str;
				3702	Py_UNICODE *s2 = str2->str;
				3703
				3704	len1 = str1->length;
				3705	len2 = str2->length;
				3706
				3707	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3708	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3709
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3710	c1 = *s1++;
				3711	c2 = *s2++;
				3712
				3713	if (c1 != c2)
				3714	return (c1 < c2) ? -1 : 1;
				3715
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3716	len1--; len2--;
				3717	}
				3718
				3719	return (len1 < len2) ? -1 : (len1 != len2);
				3720	}
				3721
				3722	#endif
				3723
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3724	int PyUnicode_Compare(PyObject *left,
				3725	PyObject *right)
				3726	{
				3727	PyUnicodeObject u = NULL, v = NULL;
				3728	int result;
				3729
				3730	/* Coerce the two arguments */
				3731	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3732	if (u == NULL)
				3733	goto onError;
				3734	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3735	if (v == NULL)
				3736	goto onError;
				3737
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3738	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3739	if (v == u) {
				3740	Py_DECREF(u);
				3741	Py_DECREF(v);
				3742	return 0;
				3743	}
				3744
				3745	result = unicode_compare(u, v);
				3746
				3747	Py_DECREF(u);
				3748	Py_DECREF(v);
				3749	return result;
				3750
				3751	onError:
				3752	Py_XDECREF(u);
				3753	Py_XDECREF(v);
				3754	return -1;
				3755	}
				3756
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3757	int PyUnicode_Contains(PyObject *container,
				3758	PyObject *element)
				3759	{
				3760	PyUnicodeObject u = NULL, v = NULL;
				3761	int result;
				3762	register const Py_UNICODE p, e;
				3763	register Py_UNICODE ch;
				3764
				3765	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3766	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3767	if (v == NULL) {
				3768	PyErr_SetString(PyExc_TypeError,
				3769	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3770	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3771	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3772	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3773	if (u == NULL) {
				3774	Py_DECREF(v);
				3775	goto onError;
				3776	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3777
				3778	/* Check v in u */
				3779	if (PyUnicode_GET_SIZE(v) != 1) {
				3780	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3781	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3782	goto onError;
				3783	}
				3784	ch = *PyUnicode_AS_UNICODE(v);
				3785	p = PyUnicode_AS_UNICODE(u);
				3786	e = p + PyUnicode_GET_SIZE(u);
				3787	result = 0;
				3788	while (p < e) {
				3789	if (*p++ == ch) {
				3790	result = 1;
				3791	break;
				3792	}
				3793	}
				3794
				3795	Py_DECREF(u);
				3796	Py_DECREF(v);
				3797	return result;
				3798
				3799	onError:
				3800	Py_XDECREF(u);
				3801	Py_XDECREF(v);
				3802	return -1;
				3803	}
				3804
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3805	/* Concat to string or Unicode object giving a new Unicode object. */
				3806
				3807	PyObject PyUnicode_Concat(PyObject left,
				3808	PyObject *right)
				3809	{
				3810	PyUnicodeObject u = NULL, v = NULL, *w;
				3811
				3812	/* Coerce the two arguments */
				3813	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3814	if (u == NULL)
				3815	goto onError;
				3816	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3817	if (v == NULL)
				3818	goto onError;
				3819
				3820	/* Shortcuts */
				3821	if (v == unicode_empty) {
				3822	Py_DECREF(v);
				3823	return (PyObject *)u;
				3824	}
				3825	if (u == unicode_empty) {
				3826	Py_DECREF(u);
				3827	return (PyObject *)v;
				3828	}
				3829
				3830	/* Concat the two Unicode strings */
				3831	w = _PyUnicode_New(u->length + v->length);
				3832	if (w == NULL)
				3833	goto onError;
				3834	Py_UNICODE_COPY(w->str, u->str, u->length);
				3835	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3836
				3837	Py_DECREF(u);
				3838	Py_DECREF(v);
				3839	return (PyObject *)w;
				3840
				3841	onError:
				3842	Py_XDECREF(u);
				3843	Py_XDECREF(v);
				3844	return NULL;
				3845	}
				3846
				3847	static char count__doc__[] =
				3848	"S.count(sub[, start[, end]]) -> int\n\
				3849	\n\
				3850	Return the number of occurrences of substring sub in Unicode string\n\
				3851	S[start:end]. Optional arguments start and end are\n\
				3852	interpreted as in slice notation.";
				3853
				3854	static PyObject *
				3855	unicode_count(PyUnicodeObject self, PyObject args)
				3856	{
				3857	PyUnicodeObject *substring;
				3858	int start = 0;
				3859	int end = INT_MAX;
				3860	PyObject *result;
				3861
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3862	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3863	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3864	return NULL;
				3865
				3866	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3867	(PyObject *)substring);
				3868	if (substring == NULL)
				3869	return NULL;
				3870
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3871	if (start < 0)
				3872	start += self->length;
				3873	if (start < 0)
				3874	start = 0;
				3875	if (end > self->length)
				3876	end = self->length;
				3877	if (end < 0)
				3878	end += self->length;
				3879	if (end < 0)
				3880	end = 0;
				3881
				3882	result = PyInt_FromLong((long) count(self, start, end, substring));
				3883
				3884	Py_DECREF(substring);
				3885	return result;
				3886	}
				3887
				3888	static char encode__doc__[] =
				3889	"S.encode([encoding[,errors]]) -> string\n\
				3890	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3891	Return an encoded string version of S. Default encoding is the current\n\
				3892	default string encoding. errors may be given to set a different error\n\
				3893	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3894	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3895
				3896	static PyObject *
				3897	unicode_encode(PyUnicodeObject self, PyObject args)
				3898	{
				3899	char *encoding = NULL;
				3900	char *errors = NULL;
				3901	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3902	return NULL;
				3903	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3904	}
				3905
				3906	static char expandtabs__doc__[] =
				3907	"S.expandtabs([tabsize]) -> unicode\n\
				3908	\n\
				3909	Return a copy of S where all tab characters are expanded using spaces.\n\
				3910	If tabsize is not given, a tab size of 8 characters is assumed.";
				3911
				3912	static PyObject*
				3913	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3914	{
				3915	Py_UNICODE *e;
				3916	Py_UNICODE *p;
				3917	Py_UNICODE *q;
				3918	int i, j;
				3919	PyUnicodeObject *u;
				3920	int tabsize = 8;
				3921
				3922	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3923	return NULL;
				3924
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3925	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3926	i = j = 0;
				3927	e = self->str + self->length;
				3928	for (p = self->str; p < e; p++)
				3929	if (*p == '\t') {
				3930	if (tabsize > 0)
				3931	j += tabsize - (j % tabsize);
				3932	}
				3933	else {
				3934	j++;
				3935	if (p == '\n' \|\| p == '\r') {
				3936	i += j;
				3937	j = 0;
				3938	}
				3939	}
				3940
				3941	/* Second pass: create output string and fill it */
				3942	u = _PyUnicode_New(i + j);
				3943	if (!u)
				3944	return NULL;
				3945
				3946	j = 0;
				3947	q = u->str;
				3948
				3949	for (p = self->str; p < e; p++)
				3950	if (*p == '\t') {
				3951	if (tabsize > 0) {
				3952	i = tabsize - (j % tabsize);
				3953	j += i;
				3954	while (i--)
				3955	*q++ = ' ';
				3956	}
				3957	}
				3958	else {
				3959	j++;
				3960	q++ = p;
				3961	if (p == '\n' \|\| p == '\r')
				3962	j = 0;
				3963	}
				3964
				3965	return (PyObject*) u;
				3966	}
				3967
				3968	static char find__doc__[] =
				3969	"S.find(sub [,start [,end]]) -> int\n\
				3970	\n\
				3971	Return the lowest index in S where substring sub is found,\n\
				3972	such that sub is contained within s[start,end]. Optional\n\
				3973	arguments start and end are interpreted as in slice notation.\n\
				3974	\n\
				3975	Return -1 on failure.";
				3976
				3977	static PyObject *
				3978	unicode_find(PyUnicodeObject self, PyObject args)
				3979	{
				3980	PyUnicodeObject *substring;
				3981	int start = 0;
				3982	int end = INT_MAX;
				3983	PyObject *result;
				3984
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3985	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3986	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3987	return NULL;
				3988	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3989	(PyObject *)substring);
				3990	if (substring == NULL)
				3991	return NULL;
				3992
				3993	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3994
				3995	Py_DECREF(substring);
				3996	return result;
				3997	}
				3998
				3999	static PyObject *
				4000	unicode_getitem(PyUnicodeObject *self, int index)
				4001	{
				4002	if (index < 0 \|\| index >= self->length) {
				4003	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4004	return NULL;
				4005	}
				4006
				4007	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4008	}
				4009
				4010	static long
				4011	unicode_hash(PyUnicodeObject *self)
				4012	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4013	/* Since Unicode objects compare equal to their ASCII string
				4014	counterparts, they should use the individual character values
				4015	as basis for their hash value. This is needed to assure that
				4016	strings and Unicode objects behave in the same way as
				4017	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4018
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4019	register int len;
				4020	register Py_UNICODE *p;
				4021	register long x;
				4022
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4023	if (self->hash != -1)
				4024	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4025	len = PyUnicode_GET_SIZE(self);
				4026	p = PyUnicode_AS_UNICODE(self);
				4027	x = *p << 7;
				4028	while (--len >= 0)
				4029	x = (1000003x) ^ p++;
				4030	x ^= PyUnicode_GET_SIZE(self);
				4031	if (x == -1)
				4032	x = -2;
				4033	self->hash = x;
				4034	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4035	}
				4036
				4037	static char index__doc__[] =
				4038	"S.index(sub [,start [,end]]) -> int\n\
				4039	\n\
				4040	Like S.find() but raise ValueError when the substring is not found.";
				4041
				4042	static PyObject *
				4043	unicode_index(PyUnicodeObject self, PyObject args)
				4044	{
				4045	int result;
				4046	PyUnicodeObject *substring;
				4047	int start = 0;
				4048	int end = INT_MAX;
				4049
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4050	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				4051	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4052	return NULL;
				4053
				4054	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4055	(PyObject *)substring);
				4056	if (substring == NULL)
				4057	return NULL;
				4058
				4059	result = findstring(self, substring, start, end, 1);
				4060
				4061	Py_DECREF(substring);
				4062	if (result < 0) {
				4063	PyErr_SetString(PyExc_ValueError, "substring not found");
				4064	return NULL;
				4065	}
				4066	return PyInt_FromLong(result);
				4067	}
				4068
				4069	static char islower__doc__[] =
				4070	"S.islower() -> int\n\
				4071	\n\
				4072	Return 1 if all cased characters in S are lowercase and there is\n\
				4073	at least one cased character in S, 0 otherwise.";
				4074
				4075	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4076	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4077	{
				4078	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4079	register const Py_UNICODE *e;
				4080	int cased;
				4081
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4082	/* Shortcut for single character strings */
				4083	if (PyUnicode_GET_SIZE(self) == 1)
				4084	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				4085
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4086	/* Special case for empty strings */
				4087	if (PyString_GET_SIZE(self) == 0)
				4088	return PyInt_FromLong(0);
				4089
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4090	e = p + PyUnicode_GET_SIZE(self);
				4091	cased = 0;
				4092	for (; p < e; p++) {
				4093	register const Py_UNICODE ch = *p;
				4094
				4095	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4096	return PyInt_FromLong(0);
				4097	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4098	cased = 1;
				4099	}
				4100	return PyInt_FromLong(cased);
				4101	}
				4102
				4103	static char isupper__doc__[] =
				4104	"S.isupper() -> int\n\
				4105	\n\
				4106	Return 1 if all cased characters in S are uppercase and there is\n\
				4107	at least one cased character in S, 0 otherwise.";
				4108
				4109	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4110	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4111	{
				4112	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4113	register const Py_UNICODE *e;
				4114	int cased;
				4115
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4116	/* Shortcut for single character strings */
				4117	if (PyUnicode_GET_SIZE(self) == 1)
				4118	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				4119
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4120	/* Special case for empty strings */
				4121	if (PyString_GET_SIZE(self) == 0)
				4122	return PyInt_FromLong(0);
				4123
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4124	e = p + PyUnicode_GET_SIZE(self);
				4125	cased = 0;
				4126	for (; p < e; p++) {
				4127	register const Py_UNICODE ch = *p;
				4128
				4129	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4130	return PyInt_FromLong(0);
				4131	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4132	cased = 1;
				4133	}
				4134	return PyInt_FromLong(cased);
				4135	}
				4136
				4137	static char istitle__doc__[] =
				4138	"S.istitle() -> int\n\
				4139	\n\
				4140	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				4141	may only follow uncased characters and lowercase characters only cased\n\
				4142	ones. Return 0 otherwise.";
				4143
				4144	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4145	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4146	{
				4147	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4148	register const Py_UNICODE *e;
				4149	int cased, previous_is_cased;
				4150
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4151	/* Shortcut for single character strings */
				4152	if (PyUnicode_GET_SIZE(self) == 1)
				4153	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4154	(Py_UNICODE_ISUPPER(*p) != 0));
				4155
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4156	/* Special case for empty strings */
				4157	if (PyString_GET_SIZE(self) == 0)
				4158	return PyInt_FromLong(0);
				4159
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4160	e = p + PyUnicode_GET_SIZE(self);
				4161	cased = 0;
				4162	previous_is_cased = 0;
				4163	for (; p < e; p++) {
				4164	register const Py_UNICODE ch = *p;
				4165
				4166	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4167	if (previous_is_cased)
				4168	return PyInt_FromLong(0);
				4169	previous_is_cased = 1;
				4170	cased = 1;
				4171	}
				4172	else if (Py_UNICODE_ISLOWER(ch)) {
				4173	if (!previous_is_cased)
				4174	return PyInt_FromLong(0);
				4175	previous_is_cased = 1;
				4176	cased = 1;
				4177	}
				4178	else
				4179	previous_is_cased = 0;
				4180	}
				4181	return PyInt_FromLong(cased);
				4182	}
				4183
				4184	static char isspace__doc__[] =
				4185	"S.isspace() -> int\n\
				4186	\n\
				4187	Return 1 if there are only whitespace characters in S,\n\
				4188	0 otherwise.";
				4189
				4190	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4191	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4192	{
				4193	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4194	register const Py_UNICODE *e;
				4195
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4196	/* Shortcut for single character strings */
				4197	if (PyUnicode_GET_SIZE(self) == 1 &&
				4198	Py_UNICODE_ISSPACE(*p))
				4199	return PyInt_FromLong(1);
				4200
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4201	/* Special case for empty strings */
				4202	if (PyString_GET_SIZE(self) == 0)
				4203	return PyInt_FromLong(0);
				4204
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4205	e = p + PyUnicode_GET_SIZE(self);
				4206	for (; p < e; p++) {
				4207	if (!Py_UNICODE_ISSPACE(*p))
				4208	return PyInt_FromLong(0);
				4209	}
				4210	return PyInt_FromLong(1);
				4211	}
				4212
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4213	static char isalpha__doc__[] =
				4214	"S.isalpha() -> int\n\
				4215	\n\
				4216	Return 1 if all characters in S are alphabetic\n\
				4217	and there is at least one character in S, 0 otherwise.";
				4218
				4219	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4220	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4221	{
				4222	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4223	register const Py_UNICODE *e;
				4224
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4225	/* Shortcut for single character strings */
				4226	if (PyUnicode_GET_SIZE(self) == 1 &&
				4227	Py_UNICODE_ISALPHA(*p))
				4228	return PyInt_FromLong(1);
				4229
				4230	/* Special case for empty strings */
				4231	if (PyString_GET_SIZE(self) == 0)
				4232	return PyInt_FromLong(0);
				4233
				4234	e = p + PyUnicode_GET_SIZE(self);
				4235	for (; p < e; p++) {
				4236	if (!Py_UNICODE_ISALPHA(*p))
				4237	return PyInt_FromLong(0);
				4238	}
				4239	return PyInt_FromLong(1);
				4240	}
				4241
				4242	static char isalnum__doc__[] =
				4243	"S.isalnum() -> int\n\
				4244	\n\
				4245	Return 1 if all characters in S are alphanumeric\n\
				4246	and there is at least one character in S, 0 otherwise.";
				4247
				4248	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4249	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4250	{
				4251	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4252	register const Py_UNICODE *e;
				4253
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4254	/* Shortcut for single character strings */
				4255	if (PyUnicode_GET_SIZE(self) == 1 &&
				4256	Py_UNICODE_ISALNUM(*p))
				4257	return PyInt_FromLong(1);
				4258
				4259	/* Special case for empty strings */
				4260	if (PyString_GET_SIZE(self) == 0)
				4261	return PyInt_FromLong(0);
				4262
				4263	e = p + PyUnicode_GET_SIZE(self);
				4264	for (; p < e; p++) {
				4265	if (!Py_UNICODE_ISALNUM(*p))
				4266	return PyInt_FromLong(0);
				4267	}
				4268	return PyInt_FromLong(1);
				4269	}
				4270
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4271	static char isdecimal__doc__[] =
				4272	"S.isdecimal() -> int\n\
				4273	\n\
				4274	Return 1 if there are only decimal characters in S,\n\
				4275	0 otherwise.";
				4276
				4277	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4278	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4279	{
				4280	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4281	register const Py_UNICODE *e;
				4282
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4283	/* Shortcut for single character strings */
				4284	if (PyUnicode_GET_SIZE(self) == 1 &&
				4285	Py_UNICODE_ISDECIMAL(*p))
				4286	return PyInt_FromLong(1);
				4287
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4288	/* Special case for empty strings */
				4289	if (PyString_GET_SIZE(self) == 0)
				4290	return PyInt_FromLong(0);
				4291
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4292	e = p + PyUnicode_GET_SIZE(self);
				4293	for (; p < e; p++) {
				4294	if (!Py_UNICODE_ISDECIMAL(*p))
				4295	return PyInt_FromLong(0);
				4296	}
				4297	return PyInt_FromLong(1);
				4298	}
				4299
				4300	static char isdigit__doc__[] =
				4301	"S.isdigit() -> int\n\
				4302	\n\
				4303	Return 1 if there are only digit characters in S,\n\
				4304	0 otherwise.";
				4305
				4306	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4307	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4308	{
				4309	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4310	register const Py_UNICODE *e;
				4311
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4312	/* Shortcut for single character strings */
				4313	if (PyUnicode_GET_SIZE(self) == 1 &&
				4314	Py_UNICODE_ISDIGIT(*p))
				4315	return PyInt_FromLong(1);
				4316
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4317	/* Special case for empty strings */
				4318	if (PyString_GET_SIZE(self) == 0)
				4319	return PyInt_FromLong(0);
				4320
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4321	e = p + PyUnicode_GET_SIZE(self);
				4322	for (; p < e; p++) {
				4323	if (!Py_UNICODE_ISDIGIT(*p))
				4324	return PyInt_FromLong(0);
				4325	}
				4326	return PyInt_FromLong(1);
				4327	}
				4328
				4329	static char isnumeric__doc__[] =
				4330	"S.isnumeric() -> int\n\
				4331	\n\
				4332	Return 1 if there are only numeric characters in S,\n\
				4333	0 otherwise.";
				4334
				4335	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4336	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4337	{
				4338	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4339	register const Py_UNICODE *e;
				4340
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4341	/* Shortcut for single character strings */
				4342	if (PyUnicode_GET_SIZE(self) == 1 &&
				4343	Py_UNICODE_ISNUMERIC(*p))
				4344	return PyInt_FromLong(1);
				4345
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4346	/* Special case for empty strings */
				4347	if (PyString_GET_SIZE(self) == 0)
				4348	return PyInt_FromLong(0);
				4349
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4350	e = p + PyUnicode_GET_SIZE(self);
				4351	for (; p < e; p++) {
				4352	if (!Py_UNICODE_ISNUMERIC(*p))
				4353	return PyInt_FromLong(0);
				4354	}
				4355	return PyInt_FromLong(1);
				4356	}
				4357
				4358	static char join__doc__[] =
				4359	"S.join(sequence) -> unicode\n\
				4360	\n\
				4361	Return a string which is the concatenation of the strings in the\n\
				4362	sequence. The separator between elements is S.";
				4363
				4364	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4365	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4366	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4367	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4368	}
				4369
				4370	static int
				4371	unicode_length(PyUnicodeObject *self)
				4372	{
				4373	return self->length;
				4374	}
				4375
				4376	static char ljust__doc__[] =
				4377	"S.ljust(width) -> unicode\n\
				4378	\n\
				4379	Return S left justified in a Unicode string of length width. Padding is\n\
				4380	done using spaces.";
				4381
				4382	static PyObject *
				4383	unicode_ljust(PyUnicodeObject self, PyObject args)
				4384	{
				4385	int width;
				4386	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4387	return NULL;
				4388
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4389	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4390	Py_INCREF(self);
				4391	return (PyObject*) self;
				4392	}
				4393
				4394	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4395	}
				4396
				4397	static char lower__doc__[] =
				4398	"S.lower() -> unicode\n\
				4399	\n\
				4400	Return a copy of the string S converted to lowercase.";
				4401
				4402	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4403	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4404	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4405	return fixup(self, fixlower);
				4406	}
				4407
				4408	static char lstrip__doc__[] =
				4409	"S.lstrip() -> unicode\n\
				4410	\n\
				4411	Return a copy of the string S with leading whitespace removed.";
				4412
				4413	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4414	unicode_lstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4415	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4416	return strip(self, 1, 0);
				4417	}
				4418
				4419	static PyObject*
				4420	unicode_repeat(PyUnicodeObject *str, int len)
				4421	{
				4422	PyUnicodeObject *u;
				4423	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4424	int nchars;
				4425	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4426
				4427	if (len < 0)
				4428	len = 0;
				4429
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4430	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4431	/* no repeat, return original string */
				4432	Py_INCREF(str);
				4433	return (PyObject*) str;
				4434	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4435
				4436	/* ensure # of chars needed doesn't overflow int and # of bytes
				4437	* needed doesn't overflow size_t
				4438	*/
				4439	nchars = len * str->length;
				4440	if (len && nchars / len != str->length) {
				4441	PyErr_SetString(PyExc_OverflowError,
				4442	"repeated string is too long");
				4443	return NULL;
				4444	}
				4445	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4446	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4447	PyErr_SetString(PyExc_OverflowError,
				4448	"repeated string is too long");
				4449	return NULL;
				4450	}
				4451	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4452	if (!u)
				4453	return NULL;
				4454
				4455	p = u->str;
				4456
				4457	while (len-- > 0) {
				4458	Py_UNICODE_COPY(p, str->str, str->length);
				4459	p += str->length;
				4460	}
				4461
				4462	return (PyObject*) u;
				4463	}
				4464
				4465	PyObject PyUnicode_Replace(PyObject obj,
				4466	PyObject *subobj,
				4467	PyObject *replobj,
				4468	int maxcount)
				4469	{
				4470	PyObject *self;
				4471	PyObject *str1;
				4472	PyObject *str2;
				4473	PyObject *result;
				4474
				4475	self = PyUnicode_FromObject(obj);
				4476	if (self == NULL)
				4477	return NULL;
				4478	str1 = PyUnicode_FromObject(subobj);
				4479	if (str1 == NULL) {
				4480	Py_DECREF(self);
				4481	return NULL;
				4482	}
				4483	str2 = PyUnicode_FromObject(replobj);
				4484	if (str2 == NULL) {
				4485	Py_DECREF(self);
				4486	Py_DECREF(str1);
				4487	return NULL;
				4488	}
				4489	result = replace((PyUnicodeObject *)self,
				4490	(PyUnicodeObject *)str1,
				4491	(PyUnicodeObject *)str2,
				4492	maxcount);
				4493	Py_DECREF(self);
				4494	Py_DECREF(str1);
				4495	Py_DECREF(str2);
				4496	return result;
				4497	}
				4498
				4499	static char replace__doc__[] =
				4500	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4501	\n\
				4502	Return a copy of S with all occurrences of substring\n\
				4503	old replaced by new. If the optional argument maxsplit is\n\
				4504	given, only the first maxsplit occurrences are replaced.";
				4505
				4506	static PyObject*
				4507	unicode_replace(PyUnicodeObject self, PyObject args)
				4508	{
				4509	PyUnicodeObject *str1;
				4510	PyUnicodeObject *str2;
				4511	int maxcount = -1;
				4512	PyObject *result;
				4513
				4514	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4515	return NULL;
				4516	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4517	if (str1 == NULL)
				4518	return NULL;
				4519	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4520	if (str2 == NULL)
				4521	return NULL;
				4522
				4523	result = replace(self, str1, str2, maxcount);
				4524
				4525	Py_DECREF(str1);
				4526	Py_DECREF(str2);
				4527	return result;
				4528	}
				4529
				4530	static
				4531	PyObject unicode_repr(PyObject unicode)
				4532	{
				4533	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4534	PyUnicode_GET_SIZE(unicode),
				4535	1);
				4536	}
				4537
				4538	static char rfind__doc__[] =
				4539	"S.rfind(sub [,start [,end]]) -> int\n\
				4540	\n\
				4541	Return the highest index in S where substring sub is found,\n\
				4542	such that sub is contained within s[start,end]. Optional\n\
				4543	arguments start and end are interpreted as in slice notation.\n\
				4544	\n\
				4545	Return -1 on failure.";
				4546
				4547	static PyObject *
				4548	unicode_rfind(PyUnicodeObject self, PyObject args)
				4549	{
				4550	PyUnicodeObject *substring;
				4551	int start = 0;
				4552	int end = INT_MAX;
				4553	PyObject *result;
				4554
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4555	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4556	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4557	return NULL;
				4558	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4559	(PyObject *)substring);
				4560	if (substring == NULL)
				4561	return NULL;
				4562
				4563	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4564
				4565	Py_DECREF(substring);
				4566	return result;
				4567	}
				4568
				4569	static char rindex__doc__[] =
				4570	"S.rindex(sub [,start [,end]]) -> int\n\
				4571	\n\
				4572	Like S.rfind() but raise ValueError when the substring is not found.";
				4573
				4574	static PyObject *
				4575	unicode_rindex(PyUnicodeObject self, PyObject args)
				4576	{
				4577	int result;
				4578	PyUnicodeObject *substring;
				4579	int start = 0;
				4580	int end = INT_MAX;
				4581
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4582	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4583	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4584	return NULL;
				4585	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4586	(PyObject *)substring);
				4587	if (substring == NULL)
				4588	return NULL;
				4589
				4590	result = findstring(self, substring, start, end, -1);
				4591
				4592	Py_DECREF(substring);
				4593	if (result < 0) {
				4594	PyErr_SetString(PyExc_ValueError, "substring not found");
				4595	return NULL;
				4596	}
				4597	return PyInt_FromLong(result);
				4598	}
				4599
				4600	static char rjust__doc__[] =
				4601	"S.rjust(width) -> unicode\n\
				4602	\n\
				4603	Return S right justified in a Unicode string of length width. Padding is\n\
				4604	done using spaces.";
				4605
				4606	static PyObject *
				4607	unicode_rjust(PyUnicodeObject self, PyObject args)
				4608	{
				4609	int width;
				4610	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4611	return NULL;
				4612
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4613	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4614	Py_INCREF(self);
				4615	return (PyObject*) self;
				4616	}
				4617
				4618	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4619	}
				4620
				4621	static char rstrip__doc__[] =
				4622	"S.rstrip() -> unicode\n\
				4623	\n\
				4624	Return a copy of the string S with trailing whitespace removed.";
				4625
				4626	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4627	unicode_rstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4628	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4629	return strip(self, 0, 1);
				4630	}
				4631
				4632	static PyObject*
				4633	unicode_slice(PyUnicodeObject *self, int start, int end)
				4634	{
				4635	/* standard clamping */
				4636	if (start < 0)
				4637	start = 0;
				4638	if (end < 0)
				4639	end = 0;
				4640	if (end > self->length)
				4641	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4642	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4643	/* full slice, return original string */
				4644	Py_INCREF(self);
				4645	return (PyObject*) self;
				4646	}
				4647	if (start > end)
				4648	start = end;
				4649	/* copy slice */
				4650	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4651	end - start);
				4652	}
				4653
				4654	PyObject PyUnicode_Split(PyObject s,
				4655	PyObject *sep,
				4656	int maxsplit)
				4657	{
				4658	PyObject *result;
				4659
				4660	s = PyUnicode_FromObject(s);
				4661	if (s == NULL)
				4662	return NULL;
				4663	if (sep != NULL) {
				4664	sep = PyUnicode_FromObject(sep);
				4665	if (sep == NULL) {
				4666	Py_DECREF(s);
				4667	return NULL;
				4668	}
				4669	}
				4670
				4671	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4672
				4673	Py_DECREF(s);
				4674	Py_XDECREF(sep);
				4675	return result;
				4676	}
				4677
				4678	static char split__doc__[] =
				4679	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4680	\n\
				4681	Return a list of the words in S, using sep as the\n\
				4682	delimiter string. If maxsplit is given, at most maxsplit\n\
				4683	splits are done. If sep is not specified, any whitespace string\n\
				4684	is a separator.";
				4685
				4686	static PyObject*
				4687	unicode_split(PyUnicodeObject self, PyObject args)
				4688	{
				4689	PyObject *substring = Py_None;
				4690	int maxcount = -1;
				4691
				4692	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4693	return NULL;
				4694
				4695	if (substring == Py_None)
				4696	return split(self, NULL, maxcount);
				4697	else if (PyUnicode_Check(substring))
				4698	return split(self, (PyUnicodeObject *)substring, maxcount);
				4699	else
				4700	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4701	}
				4702
				4703	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4704	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4705	\n\
				4706	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4707	Line breaks are not included in the resulting list unless keepends\n\
				4708	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4709
				4710	static PyObject*
				4711	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4712	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4713	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4714
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4715	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4716	return NULL;
				4717
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4718	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4719	}
				4720
				4721	static
				4722	PyObject unicode_str(PyUnicodeObject self)
				4723	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4724	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4725	}
				4726
				4727	static char strip__doc__[] =
				4728	"S.strip() -> unicode\n\
				4729	\n\
				4730	Return a copy of S with leading and trailing whitespace removed.";
				4731
				4732	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4733	unicode_strip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4734	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4735	return strip(self, 1, 1);
				4736	}
				4737
				4738	static char swapcase__doc__[] =
				4739	"S.swapcase() -> unicode\n\
				4740	\n\
				4741	Return a copy of S with uppercase characters converted to lowercase\n\
				4742	and vice versa.";
				4743
				4744	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4745	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4746	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4747	return fixup(self, fixswapcase);
				4748	}
				4749
				4750	static char translate__doc__[] =
				4751	"S.translate(table) -> unicode\n\
				4752	\n\
				4753	Return a copy of the string S, where all characters have been mapped\n\
				4754	through the given translation table, which must be a mapping of\n\
				4755	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4756	are left untouched. Characters mapped to None are deleted.";
				4757
				4758	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4759	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4760	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4761	return PyUnicode_TranslateCharmap(self->str,
				4762	self->length,
				4763	table,
				4764	"ignore");
				4765	}
				4766
				4767	static char upper__doc__[] =
				4768	"S.upper() -> unicode\n\
				4769	\n\
				4770	Return a copy of S converted to uppercase.";
				4771
				4772	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4773	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4774	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4775	return fixup(self, fixupper);
				4776	}
				4777
				4778	#if 0
				4779	static char zfill__doc__[] =
				4780	"S.zfill(width) -> unicode\n\
				4781	\n\
				4782	Pad a numeric string x with zeros on the left, to fill a field\n\
				4783	of the specified width. The string x is never truncated.";
				4784
				4785	static PyObject *
				4786	unicode_zfill(PyUnicodeObject self, PyObject args)
				4787	{
				4788	int fill;
				4789	PyUnicodeObject *u;
				4790
				4791	int width;
				4792	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4793	return NULL;
				4794
				4795	if (self->length >= width) {
				4796	Py_INCREF(self);
				4797	return (PyObject*) self;
				4798	}
				4799
				4800	fill = width - self->length;
				4801
				4802	u = pad(self, fill, 0, '0');
				4803
				4804	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4805	/* move sign to beginning of string */
				4806	u->str[0] = u->str[fill];
				4807	u->str[fill] = '0';
				4808	}
				4809
				4810	return (PyObject*) u;
				4811	}
				4812	#endif
				4813
				4814	#if 0
				4815	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4816	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4817	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4818	return PyInt_FromLong(unicode_freelist_size);
				4819	}
				4820	#endif
				4821
				4822	static char startswith__doc__[] =
				4823	"S.startswith(prefix[, start[, end]]) -> int\n\
				4824	\n\
				4825	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4826	optional start, test S beginning at that position. With optional end, stop\n\
				4827	comparing S at that position.";
				4828
				4829	static PyObject *
				4830	unicode_startswith(PyUnicodeObject *self,
				4831	PyObject *args)
				4832	{
				4833	PyUnicodeObject *substring;
				4834	int start = 0;
				4835	int end = INT_MAX;
				4836	PyObject *result;
				4837
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4838	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4839	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4840	return NULL;
				4841	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4842	(PyObject *)substring);
				4843	if (substring == NULL)
				4844	return NULL;
				4845
				4846	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4847
				4848	Py_DECREF(substring);
				4849	return result;
				4850	}
				4851
				4852
				4853	static char endswith__doc__[] =
				4854	"S.endswith(suffix[, start[, end]]) -> int\n\
				4855	\n\
				4856	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4857	optional start, test S beginning at that position. With optional end, stop\n\
				4858	comparing S at that position.";
				4859
				4860	static PyObject *
				4861	unicode_endswith(PyUnicodeObject *self,
				4862	PyObject *args)
				4863	{
				4864	PyUnicodeObject *substring;
				4865	int start = 0;
				4866	int end = INT_MAX;
				4867	PyObject *result;
				4868
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4869	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4870	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4871	return NULL;
				4872	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4873	(PyObject *)substring);
				4874	if (substring == NULL)
				4875	return NULL;
				4876
				4877	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4878
				4879	Py_DECREF(substring);
				4880	return result;
				4881	}
				4882
				4883
				4884	static PyMethodDef unicode_methods[] = {
				4885
				4886	/* Order is according to common usage: often used methods should
				4887	appear first, since lookup is done sequentially. */
				4888
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4889	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				4890	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				4891	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				4892	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				4893	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				4894	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				4895	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				4896	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				4897	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				4898	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				4899	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				4900	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				4901	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
				4902	{"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
				4903	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				4904	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				4905	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				4906	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
				4907	{"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
				4908	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
				4909	{"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
				4910	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				4911	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				4912	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				4913	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				4914	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				4915	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				4916	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				4917	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				4918	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				4919	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				4920	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				4921	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				4922	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				4923	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4924	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4925	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
				4926	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4927	#endif
				4928
				4929	#if 0
				4930	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4931	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4932	#endif
				4933
				4934	{NULL, NULL}
				4935	};
				4936
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4937	static PySequenceMethods unicode_as_sequence = {
				4938	(inquiry) unicode_length, /* sq_length */
				4939	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4940	(intargfunc) unicode_repeat, /* sq_repeat */
				4941	(intargfunc) unicode_getitem, /* sq_item */
				4942	(intintargfunc) unicode_slice, /* sq_slice */
				4943	0, /* sq_ass_item */
				4944	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4945	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4946	};
				4947
				4948	static int
				4949	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4950	int index,
				4951	const void **ptr)
				4952	{
				4953	if (index != 0) {
				4954	PyErr_SetString(PyExc_SystemError,
				4955	"accessing non-existent unicode segment");
				4956	return -1;
				4957	}
				4958	ptr = (void ) self->str;
				4959	return PyUnicode_GET_DATA_SIZE(self);
				4960	}
				4961
				4962	static int
				4963	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4964	const void **ptr)
				4965	{
				4966	PyErr_SetString(PyExc_TypeError,
				4967	"cannot use unicode as modifyable buffer");
				4968	return -1;
				4969	}
				4970
				4971	static int
				4972	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4973	int *lenp)
				4974	{
				4975	if (lenp)
				4976	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4977	return 1;
				4978	}
				4979
				4980	static int
				4981	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4982	int index,
				4983	const void **ptr)
				4984	{
				4985	PyObject *str;
				4986
				4987	if (index != 0) {
				4988	PyErr_SetString(PyExc_SystemError,
				4989	"accessing non-existent unicode segment");
				4990	return -1;
				4991	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4992	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4993	if (str == NULL)
				4994	return -1;
				4995	ptr = (void ) PyString_AS_STRING(str);
				4996	return PyString_GET_SIZE(str);
				4997	}
				4998
				4999	/* Helpers for PyUnicode_Format() */
				5000
				5001	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5002	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5003	{
				5004	int argidx = *p_argidx;
				5005	if (argidx < arglen) {
				5006	(*p_argidx)++;
				5007	if (arglen < 0)
				5008	return args;
				5009	else
				5010	return PyTuple_GetItem(args, argidx);
				5011	}
				5012	PyErr_SetString(PyExc_TypeError,
				5013	"not enough arguments for format string");
				5014	return NULL;
				5015	}
				5016
				5017	#define F_LJUST (1<<0)
				5018	#define F_SIGN (1<<1)
				5019	#define F_BLANK (1<<2)
				5020	#define F_ALT (1<<3)
				5021	#define F_ZERO (1<<4)
				5022
				5023	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5024	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5025	{
				5026	register int i;
				5027	int len;
				5028	va_list va;
				5029	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5030	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5031
				5032	/* First, format the string as char array, then expand to Py_UNICODE
				5033	array. */
				5034	charbuffer = (char *)buffer;
				5035	len = vsprintf(charbuffer, format, va);
				5036	for (i = len - 1; i >= 0; i--)
				5037	buffer[i] = (Py_UNICODE) charbuffer[i];
				5038
				5039	va_end(va);
				5040	return len;
				5041	}
				5042
				5043	static int
				5044	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5045	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5046	int flags,
				5047	int prec,
				5048	int type,
				5049	PyObject *v)
				5050	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5051	/* fmt = '%#.' + `prec` + `type`
				5052	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5053	char fmt[20];
				5054	double x;
				5055
				5056	x = PyFloat_AsDouble(v);
				5057	if (x == -1.0 && PyErr_Occurred())
				5058	return -1;
				5059	if (prec < 0)
				5060	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5061	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5062	type = 'g';
				5063	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5064	/* worst case length calc to ensure no buffer overrun:
				5065	fmt = %#.<prec>g
				5066	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5067	for any double rep.)
				5068	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5069	If prec=0 the effective precision is 1 (the leading digit is
				5070	always given), therefore increase by one to 10+prec. */
				5071	if (buflen <= (size_t)10 + (size_t)prec) {
				5072	PyErr_SetString(PyExc_OverflowError,
				5073	"formatted float is too long (precision too long?)");
				5074	return -1;
				5075	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5076	return usprintf(buf, fmt, x);
				5077	}
				5078
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5079	static PyObject*
				5080	formatlong(PyObject *val, int flags, int prec, int type)
				5081	{
				5082	char *buf;
				5083	int i, len;
				5084	PyObject str; / temporary string object. */
				5085	PyUnicodeObject *result;
				5086
				5087	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5088	if (!str)
				5089	return NULL;
				5090	result = _PyUnicode_New(len);
				5091	for (i = 0; i < len; i++)
				5092	result->str[i] = buf[i];
				5093	result->str[len] = 0;
				5094	Py_DECREF(str);
				5095	return (PyObject*)result;
				5096	}
				5097
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5098	static int
				5099	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5100	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5101	int flags,
				5102	int prec,
				5103	int type,
				5104	PyObject *v)
				5105	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5106	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5107	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5108	+ 1 + 1 = 24*/
				5109	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5110	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5111	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5112
				5113	x = PyInt_AsLong(v);
				5114	if (x == -1 && PyErr_Occurred())
				5115	return -1;
				5116	if (prec < 0)
				5117	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5118	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				5119	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				5120	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				5121	PyErr_SetString(PyExc_OverflowError,
				5122	"formatted integer is too long (precision too long?)");
				5123	return -1;
				5124	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5125	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				5126	* but we want it (for consistency with other %#x conversions, and
				5127	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5128	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				5129	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				5130	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5131	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5132	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				5133	/* Only way to know what the platform does is to try it. */
				5134	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				5135	if (fmt[1] != (char)type) {
				5136	/* Supply our own leading 0x/0X -- needed under std C */
				5137	use_native_c_format = 0;
				5138	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				5139	}
				5140	}
				5141	if (use_native_c_format)
				5142	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5143	return usprintf(buf, fmt, x);
				5144	}
				5145
				5146	static int
				5147	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5148	size_t buflen,
				5149	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5150	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5151	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5152	if (PyUnicode_Check(v)) {
				5153	if (PyUnicode_GET_SIZE(v) != 1)
				5154	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5155	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5156	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5157
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5158	else if (PyString_Check(v)) {
				5159	if (PyString_GET_SIZE(v) != 1)
				5160	goto onError;
				5161	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5162	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5163
				5164	else {
				5165	/* Integer input truncated to a character */
				5166	long x;
				5167	x = PyInt_AsLong(v);
				5168	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5169	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5170	buf[0] = (char) x;
				5171	}
				5172	buf[1] = '\0';
				5173	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5174
				5175	onError:
				5176	PyErr_SetString(PyExc_TypeError,
				5177	"%c requires int or char");
				5178	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5179	}
				5180
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE units in the scratch buffer
   that receives formatted floats, ints and chars.

   XXX The value is a magic number.  Every formatting routine checks
   its worst-case output against the buffer length before writing, but
   allocating a buffer of the right size per conversion may be the
   better solution.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
				5190
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5191	PyObject PyUnicode_Format(PyObject format,
				5192	PyObject *args)
				5193	{
				5194	Py_UNICODE fmt, res;
				5195	int fmtcnt, rescnt, reslen, arglen, argidx;
				5196	int args_owned = 0;
				5197	PyUnicodeObject *result = NULL;
				5198	PyObject *dict = NULL;
				5199	PyObject *uformat;
				5200
				5201	if (format == NULL \|\| args == NULL) {
				5202	PyErr_BadInternalCall();
				5203	return NULL;
				5204	}
				5205	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5206	if (uformat == NULL)
				5207	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5208	fmt = PyUnicode_AS_UNICODE(uformat);
				5209	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5210
				5211	reslen = rescnt = fmtcnt + 100;
				5212	result = _PyUnicode_New(reslen);
				5213	if (result == NULL)
				5214	goto onError;
				5215	res = PyUnicode_AS_UNICODE(result);
				5216
				5217	if (PyTuple_Check(args)) {
				5218	arglen = PyTuple_Size(args);
				5219	argidx = 0;
				5220	}
				5221	else {
				5222	arglen = -1;
				5223	argidx = -2;
				5224	}
				5225	if (args->ob_type->tp_as_mapping)
				5226	dict = args;
				5227
				5228	while (--fmtcnt >= 0) {
				5229	if (*fmt != '%') {
				5230	if (--rescnt < 0) {
				5231	rescnt = fmtcnt + 100;
				5232	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5233	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5234	return NULL;
				5235	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5236	--rescnt;
				5237	}
				5238	res++ = fmt++;
				5239	}
				5240	else {
				5241	/* Got a format specifier */
				5242	int flags = 0;
				5243	int width = -1;
				5244	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5245	Py_UNICODE c = '\0';
				5246	Py_UNICODE fill;
				5247	PyObject *v = NULL;
				5248	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5249	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5250	Py_UNICODE sign;
				5251	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5252	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5253
				5254	fmt++;
				5255	if (*fmt == '(') {
				5256	Py_UNICODE *keystart;
				5257	int keylen;
				5258	PyObject *key;
				5259	int pcount = 1;
				5260
				5261	if (dict == NULL) {
				5262	PyErr_SetString(PyExc_TypeError,
				5263	"format requires a mapping");
				5264	goto onError;
				5265	}
				5266	++fmt;
				5267	--fmtcnt;
				5268	keystart = fmt;
				5269	/* Skip over balanced parentheses */
				5270	while (pcount > 0 && --fmtcnt >= 0) {
				5271	if (*fmt == ')')
				5272	--pcount;
				5273	else if (*fmt == '(')
				5274	++pcount;
				5275	fmt++;
				5276	}
				5277	keylen = fmt - keystart - 1;
				5278	if (fmtcnt < 0 \|\| pcount > 0) {
				5279	PyErr_SetString(PyExc_ValueError,
				5280	"incomplete format key");
				5281	goto onError;
				5282	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5283	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5284	then looked up since Python uses strings to hold
				5285	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5286	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5287	key = PyUnicode_EncodeUTF8(keystart,
				5288	keylen,
				5289	NULL);
				5290	if (key == NULL)
				5291	goto onError;
				5292	if (args_owned) {
				5293	Py_DECREF(args);
				5294	args_owned = 0;
				5295	}
				5296	args = PyObject_GetItem(dict, key);
				5297	Py_DECREF(key);
				5298	if (args == NULL) {
				5299	goto onError;
				5300	}
				5301	args_owned = 1;
				5302	arglen = -1;
				5303	argidx = -2;
				5304	}
				5305	while (--fmtcnt >= 0) {
				5306	switch (c = *fmt++) {
				5307	case '-': flags \|= F_LJUST; continue;
				5308	case '+': flags \|= F_SIGN; continue;
				5309	case ' ': flags \|= F_BLANK; continue;
				5310	case '#': flags \|= F_ALT; continue;
				5311	case '0': flags \|= F_ZERO; continue;
				5312	}
				5313	break;
				5314	}
				5315	if (c == '*') {
				5316	v = getnextarg(args, arglen, &argidx);
				5317	if (v == NULL)
				5318	goto onError;
				5319	if (!PyInt_Check(v)) {
				5320	PyErr_SetString(PyExc_TypeError,
				5321	"* wants int");
				5322	goto onError;
				5323	}
				5324	width = PyInt_AsLong(v);
				5325	if (width < 0) {
				5326	flags \|= F_LJUST;
				5327	width = -width;
				5328	}
				5329	if (--fmtcnt >= 0)
				5330	c = *fmt++;
				5331	}
				5332	else if (c >= '0' && c <= '9') {
				5333	width = c - '0';
				5334	while (--fmtcnt >= 0) {
				5335	c = *fmt++;
				5336	if (c < '0' \|\| c > '9')
				5337	break;
				5338	if ((width*10) / 10 != width) {
				5339	PyErr_SetString(PyExc_ValueError,
				5340	"width too big");
				5341	goto onError;
				5342	}
				5343	width = width*10 + (c - '0');
				5344	}
				5345	}
				5346	if (c == '.') {
				5347	prec = 0;
				5348	if (--fmtcnt >= 0)
				5349	c = *fmt++;
				5350	if (c == '*') {
				5351	v = getnextarg(args, arglen, &argidx);
				5352	if (v == NULL)
				5353	goto onError;
				5354	if (!PyInt_Check(v)) {
				5355	PyErr_SetString(PyExc_TypeError,
				5356	"* wants int");
				5357	goto onError;
				5358	}
				5359	prec = PyInt_AsLong(v);
				5360	if (prec < 0)
				5361	prec = 0;
				5362	if (--fmtcnt >= 0)
				5363	c = *fmt++;
				5364	}
				5365	else if (c >= '0' && c <= '9') {
				5366	prec = c - '0';
				5367	while (--fmtcnt >= 0) {
				5368	c = Py_CHARMASK(*fmt++);
				5369	if (c < '0' \|\| c > '9')
				5370	break;
				5371	if ((prec*10) / 10 != prec) {
				5372	PyErr_SetString(PyExc_ValueError,
				5373	"prec too big");
				5374	goto onError;
				5375	}
				5376	prec = prec*10 + (c - '0');
				5377	}
				5378	}
				5379	} /* prec */
				5380	if (fmtcnt >= 0) {
				5381	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5382	if (--fmtcnt >= 0)
				5383	c = *fmt++;
				5384	}
				5385	}
				5386	if (fmtcnt < 0) {
				5387	PyErr_SetString(PyExc_ValueError,
				5388	"incomplete format");
				5389	goto onError;
				5390	}
				5391	if (c != '%') {
				5392	v = getnextarg(args, arglen, &argidx);
				5393	if (v == NULL)
				5394	goto onError;
				5395	}
				5396	sign = 0;
				5397	fill = ' ';
				5398	switch (c) {
				5399
				5400	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5401	pbuf = formatbuf;
				5402	/* presume that buffer length is at least 1 */
				5403	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5404	len = 1;
				5405	break;
				5406
				5407	case 's':
				5408	case 'r':
				5409	if (PyUnicode_Check(v) && c == 's') {
				5410	temp = v;
				5411	Py_INCREF(temp);
				5412	}
				5413	else {
				5414	PyObject *unicode;
				5415	if (c == 's')
				5416	temp = PyObject_Str(v);
				5417	else
				5418	temp = PyObject_Repr(v);
				5419	if (temp == NULL)
				5420	goto onError;
				5421	if (!PyString_Check(temp)) {
				5422	/* XXX Note: this should never happen, since
				5423	PyObject_Repr() and PyObject_Str() assure
				5424	this */
				5425	Py_DECREF(temp);
				5426	PyErr_SetString(PyExc_TypeError,
				5427	"%s argument has non-string str()");
				5428	goto onError;
				5429	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5430	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5431	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5432	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5433	"strict");
				5434	Py_DECREF(temp);
				5435	temp = unicode;
				5436	if (temp == NULL)
				5437	goto onError;
				5438	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5439	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5440	len = PyUnicode_GET_SIZE(temp);
				5441	if (prec >= 0 && len > prec)
				5442	len = prec;
				5443	break;
				5444
				5445	case 'i':
				5446	case 'd':
				5447	case 'u':
				5448	case 'o':
				5449	case 'x':
				5450	case 'X':
				5451	if (c == 'i')
				5452	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5453	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5454	temp = formatlong(v, flags, prec, c);
				5455	if (!temp)
				5456	goto onError;
				5457	pbuf = PyUnicode_AS_UNICODE(temp);
				5458	len = PyUnicode_GET_SIZE(temp);
				5459	/* unbounded ints can always produce
				5460	a sign character! */
				5461	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5462	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5463	else {
				5464	pbuf = formatbuf;
				5465	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5466	flags, prec, c, v);
				5467	if (len < 0)
				5468	goto onError;
				5469	/* only d conversion is signed */
				5470	sign = c == 'd';
				5471	}
				5472	if (flags & F_ZERO)
				5473	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5474	break;
				5475
				5476	case 'e':
				5477	case 'E':
				5478	case 'f':
				5479	case 'g':
				5480	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5481	pbuf = formatbuf;
				5482	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5483	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5484	if (len < 0)
				5485	goto onError;
				5486	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5487	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5488	fill = '0';
				5489	break;
				5490
				5491	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5492	pbuf = formatbuf;
				5493	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5494	if (len < 0)
				5495	goto onError;
				5496	break;
				5497
				5498	default:
				5499	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5500	"unsupported format character '%c' (0x%x) "
				5501	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5502	(31<=c && c<=126) ? c : '?',
				5503	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5504	goto onError;
				5505	}
				5506	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5507	if (pbuf == '-' \|\| pbuf == '+') {
				5508	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5509	len--;
				5510	}
				5511	else if (flags & F_SIGN)
				5512	sign = '+';
				5513	else if (flags & F_BLANK)
				5514	sign = ' ';
				5515	else
				5516	sign = 0;
				5517	}
				5518	if (width < len)
				5519	width = len;
				5520	if (rescnt < width + (sign != 0)) {
				5521	reslen -= rescnt;
				5522	rescnt = width + fmtcnt + 100;
				5523	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5524	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5525	return NULL;
				5526	res = PyUnicode_AS_UNICODE(result)
				5527	+ reslen - rescnt;
				5528	}
				5529	if (sign) {
				5530	if (fill != ' ')
				5531	*res++ = sign;
				5532	rescnt--;
				5533	if (width > len)
				5534	width--;
				5535	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5536	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5537	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5538	assert(pbuf[1] == c);
				5539	if (fill != ' ') {
				5540	res++ = pbuf++;
				5541	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5542	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5543	rescnt -= 2;
				5544	width -= 2;
				5545	if (width < 0)
				5546	width = 0;
				5547	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5548	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5549	if (width > len && !(flags & F_LJUST)) {
				5550	do {
				5551	--rescnt;
				5552	*res++ = fill;
				5553	} while (--width > len);
				5554	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5555	if (fill == ' ') {
				5556	if (sign)
				5557	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5558	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5559	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5560	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5561	res++ = pbuf++;
				5562	res++ = pbuf++;
				5563	}
				5564	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5565	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5566	res += len;
				5567	rescnt -= len;
				5568	while (--width >= len) {
				5569	--rescnt;
				5570	*res++ = ' ';
				5571	}
				5572	if (dict && (argidx < arglen) && c != '%') {
				5573	PyErr_SetString(PyExc_TypeError,
				5574	"not all arguments converted");
				5575	goto onError;
				5576	}
				5577	Py_XDECREF(temp);
				5578	} /* '%' */
				5579	} /* until end */
				5580	if (argidx < arglen && !dict) {
				5581	PyErr_SetString(PyExc_TypeError,
				5582	"not all arguments converted");
				5583	goto onError;
				5584	}
				5585
				5586	if (args_owned) {
				5587	Py_DECREF(args);
				5588	}
				5589	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5590	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5591	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5592	return (PyObject *)result;
				5593
				5594	onError:
				5595	Py_XDECREF(result);
				5596	Py_DECREF(uformat);
				5597	if (args_owned) {
				5598	Py_DECREF(args);
				5599	}
				5600	return NULL;
				5601	}
				5602
				5603	static PyBufferProcs unicode_as_buffer = {
				5604	(getreadbufferproc) unicode_buffer_getreadbuf,
				5605	(getwritebufferproc) unicode_buffer_getwritebuf,
				5606	(getsegcountproc) unicode_buffer_getsegcount,
				5607	(getcharbufferproc) unicode_buffer_getcharbuf,
				5608	};
				5609
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5610	staticforward PyObject *
				5611	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5612
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5613	static PyObject *
				5614	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5615	{
				5616	PyObject *x = NULL;
				5617	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5618	char *encoding = NULL;
				5619	char *errors = NULL;
				5620
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5621	if (type != &PyUnicode_Type)
				5622	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5623	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5624	kwlist, &x, &encoding, &errors))
				5625	return NULL;
				5626	if (x == NULL)
				5627	return (PyObject *)_PyUnicode_New(0);
				5628	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5629	}
				5630
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5631	static PyObject *
				5632	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5633	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5634	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5635	int n;
				5636
				5637	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5638	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5639	if (tmp == NULL)
				5640	return NULL;
				5641	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5642	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5643	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5644	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5645	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5646	if (pnew->str == NULL) {
				5647	_Py_ForgetReference((PyObject *)pnew);
				5648	PyObject_DEL(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5649	return NULL;
				5650	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5651	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5652	pnew->length = n;
				5653	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5654	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5655	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5656	}
				5657
/* Docstring of the unicode type (used as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
				5664
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5665	PyTypeObject PyUnicode_Type = {
				5666	PyObject_HEAD_INIT(&PyType_Type)
				5667	0, /* ob_size */
				5668	"unicode", /* tp_name */
				5669	sizeof(PyUnicodeObject), /* tp_size */
				5670	0, /* tp_itemsize */
				5671	/* Slots */
				5672	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5673	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5674	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5675	0, /* tp_setattr */
				5676	(cmpfunc) unicode_compare, /* tp_compare */
				5677	(reprfunc) unicode_repr, /* tp_repr */
				5678	0, /* tp_as_number */
				5679	&unicode_as_sequence, /* tp_as_sequence */
				5680	0, /* tp_as_mapping */
				5681	(hashfunc) unicode_hash, /* tp_hash*/
				5682	0, /* tp_call*/
				5683	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5684	PyObject_GenericGetAttr, /* tp_getattro */
				5685	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5686	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5687	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5688	unicode_doc, /* tp_doc */
				5689	0, /* tp_traverse */
				5690	0, /* tp_clear */
				5691	0, /* tp_richcompare */
				5692	0, /* tp_weaklistoffset */
				5693	0, /* tp_iter */
				5694	0, /* tp_iternext */
				5695	unicode_methods, /* tp_methods */
				5696	0, /* tp_members */
				5697	0, /* tp_getset */
				5698	0, /* tp_base */
				5699	0, /* tp_dict */
				5700	0, /* tp_descr_get */
				5701	0, /* tp_descr_set */
				5702	0, /* tp_dictoffset */
				5703	0, /* tp_init */
				5704	0, /* tp_alloc */
				5705	unicode_new, /* tp_new */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5706	};
				5707
				5708	/* Initialize the Unicode implementation */
				5709
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5710	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5711	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5712	int i;
				5713
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5714	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5715	unicode_freelist = NULL;
				5716	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5717	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5718	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5719	for (i = 0; i < 256; i++)
				5720	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5721	}
				5722
				5723	/* Finalize the Unicode implementation */
				5724
				5725	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5726	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5727	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5728	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5729	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5730
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5731	Py_XDECREF(unicode_empty);
				5732	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5733
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5734	for (i = 0; i < 256; i++) {
				5735	if (unicode_latin1[i]) {
				5736	Py_DECREF(unicode_latin1[i]);
				5737	unicode_latin1[i] = NULL;
				5738	}
				5739	}
				5740
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5741	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5742	PyUnicodeObject *v = u;
				5743	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5744	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5745	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5746	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5747	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5748	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5749	unicode_freelist = NULL;
				5750	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5751	}