Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: a252587e721eaf6567d4d72b155f245e8c95d86a [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	227	void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	{
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	229	if (!PyUnicode_CheckExact(unicode)) {
				230	unicode->ob_type->tp_free((PyObject *)unicode);
				231	return;
				232	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	234	/* Keep-Alive optimization */
				235	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	236	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	237	unicode->str = NULL;
				238	unicode->length = 0;
				239	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	240	if (unicode->defenc) {
				241	Py_DECREF(unicode->defenc);
				242	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	243	}
				244	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	245	(PyUnicodeObject *)unicode = unicode_freelist;
				246	unicode_freelist = unicode;
				247	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	248	}
				249	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	250	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	251	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	252	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	253	}
				254	}
				255
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	256	int PyUnicode_Resize(PyObject **unicode,
				257	int length)
				258	{
				259	register PyUnicodeObject *v;
				260
				261	/* Argument checks */
				262	if (unicode == NULL) {
				263	PyErr_BadInternalCall();
				264	return -1;
				265	}
				266	v = (PyUnicodeObject )unicode;
				267	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				268	PyErr_BadInternalCall();
				269	return -1;
				270	}
				271
				272	/* Resizing unicode_empty and single character objects is not
				273	possible since these are being shared. We simply return a fresh
				274	copy with the same Unicode content. */
				275	if (v->length != length &&
				276	(v == unicode_empty \|\| v->length == 1)) {
				277	PyUnicodeObject *w = _PyUnicode_New(length);
				278	if (w == NULL)
				279	return -1;
				280	Py_UNICODE_COPY(w->str, v->str,
				281	length < v->length ? length : v->length);
				282	unicode = (PyObject )w;
				283	return 0;
				284	}
				285
				286	/* Note that we don't have to modify *unicode for unshared Unicode
				287	objects, since we can modify them in-place. */
				288	return unicode_resize(v, length);
				289	}
				290
				291	/* Internal API for use in unicodeobject.c only ! */
				292	#define _PyUnicode_Resize(unicodevar, length) \
				293	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				294
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	295	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				296	int size)
				297	{
				298	PyUnicodeObject *unicode;
				299
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	300	/* If the Unicode data is known at construction time, we can apply
				301	some optimizations which share commonly used objects. */
				302	if (u != NULL) {
				303
				304	/* Optimization for empty strings */
				305	if (size == 0 && unicode_empty != NULL) {
				306	Py_INCREF(unicode_empty);
				307	return (PyObject *)unicode_empty;
				308	}
				309
				310	/* Single character Unicode objects in the Latin-1 range are
				311	shared when using this constructor */
				312	if (size == 1 && *u < 256) {
				313	unicode = unicode_latin1[*u];
				314	if (!unicode) {
				315	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	316	if (!unicode)
				317	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	318	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	319	unicode_latin1[*u] = unicode;
				320	}
				321	Py_INCREF(unicode);
				322	return (PyObject *)unicode;
				323	}
				324	}
				325
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	326	unicode = _PyUnicode_New(size);
				327	if (!unicode)
				328	return NULL;
				329
				330	/* Copy the Unicode data into the new object */
				331	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	332	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	333
				334	return (PyObject *)unicode;
				335	}
				336
				337	#ifdef HAVE_WCHAR_H
				338
				339	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				340	int size)
				341	{
				342	PyUnicodeObject *unicode;
				343
				344	if (w == NULL) {
				345	PyErr_BadInternalCall();
				346	return NULL;
				347	}
				348
				349	unicode = _PyUnicode_New(size);
				350	if (!unicode)
				351	return NULL;
				352
				353	/* Copy the wchar_t data into the new object */
				354	#ifdef HAVE_USABLE_WCHAR_T
				355	memcpy(unicode->str, w, size * sizeof(wchar_t));
				356	#else
				357	{
				358	register Py_UNICODE *u;
				359	register int i;
				360	u = PyUnicode_AS_UNICODE(unicode);
				361	for (i = size; i >= 0; i--)
				362	u++ = w++;
				363	}
				364	#endif
				365
				366	return (PyObject *)unicode;
				367	}
				368
				369	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				370	register wchar_t *w,
				371	int size)
				372	{
				373	if (unicode == NULL) {
				374	PyErr_BadInternalCall();
				375	return -1;
				376	}
				377	if (size > PyUnicode_GET_SIZE(unicode))
				378	size = PyUnicode_GET_SIZE(unicode);
				379	#ifdef HAVE_USABLE_WCHAR_T
				380	memcpy(w, unicode->str, size * sizeof(wchar_t));
				381	#else
				382	{
				383	register Py_UNICODE *u;
				384	register int i;
				385	u = PyUnicode_AS_UNICODE(unicode);
				386	for (i = size; i >= 0; i--)
				387	w++ = u++;
				388	}
				389	#endif
				390
				391	return size;
				392	}
				393
				394	#endif
				395
				396	PyObject PyUnicode_FromObject(register PyObject obj)
				397	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	398	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				399	}
				400
				401	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				402	const char *encoding,
				403	const char *errors)
				404	{
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	405	const char *s = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	406	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	407	int owned = 0;
				408	PyObject *v;
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	409	int reclevel;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	410
				411	if (obj == NULL) {
				412	PyErr_BadInternalCall();
				413	return NULL;
				414	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	415
				416	/* Coerce object */
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	417	for (reclevel = 0; reclevel < 2; reclevel++) {
				418
				419	if (PyUnicode_Check(obj)) {
				420	if (encoding) {
				421	PyErr_SetString(PyExc_TypeError,
				422	"decoding Unicode is not supported");
				423	goto onError;
				424	}
				425	if (PyUnicode_CheckExact(obj)) {
				426	Py_INCREF(obj);
				427	v = obj;
				428	}
				429	else {
				430	/* For a subclass of unicode, return a true unicode object
				431	with the same string value. */
				432	v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				433	PyUnicode_GET_SIZE(obj));
				434	}
				435	goto done;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	436	}
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	437	else if (PyString_Check(obj)) {
				438	s = PyString_AS_STRING(obj);
				439	len = PyString_GET_SIZE(obj);
				440	break;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	441	}
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	442	else {
				443	PyObject *w;
				444
				445	/* Try char buffer interface */
				446	if (PyObject_AsCharBuffer(obj, &s, &len))
				447	PyErr_Clear();
				448	else
				449	break;
				450
				451	/* Mimic the behaviour of str(object) if everything else
				452	fails (see PyObject_Str()); this also covers instances
				453	which implement __str__. */
				454	if (obj->ob_type->tp_str == NULL)
				455	w = PyObject_Repr(obj);
				456	else
				457	w = (*obj->ob_type->tp_str)(obj);
				458	if (w == NULL)
				459	goto onError;
				460	if (owned) {
				461	Py_DECREF(obj);
				462	}
				463	obj = w;
				464	owned = 1;
Tim Peters	78e0fc7	2001-09-11 03:07:38 +0000	[diff] [blame]	465	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	466	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	467
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	468	if (s == NULL) {
				469	PyErr_Format(PyExc_TypeError,
				470	"coercing to Unicode: __str__ recursion limit exceeded "
				471	"(last type: %.80s)",
				472	obj->ob_type->tp_name);
				473	goto onError;
				474	}
				475
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	476	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	477	if (len == 0) {
				478	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	479	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	480	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	481	else
				482	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	483
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	484	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	485	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	486	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	487	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	488	return v;
				489
				490	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	491	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	492	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	493	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	494	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	495	}
				496
				497	PyObject PyUnicode_Decode(const char s,
				498	int size,
				499	const char *encoding,
				500	const char *errors)
				501	{
				502	PyObject buffer = NULL, unicode;
				503
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	504	if (encoding == NULL)
				505	encoding = PyUnicode_GetDefaultEncoding();
				506
				507	/* Shortcuts for common default encodings */
				508	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	509	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	510	else if (strcmp(encoding, "latin-1") == 0)
				511	return PyUnicode_DecodeLatin1(s, size, errors);
				512	else if (strcmp(encoding, "ascii") == 0)
				513	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	514
				515	/* Decode via the codec registry */
				516	buffer = PyBuffer_FromMemory((void *)s, size);
				517	if (buffer == NULL)
				518	goto onError;
				519	unicode = PyCodec_Decode(buffer, encoding, errors);
				520	if (unicode == NULL)
				521	goto onError;
				522	if (!PyUnicode_Check(unicode)) {
				523	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	524	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	525	unicode->ob_type->tp_name);
				526	Py_DECREF(unicode);
				527	goto onError;
				528	}
				529	Py_DECREF(buffer);
				530	return unicode;
				531
				532	onError:
				533	Py_XDECREF(buffer);
				534	return NULL;
				535	}
				536
				537	PyObject PyUnicode_Encode(const Py_UNICODE s,
				538	int size,
				539	const char *encoding,
				540	const char *errors)
				541	{
				542	PyObject v, unicode;
				543
				544	unicode = PyUnicode_FromUnicode(s, size);
				545	if (unicode == NULL)
				546	return NULL;
				547	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				548	Py_DECREF(unicode);
				549	return v;
				550	}
				551
				552	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				553	const char *encoding,
				554	const char *errors)
				555	{
				556	PyObject *v;
				557
				558	if (!PyUnicode_Check(unicode)) {
				559	PyErr_BadArgument();
				560	goto onError;
				561	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	562
				563	if (encoding == NULL)
				564	encoding = PyUnicode_GetDefaultEncoding();
				565
				566	/* Shortcuts for common default encodings */
				567	if (errors == NULL) {
				568	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	569	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	570	else if (strcmp(encoding, "latin-1") == 0)
				571	return PyUnicode_AsLatin1String(unicode);
				572	else if (strcmp(encoding, "ascii") == 0)
				573	return PyUnicode_AsASCIIString(unicode);
				574	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	575
				576	/* Encode via the codec registry */
				577	v = PyCodec_Encode(unicode, encoding, errors);
				578	if (v == NULL)
				579	goto onError;
				580	/* XXX Should we really enforce this ? */
				581	if (!PyString_Check(v)) {
				582	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	583	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	584	v->ob_type->tp_name);
				585	Py_DECREF(v);
				586	goto onError;
				587	}
				588	return v;
				589
				590	onError:
				591	return NULL;
				592	}
				593
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	594	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				595	const char *errors)
				596	{
				597	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				598
				599	if (v)
				600	return v;
				601	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				602	if (v && errors == NULL)
				603	((PyUnicodeObject *)unicode)->defenc = v;
				604	return v;
				605	}
				606
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	607	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				608	{
				609	if (!PyUnicode_Check(unicode)) {
				610	PyErr_BadArgument();
				611	goto onError;
				612	}
				613	return PyUnicode_AS_UNICODE(unicode);
				614
				615	onError:
				616	return NULL;
				617	}
				618
				619	int PyUnicode_GetSize(PyObject *unicode)
				620	{
				621	if (!PyUnicode_Check(unicode)) {
				622	PyErr_BadArgument();
				623	goto onError;
				624	}
				625	return PyUnicode_GET_SIZE(unicode);
				626
				627	onError:
				628	return -1;
				629	}
				630
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	631	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	632	{
				633	return unicode_default_encoding;
				634	}
				635
				636	int PyUnicode_SetDefaultEncoding(const char *encoding)
				637	{
				638	PyObject *v;
				639
				640	/* Make sure the encoding is valid. As side effect, this also
				641	loads the encoding into the codec registry cache. */
				642	v = _PyCodec_Lookup(encoding);
				643	if (v == NULL)
				644	goto onError;
				645	Py_DECREF(v);
				646	strncpy(unicode_default_encoding,
				647	encoding,
				648	sizeof(unicode_default_encoding));
				649	return 0;
				650
				651	onError:
				652	return -1;
				653	}
				654
Marc-André Lemburg	c60e6f7	2001-09-20 10:35:46 +0000	[diff] [blame]	655	/* --- UTF-7 Codec -------------------------------------------------------- */
				656
				657	/* see RFC2152 for details */
				658
				659	static
				660	char utf7_special[128] = {
				661	/* indicate whether a UTF-7 character is special i.e. cannot be directly
				662	encoded:
				663	0 - not special
				664	1 - special
				665	2 - whitespace (optional)
				666	3 - RFC2152 Set O (optional) */
				667	1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
				668	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				669	2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
				670	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
				671	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				672	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
				673	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				674	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
				675
				676	};
				677
				678	#define SPECIAL(c, encodeO, encodeWS) \
				679	(((c)>127 \|\| utf7_special[(c)] == 1) \|\| \
				680	(encodeWS && (utf7_special[(c)] == 2)) \|\| \
				681	(encodeO && (utf7_special[(c)] == 3)))
				682
				683	#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
				684	#define B64CHAR(c) (isalnum(c) \|\| (c) == '+' \|\| (c) == '/')
				685	#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
				686	(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
				687
				688	#define ENCODE(out, ch, bits) \
				689	while (bits >= 6) { \
				690	*out++ = B64(ch >> (bits-6)); \
				691	bits -= 6; \
				692	}
				693
				694	#define DECODE(out, ch, bits, surrogate) \
				695	while (bits >= 16) { \
				696	Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
				697	bits -= 16; \
				698	if (surrogate) { \
				699	/* We have already generated an error for the high surrogate
				700	so let's not bother seeing if the low surrogate is correct or not */\
				701	surrogate = 0; \
				702	} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
				703	/* This is a surrogate pair. Unfortunately we can't represent \
				704	it in a 16-bit character */ \
				705	surrogate = 1; \
				706	errmsg = "code pairs are not supported"; \
				707	goto utf7Error; \
				708	} else { \
				709	*out++ = outCh; \
				710	} \
				711	} \
				712
				713	static
				714	int utf7_decoding_error(Py_UNICODE **dest,
				715	const char *errors,
				716	const char *details)
				717	{
				718	if ((errors == NULL) \|\|
				719	(strcmp(errors,"strict") == 0)) {
				720	PyErr_Format(PyExc_UnicodeError,
				721	"UTF-7 decoding error: %.400s",
				722	details);
				723	return -1;
				724	}
				725	else if (strcmp(errors,"ignore") == 0) {
				726	return 0;
				727	}
				728	else if (strcmp(errors,"replace") == 0) {
				729	if (dest != NULL) {
				730	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				731	(*dest)++;
				732	}
				733	return 0;
				734	}
				735	else {
				736	PyErr_Format(PyExc_ValueError,
				737	"UTF-7 decoding error; unknown error handling code: %.400s",
				738	errors);
				739	return -1;
				740	}
				741	}
				742
				743	PyObject PyUnicode_DecodeUTF7(const char s,
				744	int size,
				745	const char *errors)
				746	{
				747	const char *e;
				748	PyUnicodeObject *unicode;
				749	Py_UNICODE *p;
				750	const char *errmsg = "";
				751	int inShift = 0;
				752	unsigned int bitsleft = 0;
				753	unsigned long charsleft = 0;
				754	int surrogate = 0;
				755
				756	unicode = _PyUnicode_New(size);
				757	if (!unicode)
				758	return NULL;
				759	if (size == 0)
				760	return (PyObject *)unicode;
				761
				762	p = unicode->str;
				763	e = s + size;
				764
				765	while (s < e) {
				766	Py_UNICODE ch = *s;
				767
				768	if (inShift) {
				769	if ((ch == '-') \|\| !B64CHAR(ch)) {
				770	inShift = 0;
				771	s++;
				772
				773	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				774	if (bitsleft >= 6) {
				775	/* The shift sequence has a partial character in it. If
				776	bitsleft < 6 then we could just classify it as padding
				777	but that is not the case here */
				778
				779	errmsg = "partial character in shift sequence";
				780	goto utf7Error;
				781	}
				782	/* According to RFC2152 the remaining bits should be zero. We
				783	choose to signal an error/insert a replacement character
				784	here so indicate the potential of a misencoded character. */
				785
				786	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				787	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				788	errmsg = "non-zero padding bits in shift sequence";
				789	goto utf7Error;
				790	}
				791
				792	if (ch == '-') {
				793	if ((s < e) && (*(s) == '-')) {
				794	*p++ = '-';
				795	inShift = 1;
				796	}
				797	} else if (SPECIAL(ch,0,0)) {
				798	errmsg = "unexpected special character";
				799	goto utf7Error;
				800	} else {
				801	*p++ = ch;
				802	}
				803	} else {
				804	charsleft = (charsleft << 6) \| UB64(ch);
				805	bitsleft += 6;
				806	s++;
				807	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				808	}
				809	}
				810	else if ( ch == '+' ) {
				811	s++;
				812	if (s < e && *s == '-') {
				813	s++;
				814	*p++ = '+';
				815	} else
				816	{
				817	inShift = 1;
				818	bitsleft = 0;
				819	}
				820	}
				821	else if (SPECIAL(ch,0,0)) {
				822	errmsg = "unexpected special character";
				823	s++;
				824	goto utf7Error;
				825	}
				826	else {
				827	*p++ = ch;
				828	s++;
				829	}
				830	continue;
				831	utf7Error:
				832	if (utf7_decoding_error(&p, errors, errmsg))
				833	goto onError;
				834	}
				835
				836	if (inShift) {
				837	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				838	goto onError;
				839	}
				840
				841	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				842	goto onError;
				843
				844	return (PyObject *)unicode;
				845
				846	onError:
				847	Py_DECREF(unicode);
				848	return NULL;
				849	}
				850
				851
				852	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				853	int size,
				854	int encodeSetO,
				855	int encodeWhiteSpace,
				856	const char *errors)
				857	{
				858	PyObject *v;
				859	/* It might be possible to tighten this worst case */
				860	unsigned int cbAllocated = 5 * size;
				861	int inShift = 0;
				862	int i = 0;
				863	unsigned int bitsleft = 0;
				864	unsigned long charsleft = 0;
				865	char * out;
				866	char * start;
				867
				868	if (size == 0)
				869	return PyString_FromStringAndSize(NULL, 0);
				870
				871	v = PyString_FromStringAndSize(NULL, cbAllocated);
				872	if (v == NULL)
				873	return NULL;
				874
				875	start = out = PyString_AS_STRING(v);
				876	for (;i < size; ++i) {
				877	Py_UNICODE ch = s[i];
				878
				879	if (!inShift) {
				880	if (ch == '+') {
				881	*out++ = '+';
				882	*out++ = '-';
				883	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				884	charsleft = ch;
				885	bitsleft = 16;
				886	*out++ = '+';
				887	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				888	inShift = bitsleft > 0;
				889	} else {
				890	*out++ = (char) ch;
				891	}
				892	} else {
				893	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				894	*out++ = B64(charsleft << (6-bitsleft));
				895	charsleft = 0;
				896	bitsleft = 0;
				897	/* Characters not in the BASE64 set implicitly unshift the sequence
				898	so no '-' is required, except if the character is itself a '-' */
				899	if (B64CHAR(ch) \|\| ch == '-') {
				900	*out++ = '-';
				901	}
				902	inShift = 0;
				903	*out++ = (char) ch;
				904	} else {
				905	bitsleft += 16;
				906	charsleft = (charsleft << 16) \| ch;
				907	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				908
				909	/* If the next character is special then we dont' need to terminate
				910	the shift sequence. If the next character is not a BASE64 character
				911	or '-' then the shift sequence will be terminated implicitly and we
				912	don't have to insert a '-'. */
				913
				914	if (bitsleft == 0) {
				915	if (i + 1 < size) {
				916	Py_UNICODE ch2 = s[i+1];
				917
				918	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				919
				920	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				921	*out++ = '-';
				922	inShift = 0;
				923	} else {
				924	inShift = 0;
				925	}
				926
				927	}
				928	else {
				929	*out++ = '-';
				930	inShift = 0;
				931	}
				932	}
				933	}
				934	}
				935	}
				936	if (bitsleft) {
				937	*out++= B64(charsleft << (6-bitsleft) );
				938	*out++ = '-';
				939	}
				940
				941	if (_PyString_Resize(&v, out - start)) {
				942	Py_DECREF(v);
				943	return NULL;
				944	}
				945	return v;
				946	}
				947
				948	#undef SPECIAL
				949	#undef B64
				950	#undef B64CHAR
				951	#undef UB64
				952	#undef ENCODE
				953	#undef DECODE
				954
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	955	/* --- UTF-8 Codec -------------------------------------------------------- */
				956
				957	static
				958	char utf8_code_length[256] = {
				959	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				960	illegal prefix. see RFC 2279 for details */
				961	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				962	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				963	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				964	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				965	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				966	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				967	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				968	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				969	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				970	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				971	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				972	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				973	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				974	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				975	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				976	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				977	};
				978
				979	static
				980	int utf8_decoding_error(const char **source,
				981	Py_UNICODE **dest,
				982	const char *errors,
				983	const char *details)
				984	{
				985	if ((errors == NULL) \|\|
				986	(strcmp(errors,"strict") == 0)) {
				987	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	988	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	989	details);
				990	return -1;
				991	}
				992	else if (strcmp(errors,"ignore") == 0) {
				993	(*source)++;
				994	return 0;
				995	}
				996	else if (strcmp(errors,"replace") == 0) {
				997	(*source)++;
				998	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				999	(*dest)++;
				1000	return 0;
				1001	}
				1002	else {
				1003	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1004	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1005	errors);
				1006	return -1;
				1007	}
				1008	}
				1009
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1010	PyObject PyUnicode_DecodeUTF8(const char s,
				1011	int size,
				1012	const char *errors)
				1013	{
				1014	int n;
				1015	const char *e;
				1016	PyUnicodeObject *unicode;
				1017	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1018	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1019
				1020	/* Note: size will always be longer than the resulting Unicode
				1021	character count */
				1022	unicode = _PyUnicode_New(size);
				1023	if (!unicode)
				1024	return NULL;
				1025	if (size == 0)
				1026	return (PyObject *)unicode;
				1027
				1028	/* Unpack UTF-8 encoded data */
				1029	p = unicode->str;
				1030	e = s + size;
				1031
				1032	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1033	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1034
				1035	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1036	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1037	s++;
				1038	continue;
				1039	}
				1040
				1041	n = utf8_code_length[ch];
				1042
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1043	if (s + n > e) {
				1044	errmsg = "unexpected end of data";
				1045	goto utf8Error;
				1046	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1047
				1048	switch (n) {
				1049
				1050	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1051	errmsg = "unexpected code byte";
				1052	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053
				1054	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1055	errmsg = "internal error";
				1056	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057
				1058	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1059	if ((s[1] & 0xc0) != 0x80) {
				1060	errmsg = "invalid data";
				1061	goto utf8Error;
				1062	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1063	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1064	if (ch < 0x80) {
				1065	errmsg = "illegal encoding";
				1066	goto utf8Error;
				1067	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1068	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1069	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1070	break;
				1071
				1072	case 3:
				1073	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1074	(s[2] & 0xc0) != 0x80) {
				1075	errmsg = "invalid data";
				1076	goto utf8Error;
				1077	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1078	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1079	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				1080	errmsg = "illegal encoding";
				1081	goto utf8Error;
				1082	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1083	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1084	*p++ = (Py_UNICODE)ch;
				1085	break;
				1086
				1087	case 4:
				1088	if ((s[1] & 0xc0) != 0x80 \|\|
				1089	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1090	(s[3] & 0xc0) != 0x80) {
				1091	errmsg = "invalid data";
				1092	goto utf8Error;
				1093	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1094	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1095	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1096	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1097	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1098	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1099	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1100	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1101	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1102	errmsg = "illegal encoding";
				1103	goto utf8Error;
				1104	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1105	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1106	*p++ = (Py_UNICODE)ch;
				1107	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1108	/* compute and append the two surrogates: */
				1109
				1110	/* translate from 10000..10FFFF to 0..FFFF */
				1111	ch -= 0x10000;
				1112
				1113	/* high surrogate = top 10 bits added to D800 */
				1114	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1115
				1116	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1117	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1118	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1119	break;
				1120
				1121	default:
				1122	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1123	errmsg = "unsupported Unicode code range";
				1124	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1125	}
				1126	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1127	continue;
				1128
				1129	utf8Error:
				1130	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1131	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	}
				1133
				1134	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1135	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1136	goto onError;
				1137
				1138	return (PyObject *)unicode;
				1139
				1140	onError:
				1141	Py_DECREF(unicode);
				1142	return NULL;
				1143	}
				1144
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the caller's error policy to a UTF-8 encoding problem.

   source  - address of the input cursor (not touched here)
   dest    - address of the output cursor; "replace" stores a '?' there and
             advances it
   errors  - policy name: NULL or "strict", "ignore" or "replace"
   details - text appended to the exception message in strict mode

   Returns 0 when encoding may continue and -1 with an exception set
   otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        char *slot = *dest;
        *slot = '?';
        *dest = slot + 1;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1178
				1179	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				1180	int size,
				1181	const char *errors)
				1182	{
				1183	PyObject *v;
				1184	char *p;
				1185	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1186	Py_UCS4 ch2;
				1187	unsigned int cbAllocated = 3 * size;
				1188	unsigned int cbWritten = 0;
				1189	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1190
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1191	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1192	if (v == NULL)
				1193	return NULL;
				1194	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1195	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1196
				1197	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1198	while (i < size) {
				1199	Py_UCS4 ch = s[i++];
				1200	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1201	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1202	cbWritten++;
				1203	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1204	else if (ch < 0x0800) {
				1205	*p++ = 0xc0 \| (ch >> 6);
				1206	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1207	cbWritten += 2;
				1208	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1209	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1210	/* Check for high surrogate */
				1211	if (0xD800 <= ch && ch <= 0xDBFF) {
				1212	if (i != size) {
				1213	ch2 = s[i];
				1214	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				1215
				1216	if (cbWritten >= (cbAllocated - 4)) {
				1217	/* Provide enough room for some more
				1218	surrogates */
				1219	cbAllocated += 4*10;
				1220	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1221	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1222	}
				1223
				1224	/* combine the two values */
				1225	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				1226
				1227	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1228	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1229	i++;
				1230	cbWritten += 4;
				1231	}
				1232	}
				1233	}
				1234	else {
				1235	*p++ = (char)(0xe0 \| (ch >> 12));
				1236	cbWritten += 3;
				1237	}
				1238	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1239	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1240	} else {
				1241	*p++ = 0xf0 \| (ch>>18);
				1242	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				1243	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				1244	*p++ = 0x80 \| (ch & 0x3f);
				1245	cbWritten += 4;
				1246	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1247	}
				1248	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1249	if (_PyString_Resize(&v, p - q))
				1250	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1251	return v;
				1252
				1253	onError:
				1254	Py_DECREF(v);
				1255	return NULL;
				1256	}
				1257
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1258	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1259	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1260	if (!PyUnicode_Check(unicode)) {
				1261	PyErr_BadArgument();
				1262	return NULL;
				1263	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1264	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1265	PyUnicode_GET_SIZE(unicode),
				1266	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1267	}
				1268
				1269	/* --- UTF-16 Codec ------------------------------------------------------- */
				1270
				1271	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1272	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1273	const char *errors,
				1274	const char *details)
				1275	{
				1276	if ((errors == NULL) \|\|
				1277	(strcmp(errors,"strict") == 0)) {
				1278	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1279	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1280	details);
				1281	return -1;
				1282	}
				1283	else if (strcmp(errors,"ignore") == 0) {
				1284	return 0;
				1285	}
				1286	else if (strcmp(errors,"replace") == 0) {
				1287	if (dest) {
				1288	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1289	(*dest)++;
				1290	}
				1291	return 0;
				1292	}
				1293	else {
				1294	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1295	"UTF-16 decoding error; "
				1296	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1297	errors);
				1298	return -1;
				1299	}
				1300	}
				1301
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1302	PyObject *
				1303	PyUnicode_DecodeUTF16(const char *s,
				1304	int size,
				1305	const char *errors,
				1306	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1307	{
				1308	PyUnicodeObject *unicode;
				1309	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1310	const unsigned char q, e;
				1311	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1312	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1313	/* Offsets from q for retrieving byte pairs in the right order. */
				1314	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1315	int ihi = 1, ilo = 0;
				1316	#else
				1317	int ihi = 0, ilo = 1;
				1318	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1319
				1320	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1321	if (size & 1) {
				1322	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1323	return NULL;
				1324	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1325	}
				1326
				1327	/* Note: size will always be longer than the resulting Unicode
				1328	character count */
				1329	unicode = _PyUnicode_New(size);
				1330	if (!unicode)
				1331	return NULL;
				1332	if (size == 0)
				1333	return (PyObject *)unicode;
				1334
				1335	/* Unpack UTF-16 encoded data */
				1336	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1337	q = (unsigned char *)s;
				1338	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1339
				1340	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1341	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1342
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1343	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1344	byte order setting accordingly. In native mode, the leading BOM
				1345	mark is skipped, in all other modes, it is copied to the output
				1346	stream as-is (giving a ZWNBSP character). */
				1347	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1348	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1349	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1350	if (bom == 0xFEFF) {
				1351	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1352	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1353	}
				1354	else if (bom == 0xFFFE) {
				1355	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1356	bo = 1;
				1357	}
				1358	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1359	if (bom == 0xFEFF) {
				1360	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1361	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1362	}
				1363	else if (bom == 0xFFFE) {
				1364	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1365	bo = -1;
				1366	}
				1367	#endif
				1368	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1369
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1370	if (bo == -1) {
				1371	/* force LE */
				1372	ihi = 1;
				1373	ilo = 0;
				1374	}
				1375	else if (bo == 1) {
				1376	/* force BE */
				1377	ihi = 0;
				1378	ilo = 1;
				1379	}
				1380
				1381	while (q < e) {
				1382	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1383	q += 2;
				1384
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1385	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1386	*p++ = ch;
				1387	continue;
				1388	}
				1389
				1390	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1391	if (q >= e) {
				1392	errmsg = "unexpected end of data";
				1393	goto utf16Error;
				1394	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1395	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1396	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1397	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1398	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1399	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1400	*p++ = ch;
				1401	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1402	#else
				1403	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1404	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1405	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1406	}
				1407	else {
				1408	errmsg = "illegal UTF-16 surrogate";
				1409	goto utf16Error;
				1410	}
				1411
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1412	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1413	errmsg = "illegal encoding";
				1414	/* Fall through to report the error */
				1415
				1416	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1417	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1418	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1419	}
				1420
				1421	if (byteorder)
				1422	*byteorder = bo;
				1423
				1424	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1425	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1426	goto onError;
				1427
				1428	return (PyObject *)unicode;
				1429
				1430	onError:
				1431	Py_DECREF(unicode);
				1432	return NULL;
				1433	}
				1434
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1435	PyObject *
				1436	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1437	int size,
				1438	const char *errors,
				1439	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1440	{
				1441	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1442	unsigned char *p;
				1443	int i, pairs;
				1444	/* Offsets from p for storing byte pairs in the right order. */
				1445	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1446	int ihi = 1, ilo = 0;
				1447	#else
				1448	int ihi = 0, ilo = 1;
				1449	#endif
				1450
				1451	#define STORECHAR(CH) \
				1452	do { \
				1453	p[ihi] = ((CH) >> 8) & 0xff; \
				1454	p[ilo] = (CH) & 0xff; \
				1455	p += 2; \
				1456	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1457
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1458	for (i = pairs = 0; i < size; i++)
				1459	if (s[i] >= 0x10000)
				1460	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1461	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1462	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1463	if (v == NULL)
				1464	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1465
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1466	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1467	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1468	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1469	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1470	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1471
				1472	if (byteorder == -1) {
				1473	/* force LE */
				1474	ihi = 1;
				1475	ilo = 0;
				1476	}
				1477	else if (byteorder == 1) {
				1478	/* force BE */
				1479	ihi = 0;
				1480	ilo = 1;
				1481	}
				1482
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1483	while (size-- > 0) {
				1484	Py_UNICODE ch = *s++;
				1485	Py_UNICODE ch2 = 0;
				1486	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1487	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1488	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1489	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1490	STORECHAR(ch);
				1491	if (ch2)
				1492	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1493	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1494	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1495	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1496	}
				1497
				1498	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1499	{
				1500	if (!PyUnicode_Check(unicode)) {
				1501	PyErr_BadArgument();
				1502	return NULL;
				1503	}
				1504	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1505	PyUnicode_GET_SIZE(unicode),
				1506	NULL,
				1507	0);
				1508	}
				1509
				1510	/* --- Unicode Escape Codec ----------------------------------------------- */
				1511
				1512	static
				1513	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1514	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1515	const char *errors,
				1516	const char *details)
				1517	{
				1518	if ((errors == NULL) \|\|
				1519	(strcmp(errors,"strict") == 0)) {
				1520	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1521	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1522	details);
				1523	return -1;
				1524	}
				1525	else if (strcmp(errors,"ignore") == 0) {
				1526	return 0;
				1527	}
				1528	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1529	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1530	return 0;
				1531	}
				1532	else {
				1533	PyErr_Format(PyExc_ValueError,
				1534	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1535	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1536	errors);
				1537	return -1;
				1538	}
				1539	}
				1540
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1541	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1542
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1543	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1544	int size,
				1545	const char *errors)
				1546	{
				1547	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1548	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1549	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1550	char* message;
				1551	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1552
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1553	/* Escaped strings will always be longer than the resulting
				1554	Unicode string, so we start with size here and then reduce the
				1555	length after conversion to the true value. */
				1556	v = _PyUnicode_New(size);
				1557	if (v == NULL)
				1558	goto onError;
				1559	if (size == 0)
				1560	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1561
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1562	p = buf = PyUnicode_AS_UNICODE(v);
				1563	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1564
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1565	while (s < end) {
				1566	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1567	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1568	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1569
				1570	/* Non-escape characters are interpreted as Unicode ordinals */
				1571	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1572	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1573	continue;
				1574	}
				1575
				1576	/* \ - Escapes */
				1577	s++;
				1578	switch (*s++) {
				1579
				1580	/* \x escapes */
				1581	case '\n': break;
				1582	case '\\': *p++ = '\\'; break;
				1583	case '\'': *p++ = '\''; break;
				1584	case '\"': *p++ = '\"'; break;
				1585	case 'b': *p++ = '\b'; break;
				1586	case 'f': p++ = '\014'; break; / FF */
				1587	case 't': *p++ = '\t'; break;
				1588	case 'n': *p++ = '\n'; break;
				1589	case 'r': *p++ = '\r'; break;
				1590	case 'v': p++ = '\013'; break; / VT */
				1591	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1592
				1593	/* \OOO (octal) escapes */
				1594	case '0': case '1': case '2': case '3':
				1595	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1596	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1597	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1598	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1599	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1600	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1601	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1602	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1603	break;
				1604
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1605	/* hex escapes */
				1606	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1607	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1608	digits = 2;
				1609	message = "truncated \\xXX escape";
				1610	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1611
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1612	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1613	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1614	digits = 4;
				1615	message = "truncated \\uXXXX escape";
				1616	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1617
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1618	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1619	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1620	digits = 8;
				1621	message = "truncated \\UXXXXXXXX escape";
				1622	hexescape:
				1623	chr = 0;
				1624	for (i = 0; i < digits; i++) {
				1625	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1626	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1627	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1628	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1629	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1630	i++;
				1631	break;
				1632	}
				1633	chr = (chr<<4) & ~0xF;
				1634	if (c >= '0' && c <= '9')
				1635	chr += c - '0';
				1636	else if (c >= 'a' && c <= 'f')
				1637	chr += 10 + c - 'a';
				1638	else
				1639	chr += 10 + c - 'A';
				1640	}
				1641	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1642	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1643	/* when we get here, chr is a 32-bit unicode character */
				1644	if (chr <= 0xffff)
				1645	/* UCS-2 character */
				1646	*p++ = (Py_UNICODE) chr;
				1647	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1648	/* UCS-4 character. Either store directly, or as
				1649	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1650	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1651	*p++ = chr;
				1652	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1653	chr -= 0x10000L;
				1654	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1655	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1656	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1657	} else {
				1658	if (unicodeescape_decoding_error(
				1659	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1660	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1661	)
				1662	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1663	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1664	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1665	break;
				1666
				1667	/* \N{name} */
				1668	case 'N':
				1669	message = "malformed \\N character escape";
				1670	if (ucnhash_CAPI == NULL) {
				1671	/* load the unicode data module */
				1672	PyObject m, v;
				1673	m = PyImport_ImportModule("unicodedata");
				1674	if (m == NULL)
				1675	goto ucnhashError;
				1676	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1677	Py_DECREF(m);
				1678	if (v == NULL)
				1679	goto ucnhashError;
				1680	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1681	Py_DECREF(v);
				1682	if (ucnhash_CAPI == NULL)
				1683	goto ucnhashError;
				1684	}
				1685	if (*s == '{') {
				1686	const char *start = s+1;
				1687	/* look for the closing brace */
				1688	while (*s != '}' && s < end)
				1689	s++;
				1690	if (s > start && s < end && *s == '}') {
				1691	/* found a name. look it up in the unicode database */
				1692	message = "unknown Unicode character name";
				1693	s++;
				1694	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1695	goto store;
				1696	}
				1697	}
				1698	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1699	goto onError;
				1700	*p++ = x;
				1701	break;
				1702
				1703	default:
				1704	*p++ = '\\';
				1705	*p++ = (unsigned char)s[-1];
				1706	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1707	}
				1708	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1709	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1710	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1711	return (PyObject *)v;
				1712
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1713	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1714	PyErr_SetString(
				1715	PyExc_UnicodeError,
				1716	"\\N escapes not supported (can't load unicodedata module)"
				1717	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1718	return NULL;
				1719
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1720	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1721	Py_XDECREF(v);
				1722	return NULL;
				1723	}
				1724
				1725	/* Return a Unicode-Escape string version of the Unicode object.
				1726
				1727	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1728	appropriate.
				1729
				1730	*/
				1731
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1732	static const Py_UNICODE findchar(const Py_UNICODE s,
				1733	int size,
				1734	Py_UNICODE ch);
				1735
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1736	static
				1737	PyObject unicodeescape_string(const Py_UNICODE s,
				1738	int size,
				1739	int quotes)
				1740	{
				1741	PyObject *repr;
				1742	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1743
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1744	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1745
				1746	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1747	if (repr == NULL)
				1748	return NULL;
				1749
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1750	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1751
				1752	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1753	*p++ = 'u';
				1754	*p++ = (findchar(s, size, '\'') &&
				1755	!findchar(s, size, '"')) ? '"' : '\'';
				1756	}
				1757	while (size-- > 0) {
				1758	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1759
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1760	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1761	if (quotes &&
				1762	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1763	*p++ = '\\';
				1764	*p++ = (char) ch;
Guido van Rossum	ad9744a	2001-09-21 15:38:17 +0000	[diff] [blame]	1765	continue;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1766	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1767
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1768	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1769	/* Map 21-bit characters to '\U00xxxxxx' */
				1770	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1771	int offset = p - PyString_AS_STRING(repr);
				1772
				1773	/* Resize the string if necessary */
				1774	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1775	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1776	goto onError;
				1777	p = PyString_AS_STRING(repr) + offset;
				1778	}
				1779
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1780	*p++ = '\\';
				1781	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1782	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1783	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1784	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1785	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1786	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1787	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1788	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1789	*p++ = hexdigit[ch & 0x0000000F];
				1790	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1791	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1792	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1793	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1794	else if (ch >= 0xD800 && ch < 0xDC00) {
				1795	Py_UNICODE ch2;
				1796	Py_UCS4 ucs;
				1797
				1798	ch2 = *s++;
				1799	size--;
				1800	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1801	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1802	*p++ = '\\';
				1803	*p++ = 'U';
				1804	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1805	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1806	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1807	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1808	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1809	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1810	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1811	*p++ = hexdigit[ucs & 0x0000000F];
				1812	continue;
				1813	}
				1814	/* Fall through: isolated surrogates are copied as-is */
				1815	s--;
				1816	size++;
				1817	}
				1818
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1819	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1820	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1821	*p++ = '\\';
				1822	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1823	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1824	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1825	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1826	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1827	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1828
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1829	/* Map special whitespace to '\t', \n', '\r' */
				1830	else if (ch == '\t') {
				1831	*p++ = '\\';
				1832	*p++ = 't';
				1833	}
				1834	else if (ch == '\n') {
				1835	*p++ = '\\';
				1836	*p++ = 'n';
				1837	}
				1838	else if (ch == '\r') {
				1839	*p++ = '\\';
				1840	*p++ = 'r';
				1841	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1842
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1843	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1844	else if (ch < ' ' \|\| ch >= 128) {
				1845	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1846	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1847	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1848	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1849	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1850
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1851	/* Copy everything else as-is */
				1852	else
				1853	*p++ = (char) ch;
				1854	}
				1855	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1856	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1857
				1858	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1859	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1860	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1861
				1862	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1863
				1864	onError:
				1865	Py_DECREF(repr);
				1866	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1867	}
				1868
				1869	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1870	int size)
				1871	{
				1872	return unicodeescape_string(s, size, 0);
				1873	}
				1874
				1875	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1876	{
				1877	if (!PyUnicode_Check(unicode)) {
				1878	PyErr_BadArgument();
				1879	return NULL;
				1880	}
				1881	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1882	PyUnicode_GET_SIZE(unicode));
				1883	}
				1884
				1885	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1886
				1887	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1888	int size,
				1889	const char *errors)
				1890	{
				1891	PyUnicodeObject *v;
				1892	Py_UNICODE p, buf;
				1893	const char *end;
				1894	const char *bs;
				1895
				1896	/* Escaped strings will always be longer than the resulting
				1897	Unicode string, so we start with size here and then reduce the
				1898	length after conversion to the true value. */
				1899	v = _PyUnicode_New(size);
				1900	if (v == NULL)
				1901	goto onError;
				1902	if (size == 0)
				1903	return (PyObject *)v;
				1904	p = buf = PyUnicode_AS_UNICODE(v);
				1905	end = s + size;
				1906	while (s < end) {
				1907	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1908	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1909	int i;
				1910
				1911	/* Non-escape characters are interpreted as Unicode ordinals */
				1912	if (*s != '\\') {
				1913	p++ = (unsigned char)s++;
				1914	continue;
				1915	}
				1916
				1917	/* \u-escapes are only interpreted iff the number of leading
				1918	backslashes if odd */
				1919	bs = s;
				1920	for (;s < end;) {
				1921	if (*s != '\\')
				1922	break;
				1923	p++ = (unsigned char)s++;
				1924	}
				1925	if (((s - bs) & 1) == 0 \|\|
				1926	s >= end \|\|
				1927	*s != 'u') {
				1928	continue;
				1929	}
				1930	p--;
				1931	s++;
				1932
				1933	/* \uXXXX with 4 hex digits */
				1934	for (x = 0, i = 0; i < 4; i++) {
				1935	c = (unsigned char)s[i];
				1936	if (!isxdigit(c)) {
				1937	if (unicodeescape_decoding_error(&s, &x, errors,
				1938	"truncated \\uXXXX"))
				1939	goto onError;
				1940	i++;
				1941	break;
				1942	}
				1943	x = (x<<4) & ~0xF;
				1944	if (c >= '0' && c <= '9')
				1945	x += c - '0';
				1946	else if (c >= 'a' && c <= 'f')
				1947	x += 10 + c - 'a';
				1948	else
				1949	x += 10 + c - 'A';
				1950	}
				1951	s += i;
				1952	*p++ = x;
				1953	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1954	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1955	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1956	return (PyObject *)v;
				1957
				1958	onError:
				1959	Py_XDECREF(v);
				1960	return NULL;
				1961	}
				1962
				1963	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1964	int size)
				1965	{
				1966	PyObject *repr;
				1967	char *p;
				1968	char *q;
				1969
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1970	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1971
				1972	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1973	if (repr == NULL)
				1974	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1975	if (size == 0)
				1976	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1977
				1978	p = q = PyString_AS_STRING(repr);
				1979	while (size-- > 0) {
				1980	Py_UNICODE ch = *s++;
				1981	/* Map 16-bit characters to '\uxxxx' */
				1982	if (ch >= 256) {
				1983	*p++ = '\\';
				1984	*p++ = 'u';
				1985	*p++ = hexdigit[(ch >> 12) & 0xf];
				1986	*p++ = hexdigit[(ch >> 8) & 0xf];
				1987	*p++ = hexdigit[(ch >> 4) & 0xf];
				1988	*p++ = hexdigit[ch & 15];
				1989	}
				1990	/* Copy everything else as-is */
				1991	else
				1992	*p++ = (char) ch;
				1993	}
				1994	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1995	if (_PyString_Resize(&repr, p - q))
				1996	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1997
				1998	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1999
				2000	onError:
				2001	Py_DECREF(repr);
				2002	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2003	}
				2004
				2005	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				2006	{
				2007	if (!PyUnicode_Check(unicode)) {
				2008	PyErr_BadArgument();
				2009	return NULL;
				2010	}
				2011	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				2012	PyUnicode_GET_SIZE(unicode));
				2013	}
				2014
				2015	/* --- Latin-1 Codec ------------------------------------------------------ */
				2016
				2017	PyObject PyUnicode_DecodeLatin1(const char s,
				2018	int size,
				2019	const char *errors)
				2020	{
				2021	PyUnicodeObject *v;
				2022	Py_UNICODE *p;
				2023
				2024	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2025	if (size == 1 && (unsigned char)s < 256) {
				2026	Py_UNICODE r = (unsigned char)s;
				2027	return PyUnicode_FromUnicode(&r, 1);
				2028	}
				2029
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2030	v = _PyUnicode_New(size);
				2031	if (v == NULL)
				2032	goto onError;
				2033	if (size == 0)
				2034	return (PyObject *)v;
				2035	p = PyUnicode_AS_UNICODE(v);
				2036	while (size-- > 0)
				2037	p++ = (unsigned char)s++;
				2038	return (PyObject *)v;
				2039
				2040	onError:
				2041	Py_XDECREF(v);
				2042	return NULL;
				2043	}
				2044
				2045	static
				2046	int latin1_encoding_error(const Py_UNICODE **source,
				2047	char **dest,
				2048	const char *errors,
				2049	const char *details)
				2050	{
				2051	if ((errors == NULL) \|\|
				2052	(strcmp(errors,"strict") == 0)) {
				2053	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2054	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2055	details);
				2056	return -1;
				2057	}
				2058	else if (strcmp(errors,"ignore") == 0) {
				2059	return 0;
				2060	}
				2061	else if (strcmp(errors,"replace") == 0) {
				2062	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2063	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2064	return 0;
				2065	}
				2066	else {
				2067	PyErr_Format(PyExc_ValueError,
				2068	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2069	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2070	errors);
				2071	return -1;
				2072	}
				2073	}
				2074
				2075	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2076	int size,
				2077	const char *errors)
				2078	{
				2079	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2080	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2081
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2082	repr = PyString_FromStringAndSize(NULL, size);
				2083	if (repr == NULL)
				2084	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2085	if (size == 0)
				2086	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2087
				2088	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2089	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2090	while (size-- > 0) {
				2091	Py_UNICODE ch = *p++;
				2092	if (ch >= 256) {
				2093	if (latin1_encoding_error(&p, &s, errors,
				2094	"ordinal not in range(256)"))
				2095	goto onError;
				2096	}
				2097	else
				2098	*s++ = (char)ch;
				2099	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2100	/* Resize if error handling skipped some characters */
				2101	if (s - start < PyString_GET_SIZE(repr))
				2102	if (_PyString_Resize(&repr, s - start))
				2103	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2104	return repr;
				2105
				2106	onError:
				2107	Py_DECREF(repr);
				2108	return NULL;
				2109	}
				2110
				2111	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2112	{
				2113	if (!PyUnicode_Check(unicode)) {
				2114	PyErr_BadArgument();
				2115	return NULL;
				2116	}
				2117	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2118	PyUnicode_GET_SIZE(unicode),
				2119	NULL);
				2120	}
				2121
				2122	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2123
				2124	static
				2125	int ascii_decoding_error(const char **source,
				2126	Py_UNICODE **dest,
				2127	const char *errors,
				2128	const char *details)
				2129	{
				2130	if ((errors == NULL) \|\|
				2131	(strcmp(errors,"strict") == 0)) {
				2132	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2133	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2134	details);
				2135	return -1;
				2136	}
				2137	else if (strcmp(errors,"ignore") == 0) {
				2138	return 0;
				2139	}
				2140	else if (strcmp(errors,"replace") == 0) {
				2141	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2142	(*dest)++;
				2143	return 0;
				2144	}
				2145	else {
				2146	PyErr_Format(PyExc_ValueError,
				2147	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2148	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2149	errors);
				2150	return -1;
				2151	}
				2152	}
				2153
				2154	PyObject PyUnicode_DecodeASCII(const char s,
				2155	int size,
				2156	const char *errors)
				2157	{
				2158	PyUnicodeObject *v;
				2159	Py_UNICODE *p;
				2160
				2161	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2162	if (size == 1 && (unsigned char)s < 128) {
				2163	Py_UNICODE r = (unsigned char)s;
				2164	return PyUnicode_FromUnicode(&r, 1);
				2165	}
				2166
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2167	v = _PyUnicode_New(size);
				2168	if (v == NULL)
				2169	goto onError;
				2170	if (size == 0)
				2171	return (PyObject *)v;
				2172	p = PyUnicode_AS_UNICODE(v);
				2173	while (size-- > 0) {
				2174	register unsigned char c;
				2175
				2176	c = (unsigned char)*s++;
				2177	if (c < 128)
				2178	*p++ = c;
				2179	else if (ascii_decoding_error(&s, &p, errors,
				2180	"ordinal not in range(128)"))
				2181	goto onError;
				2182	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2183	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2184	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2185	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2186	return (PyObject *)v;
				2187
				2188	onError:
				2189	Py_XDECREF(v);
				2190	return NULL;
				2191	}
				2192
				2193	static
				2194	int ascii_encoding_error(const Py_UNICODE **source,
				2195	char **dest,
				2196	const char *errors,
				2197	const char *details)
				2198	{
				2199	if ((errors == NULL) \|\|
				2200	(strcmp(errors,"strict") == 0)) {
				2201	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2202	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2203	details);
				2204	return -1;
				2205	}
				2206	else if (strcmp(errors,"ignore") == 0) {
				2207	return 0;
				2208	}
				2209	else if (strcmp(errors,"replace") == 0) {
				2210	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2211	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2212	return 0;
				2213	}
				2214	else {
				2215	PyErr_Format(PyExc_ValueError,
				2216	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2217	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2218	errors);
				2219	return -1;
				2220	}
				2221	}
				2222
				2223	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2224	int size,
				2225	const char *errors)
				2226	{
				2227	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2228	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2229
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2230	repr = PyString_FromStringAndSize(NULL, size);
				2231	if (repr == NULL)
				2232	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2233	if (size == 0)
				2234	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2235
				2236	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2237	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2238	while (size-- > 0) {
				2239	Py_UNICODE ch = *p++;
				2240	if (ch >= 128) {
				2241	if (ascii_encoding_error(&p, &s, errors,
				2242	"ordinal not in range(128)"))
				2243	goto onError;
				2244	}
				2245	else
				2246	*s++ = (char)ch;
				2247	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2248	/* Resize if error handling skipped some characters */
				2249	if (s - start < PyString_GET_SIZE(repr))
				2250	if (_PyString_Resize(&repr, s - start))
				2251	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2252	return repr;
				2253
				2254	onError:
				2255	Py_DECREF(repr);
				2256	return NULL;
				2257	}
				2258
				2259	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2260	{
				2261	if (!PyUnicode_Check(unicode)) {
				2262	PyErr_BadArgument();
				2263	return NULL;
				2264	}
				2265	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2266	PyUnicode_GET_SIZE(unicode),
				2267	NULL);
				2268	}
				2269
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2270	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2271
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2272	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2273
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2274	PyObject PyUnicode_DecodeMBCS(const char s,
				2275	int size,
				2276	const char *errors)
				2277	{
				2278	PyUnicodeObject *v;
				2279	Py_UNICODE *p;
				2280
				2281	/* First get the size of the result */
				2282	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2283	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2284	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2285
				2286	v = _PyUnicode_New(usize);
				2287	if (v == NULL)
				2288	return NULL;
				2289	if (usize == 0)
				2290	return (PyObject *)v;
				2291	p = PyUnicode_AS_UNICODE(v);
				2292	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2293	Py_DECREF(v);
				2294	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2295	}
				2296
				2297	return (PyObject *)v;
				2298	}
				2299
				2300	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2301	int size,
				2302	const char *errors)
				2303	{
				2304	PyObject *repr;
				2305	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2306	DWORD mbcssize;
				2307
				2308	/* If there are no characters, bail now! */
				2309	if (size==0)
				2310	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2311
				2312	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2313	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2314	if (mbcssize==0)
				2315	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2316
				2317	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2318	if (repr == NULL)
				2319	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2320	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2321	return repr;
				2322
				2323	/* Do the conversion */
				2324	s = PyString_AS_STRING(repr);
				2325	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2326	Py_DECREF(repr);
				2327	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2328	}
				2329	return repr;
				2330	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2331
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2332	#endif /* MS_WIN32 */
				2333
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2334	/* --- Character Mapping Codec -------------------------------------------- */
				2335
				2336	static
				2337	int charmap_decoding_error(const char **source,
				2338	Py_UNICODE **dest,
				2339	const char *errors,
				2340	const char *details)
				2341	{
				2342	if ((errors == NULL) \|\|
				2343	(strcmp(errors,"strict") == 0)) {
				2344	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2345	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2346	details);
				2347	return -1;
				2348	}
				2349	else if (strcmp(errors,"ignore") == 0) {
				2350	return 0;
				2351	}
				2352	else if (strcmp(errors,"replace") == 0) {
				2353	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2354	(*dest)++;
				2355	return 0;
				2356	}
				2357	else {
				2358	PyErr_Format(PyExc_ValueError,
				2359	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2360	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2361	errors);
				2362	return -1;
				2363	}
				2364	}
				2365
				2366	PyObject PyUnicode_DecodeCharmap(const char s,
				2367	int size,
				2368	PyObject *mapping,
				2369	const char *errors)
				2370	{
				2371	PyUnicodeObject *v;
				2372	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2373	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2374
				2375	/* Default to Latin-1 */
				2376	if (mapping == NULL)
				2377	return PyUnicode_DecodeLatin1(s, size, errors);
				2378
				2379	v = _PyUnicode_New(size);
				2380	if (v == NULL)
				2381	goto onError;
				2382	if (size == 0)
				2383	return (PyObject *)v;
				2384	p = PyUnicode_AS_UNICODE(v);
				2385	while (size-- > 0) {
				2386	unsigned char ch = *s++;
				2387	PyObject w, x;
				2388
				2389	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2390	w = PyInt_FromLong((long)ch);
				2391	if (w == NULL)
				2392	goto onError;
				2393	x = PyObject_GetItem(mapping, w);
				2394	Py_DECREF(w);
				2395	if (x == NULL) {
				2396	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2397	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2398	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2399	x = Py_None;
				2400	Py_INCREF(x);
				2401	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2402	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2403	}
				2404
				2405	/* Apply mapping */
				2406	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2407	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2408	if (value < 0 \|\| value > 65535) {
				2409	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2410	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2411	Py_DECREF(x);
				2412	goto onError;
				2413	}
				2414	*p++ = (Py_UNICODE)value;
				2415	}
				2416	else if (x == Py_None) {
				2417	/* undefined mapping */
				2418	if (charmap_decoding_error(&s, &p, errors,
				2419	"character maps to <undefined>")) {
				2420	Py_DECREF(x);
				2421	goto onError;
				2422	}
				2423	}
				2424	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2425	int targetsize = PyUnicode_GET_SIZE(x);
				2426
				2427	if (targetsize == 1)
				2428	/* 1-1 mapping */
				2429	p++ = PyUnicode_AS_UNICODE(x);
				2430
				2431	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2432	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2433	if (targetsize > extrachars) {
				2434	/* resize first */
				2435	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2436	int needed = (targetsize - extrachars) + \
				2437	(targetsize << 2);
				2438	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2439	if (_PyUnicode_Resize(&v,
				2440	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2441	Py_DECREF(x);
				2442	goto onError;
				2443	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2444	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2445	}
				2446	Py_UNICODE_COPY(p,
				2447	PyUnicode_AS_UNICODE(x),
				2448	targetsize);
				2449	p += targetsize;
				2450	extrachars -= targetsize;
				2451	}
				2452	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2453	}
				2454	else {
				2455	/* wrong return value */
				2456	PyErr_SetString(PyExc_TypeError,
				2457	"character mapping must return integer, None or unicode");
				2458	Py_DECREF(x);
				2459	goto onError;
				2460	}
				2461	Py_DECREF(x);
				2462	}
				2463	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2464	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2465	goto onError;
				2466	return (PyObject *)v;
				2467
				2468	onError:
				2469	Py_XDECREF(v);
				2470	return NULL;
				2471	}
				2472
				2473	static
				2474	int charmap_encoding_error(const Py_UNICODE **source,
				2475	char **dest,
				2476	const char *errors,
				2477	const char *details)
				2478	{
				2479	if ((errors == NULL) \|\|
				2480	(strcmp(errors,"strict") == 0)) {
				2481	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2482	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2483	details);
				2484	return -1;
				2485	}
				2486	else if (strcmp(errors,"ignore") == 0) {
				2487	return 0;
				2488	}
				2489	else if (strcmp(errors,"replace") == 0) {
				2490	**dest = '?';
				2491	(*dest)++;
				2492	return 0;
				2493	}
				2494	else {
				2495	PyErr_Format(PyExc_ValueError,
				2496	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2497	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2498	errors);
				2499	return -1;
				2500	}
				2501	}
				2502
				2503	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2504	int size,
				2505	PyObject *mapping,
				2506	const char *errors)
				2507	{
				2508	PyObject *v;
				2509	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2510	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2511
				2512	/* Default to Latin-1 */
				2513	if (mapping == NULL)
				2514	return PyUnicode_EncodeLatin1(p, size, errors);
				2515
				2516	v = PyString_FromStringAndSize(NULL, size);
				2517	if (v == NULL)
				2518	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2519	if (size == 0)
				2520	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2521	s = PyString_AS_STRING(v);
				2522	while (size-- > 0) {
				2523	Py_UNICODE ch = *p++;
				2524	PyObject w, x;
				2525
				2526	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2527	w = PyInt_FromLong((long)ch);
				2528	if (w == NULL)
				2529	goto onError;
				2530	x = PyObject_GetItem(mapping, w);
				2531	Py_DECREF(w);
				2532	if (x == NULL) {
				2533	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2534	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2535	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2536	x = Py_None;
				2537	Py_INCREF(x);
				2538	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2539	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2540	}
				2541
				2542	/* Apply mapping */
				2543	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2544	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2545	if (value < 0 \|\| value > 255) {
				2546	PyErr_SetString(PyExc_TypeError,
				2547	"character mapping must be in range(256)");
				2548	Py_DECREF(x);
				2549	goto onError;
				2550	}
				2551	*s++ = (char)value;
				2552	}
				2553	else if (x == Py_None) {
				2554	/* undefined mapping */
				2555	if (charmap_encoding_error(&p, &s, errors,
				2556	"character maps to <undefined>")) {
				2557	Py_DECREF(x);
				2558	goto onError;
				2559	}
				2560	}
				2561	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2562	int targetsize = PyString_GET_SIZE(x);
				2563
				2564	if (targetsize == 1)
				2565	/* 1-1 mapping */
				2566	s++ = PyString_AS_STRING(x);
				2567
				2568	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2569	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2570	if (targetsize > extrachars) {
				2571	/* resize first */
				2572	int oldpos = (int)(s - PyString_AS_STRING(v));
				2573	int needed = (targetsize - extrachars) + \
				2574	(targetsize << 2);
				2575	extrachars += needed;
				2576	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2577	Py_DECREF(x);
				2578	goto onError;
				2579	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2580	s = PyString_AS_STRING(v) + oldpos;
				2581	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2582	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2583	s += targetsize;
				2584	extrachars -= targetsize;
				2585	}
				2586	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2587	}
				2588	else {
				2589	/* wrong return value */
				2590	PyErr_SetString(PyExc_TypeError,
				2591	"character mapping must return integer, None or unicode");
				2592	Py_DECREF(x);
				2593	goto onError;
				2594	}
				2595	Py_DECREF(x);
				2596	}
				2597	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2598	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2599	goto onError;
				2600	return v;
				2601
				2602	onError:
				2603	Py_DECREF(v);
				2604	return NULL;
				2605	}
				2606
				2607	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2608	PyObject *mapping)
				2609	{
				2610	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2611	PyErr_BadArgument();
				2612	return NULL;
				2613	}
				2614	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2615	PyUnicode_GET_SIZE(unicode),
				2616	mapping,
				2617	NULL);
				2618	}
				2619
				2620	static
				2621	int translate_error(const Py_UNICODE **source,
				2622	Py_UNICODE **dest,
				2623	const char *errors,
				2624	const char *details)
				2625	{
				2626	if ((errors == NULL) \|\|
				2627	(strcmp(errors,"strict") == 0)) {
				2628	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2629	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2630	details);
				2631	return -1;
				2632	}
				2633	else if (strcmp(errors,"ignore") == 0) {
				2634	return 0;
				2635	}
				2636	else if (strcmp(errors,"replace") == 0) {
				2637	**dest = '?';
				2638	(*dest)++;
				2639	return 0;
				2640	}
				2641	else {
				2642	PyErr_Format(PyExc_ValueError,
				2643	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2644	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2645	errors);
				2646	return -1;
				2647	}
				2648	}
				2649
				2650	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2651	int size,
				2652	PyObject *mapping,
				2653	const char *errors)
				2654	{
				2655	PyUnicodeObject *v;
				2656	Py_UNICODE *p;
				2657
				2658	if (mapping == NULL) {
				2659	PyErr_BadArgument();
				2660	return NULL;
				2661	}
				2662
				2663	/* Output will never be longer than input */
				2664	v = _PyUnicode_New(size);
				2665	if (v == NULL)
				2666	goto onError;
				2667	if (size == 0)
				2668	goto done;
				2669	p = PyUnicode_AS_UNICODE(v);
				2670	while (size-- > 0) {
				2671	Py_UNICODE ch = *s++;
				2672	PyObject w, x;
				2673
				2674	/* Get mapping */
				2675	w = PyInt_FromLong(ch);
				2676	if (w == NULL)
				2677	goto onError;
				2678	x = PyObject_GetItem(mapping, w);
				2679	Py_DECREF(w);
				2680	if (x == NULL) {
				2681	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2682	/* No mapping found: default to 1-1 mapping */
				2683	PyErr_Clear();
				2684	*p++ = ch;
				2685	continue;
				2686	}
				2687	goto onError;
				2688	}
				2689
				2690	/* Apply mapping */
				2691	if (PyInt_Check(x))
				2692	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2693	else if (x == Py_None) {
				2694	/* undefined mapping */
				2695	if (translate_error(&s, &p, errors,
				2696	"character maps to <undefined>")) {
				2697	Py_DECREF(x);
				2698	goto onError;
				2699	}
				2700	}
				2701	else if (PyUnicode_Check(x)) {
				2702	if (PyUnicode_GET_SIZE(x) != 1) {
				2703	/* 1-n mapping */
				2704	PyErr_SetString(PyExc_NotImplementedError,
				2705	"1-n mappings are currently not implemented");
				2706	Py_DECREF(x);
				2707	goto onError;
				2708	}
				2709	p++ = PyUnicode_AS_UNICODE(x);
				2710	}
				2711	else {
				2712	/* wrong return value */
				2713	PyErr_SetString(PyExc_TypeError,
				2714	"translate mapping must return integer, None or unicode");
				2715	Py_DECREF(x);
				2716	goto onError;
				2717	}
				2718	Py_DECREF(x);
				2719	}
				2720	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2721	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2722	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2723
				2724	done:
				2725	return (PyObject *)v;
				2726
				2727	onError:
				2728	Py_XDECREF(v);
				2729	return NULL;
				2730	}
				2731
				2732	PyObject PyUnicode_Translate(PyObject str,
				2733	PyObject *mapping,
				2734	const char *errors)
				2735	{
				2736	PyObject *result;
				2737
				2738	str = PyUnicode_FromObject(str);
				2739	if (str == NULL)
				2740	goto onError;
				2741	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2742	PyUnicode_GET_SIZE(str),
				2743	mapping,
				2744	errors);
				2745	Py_DECREF(str);
				2746	return result;
				2747
				2748	onError:
				2749	Py_XDECREF(str);
				2750	return NULL;
				2751	}
				2752
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2753	/* --- Decimal Encoder ---------------------------------------------------- */
				2754
				2755	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2756	int length,
				2757	char *output,
				2758	const char *errors)
				2759	{
				2760	Py_UNICODE p, end;
				2761
				2762	if (output == NULL) {
				2763	PyErr_BadArgument();
				2764	return -1;
				2765	}
				2766
				2767	p = s;
				2768	end = s + length;
				2769	while (p < end) {
				2770	register Py_UNICODE ch = *p++;
				2771	int decimal;
				2772
				2773	if (Py_UNICODE_ISSPACE(ch)) {
				2774	*output++ = ' ';
				2775	continue;
				2776	}
				2777	decimal = Py_UNICODE_TODECIMAL(ch);
				2778	if (decimal >= 0) {
				2779	*output++ = '0' + decimal;
				2780	continue;
				2781	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2782	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2783	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2784	continue;
				2785	}
				2786	/* All other characters are considered invalid */
				2787	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2788	PyErr_SetString(PyExc_ValueError,
				2789	"invalid decimal Unicode string");
				2790	goto onError;
				2791	}
				2792	else if (strcmp(errors, "ignore") == 0)
				2793	continue;
				2794	else if (strcmp(errors, "replace") == 0) {
				2795	*output++ = '?';
				2796	continue;
				2797	}
				2798	}
				2799	/* 0-terminate the output string */
				2800	*output++ = '\0';
				2801	return 0;
				2802
				2803	onError:
				2804	return -1;
				2805	}
				2806
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2807	/* --- Helpers ------------------------------------------------------------ */
				2808
				2809	static
				2810	int count(PyUnicodeObject *self,
				2811	int start,
				2812	int end,
				2813	PyUnicodeObject *substring)
				2814	{
				2815	int count = 0;
				2816
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2817	if (start < 0)
				2818	start += self->length;
				2819	if (start < 0)
				2820	start = 0;
				2821	if (end > self->length)
				2822	end = self->length;
				2823	if (end < 0)
				2824	end += self->length;
				2825	if (end < 0)
				2826	end = 0;
				2827
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2828	if (substring->length == 0)
				2829	return (end - start + 1);
				2830
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2831	end -= substring->length;
				2832
				2833	while (start <= end)
				2834	if (Py_UNICODE_MATCH(self, start, substring)) {
				2835	count++;
				2836	start += substring->length;
				2837	} else
				2838	start++;
				2839
				2840	return count;
				2841	}
				2842
				2843	int PyUnicode_Count(PyObject *str,
				2844	PyObject *substr,
				2845	int start,
				2846	int end)
				2847	{
				2848	int result;
				2849
				2850	str = PyUnicode_FromObject(str);
				2851	if (str == NULL)
				2852	return -1;
				2853	substr = PyUnicode_FromObject(substr);
				2854	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2855	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2856	return -1;
				2857	}
				2858
				2859	result = count((PyUnicodeObject *)str,
				2860	start, end,
				2861	(PyUnicodeObject *)substr);
				2862
				2863	Py_DECREF(str);
				2864	Py_DECREF(substr);
				2865	return result;
				2866	}
				2867
				2868	static
				2869	int findstring(PyUnicodeObject *self,
				2870	PyUnicodeObject *substring,
				2871	int start,
				2872	int end,
				2873	int direction)
				2874	{
				2875	if (start < 0)
				2876	start += self->length;
				2877	if (start < 0)
				2878	start = 0;
				2879
				2880	if (substring->length == 0)
				2881	return start;
				2882
				2883	if (end > self->length)
				2884	end = self->length;
				2885	if (end < 0)
				2886	end += self->length;
				2887	if (end < 0)
				2888	end = 0;
				2889
				2890	end -= substring->length;
				2891
				2892	if (direction < 0) {
				2893	for (; end >= start; end--)
				2894	if (Py_UNICODE_MATCH(self, end, substring))
				2895	return end;
				2896	} else {
				2897	for (; start <= end; start++)
				2898	if (Py_UNICODE_MATCH(self, start, substring))
				2899	return start;
				2900	}
				2901
				2902	return -1;
				2903	}
				2904
				2905	int PyUnicode_Find(PyObject *str,
				2906	PyObject *substr,
				2907	int start,
				2908	int end,
				2909	int direction)
				2910	{
				2911	int result;
				2912
				2913	str = PyUnicode_FromObject(str);
				2914	if (str == NULL)
				2915	return -1;
				2916	substr = PyUnicode_FromObject(substr);
				2917	if (substr == NULL) {
				2918	Py_DECREF(substr);
				2919	return -1;
				2920	}
				2921
				2922	result = findstring((PyUnicodeObject *)str,
				2923	(PyUnicodeObject *)substr,
				2924	start, end, direction);
				2925	Py_DECREF(str);
				2926	Py_DECREF(substr);
				2927	return result;
				2928	}
				2929
				2930	static
				2931	int tailmatch(PyUnicodeObject *self,
				2932	PyUnicodeObject *substring,
				2933	int start,
				2934	int end,
				2935	int direction)
				2936	{
				2937	if (start < 0)
				2938	start += self->length;
				2939	if (start < 0)
				2940	start = 0;
				2941
				2942	if (substring->length == 0)
				2943	return 1;
				2944
				2945	if (end > self->length)
				2946	end = self->length;
				2947	if (end < 0)
				2948	end += self->length;
				2949	if (end < 0)
				2950	end = 0;
				2951
				2952	end -= substring->length;
				2953	if (end < start)
				2954	return 0;
				2955
				2956	if (direction > 0) {
				2957	if (Py_UNICODE_MATCH(self, end, substring))
				2958	return 1;
				2959	} else {
				2960	if (Py_UNICODE_MATCH(self, start, substring))
				2961	return 1;
				2962	}
				2963
				2964	return 0;
				2965	}
				2966
				2967	int PyUnicode_Tailmatch(PyObject *str,
				2968	PyObject *substr,
				2969	int start,
				2970	int end,
				2971	int direction)
				2972	{
				2973	int result;
				2974
				2975	str = PyUnicode_FromObject(str);
				2976	if (str == NULL)
				2977	return -1;
				2978	substr = PyUnicode_FromObject(substr);
				2979	if (substr == NULL) {
				2980	Py_DECREF(substr);
				2981	return -1;
				2982	}
				2983
				2984	result = tailmatch((PyUnicodeObject *)str,
				2985	(PyUnicodeObject *)substr,
				2986	start, end, direction);
				2987	Py_DECREF(str);
				2988	Py_DECREF(substr);
				2989	return result;
				2990	}
				2991
				2992	static
				2993	const Py_UNICODE findchar(const Py_UNICODE s,
				2994	int size,
				2995	Py_UNICODE ch)
				2996	{
				2997	/* like wcschr, but doesn't stop at NULL characters */
				2998
				2999	while (size-- > 0) {
				3000	if (*s == ch)
				3001	return s;
				3002	s++;
				3003	}
				3004
				3005	return NULL;
				3006	}
				3007
				3008	/* Apply fixfct filter to the Unicode object self and return a
				3009	reference to the modified object */
				3010
				3011	static
				3012	PyObject fixup(PyUnicodeObject self,
				3013	int (fixfct)(PyUnicodeObject s))
				3014	{
				3015
				3016	PyUnicodeObject *u;
				3017
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3018	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3019	if (u == NULL)
				3020	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3021
				3022	Py_UNICODE_COPY(u->str, self->str, self->length);
				3023
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3024	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3025	/* fixfct should return TRUE if it modified the buffer. If
				3026	FALSE, return a reference to the original buffer instead
				3027	(to save space, not time) */
				3028	Py_INCREF(self);
				3029	Py_DECREF(u);
				3030	return (PyObject*) self;
				3031	}
				3032	return (PyObject*) u;
				3033	}
				3034
				3035	static
				3036	int fixupper(PyUnicodeObject *self)
				3037	{
				3038	int len = self->length;
				3039	Py_UNICODE *s = self->str;
				3040	int status = 0;
				3041
				3042	while (len-- > 0) {
				3043	register Py_UNICODE ch;
				3044
				3045	ch = Py_UNICODE_TOUPPER(*s);
				3046	if (ch != *s) {
				3047	status = 1;
				3048	*s = ch;
				3049	}
				3050	s++;
				3051	}
				3052
				3053	return status;
				3054	}
				3055
				3056	static
				3057	int fixlower(PyUnicodeObject *self)
				3058	{
				3059	int len = self->length;
				3060	Py_UNICODE *s = self->str;
				3061	int status = 0;
				3062
				3063	while (len-- > 0) {
				3064	register Py_UNICODE ch;
				3065
				3066	ch = Py_UNICODE_TOLOWER(*s);
				3067	if (ch != *s) {
				3068	status = 1;
				3069	*s = ch;
				3070	}
				3071	s++;
				3072	}
				3073
				3074	return status;
				3075	}
				3076
				3077	static
				3078	int fixswapcase(PyUnicodeObject *self)
				3079	{
				3080	int len = self->length;
				3081	Py_UNICODE *s = self->str;
				3082	int status = 0;
				3083
				3084	while (len-- > 0) {
				3085	if (Py_UNICODE_ISUPPER(*s)) {
				3086	s = Py_UNICODE_TOLOWER(s);
				3087	status = 1;
				3088	} else if (Py_UNICODE_ISLOWER(*s)) {
				3089	s = Py_UNICODE_TOUPPER(s);
				3090	status = 1;
				3091	}
				3092	s++;
				3093	}
				3094
				3095	return status;
				3096	}
				3097
				3098	static
				3099	int fixcapitalize(PyUnicodeObject *self)
				3100	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3101	int len = self->length;
				3102	Py_UNICODE *s = self->str;
				3103	int status = 0;
				3104
				3105	if (len == 0)
				3106	return 0;
				3107	if (Py_UNICODE_ISLOWER(*s)) {
				3108	s = Py_UNICODE_TOUPPER(s);
				3109	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3110	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3111	s++;
				3112	while (--len > 0) {
				3113	if (Py_UNICODE_ISUPPER(*s)) {
				3114	s = Py_UNICODE_TOLOWER(s);
				3115	status = 1;
				3116	}
				3117	s++;
				3118	}
				3119	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3120	}
				3121
				3122	static
				3123	int fixtitle(PyUnicodeObject *self)
				3124	{
				3125	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3126	register Py_UNICODE *e;
				3127	int previous_is_cased;
				3128
				3129	/* Shortcut for single character strings */
				3130	if (PyUnicode_GET_SIZE(self) == 1) {
				3131	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3132	if (*p != ch) {
				3133	*p = ch;
				3134	return 1;
				3135	}
				3136	else
				3137	return 0;
				3138	}
				3139
				3140	e = p + PyUnicode_GET_SIZE(self);
				3141	previous_is_cased = 0;
				3142	for (; p < e; p++) {
				3143	register const Py_UNICODE ch = *p;
				3144
				3145	if (previous_is_cased)
				3146	*p = Py_UNICODE_TOLOWER(ch);
				3147	else
				3148	*p = Py_UNICODE_TOTITLE(ch);
				3149
				3150	if (Py_UNICODE_ISLOWER(ch) \|\|
				3151	Py_UNICODE_ISUPPER(ch) \|\|
				3152	Py_UNICODE_ISTITLE(ch))
				3153	previous_is_cased = 1;
				3154	else
				3155	previous_is_cased = 0;
				3156	}
				3157	return 1;
				3158	}
				3159
				3160	PyObject PyUnicode_Join(PyObject separator,
				3161	PyObject *seq)
				3162	{
				3163	Py_UNICODE *sep;
				3164	int seplen;
				3165	PyUnicodeObject *res = NULL;
				3166	int reslen = 0;
				3167	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3168	int sz = 100;
				3169	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3170	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3171
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3172	it = PyObject_GetIter(seq);
				3173	if (it == NULL)
				3174	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3175
				3176	if (separator == NULL) {
				3177	Py_UNICODE blank = ' ';
				3178	sep = &blank;
				3179	seplen = 1;
				3180	}
				3181	else {
				3182	separator = PyUnicode_FromObject(separator);
				3183	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3184	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3185	sep = PyUnicode_AS_UNICODE(separator);
				3186	seplen = PyUnicode_GET_SIZE(separator);
				3187	}
				3188
				3189	res = _PyUnicode_New(sz);
				3190	if (res == NULL)
				3191	goto onError;
				3192	p = PyUnicode_AS_UNICODE(res);
				3193	reslen = 0;
				3194
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3195	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3196	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3197	PyObject *item = PyIter_Next(it);
				3198	if (item == NULL) {
				3199	if (PyErr_Occurred())
				3200	goto onError;
				3201	break;
				3202	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3203	if (!PyUnicode_Check(item)) {
				3204	PyObject *v;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3205	if (!PyString_Check(item)) {
				3206	PyErr_Format(PyExc_TypeError,
				3207	"sequence item %i: expected string or Unicode,"
				3208	" %.80s found",
				3209	i, item->ob_type->tp_name);
				3210	Py_DECREF(item);
				3211	goto onError;
				3212	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3213	v = PyUnicode_FromObject(item);
				3214	Py_DECREF(item);
				3215	item = v;
				3216	if (item == NULL)
				3217	goto onError;
				3218	}
				3219	itemlen = PyUnicode_GET_SIZE(item);
				3220	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3221	if (_PyUnicode_Resize(&res, sz*2)) {
				3222	Py_DECREF(item);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3223	goto onError;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3224	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3225	sz *= 2;
				3226	p = PyUnicode_AS_UNICODE(res) + reslen;
				3227	}
				3228	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3229	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3230	p += seplen;
				3231	reslen += seplen;
				3232	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3233	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3234	p += itemlen;
				3235	reslen += itemlen;
				3236	Py_DECREF(item);
				3237	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3238	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3239	goto onError;
				3240
				3241	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3242	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3243	return (PyObject *)res;
				3244
				3245	onError:
				3246	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3247	Py_XDECREF(res);
				3248	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3249	return NULL;
				3250	}
				3251
				3252	static
				3253	PyUnicodeObject pad(PyUnicodeObject self,
				3254	int left,
				3255	int right,
				3256	Py_UNICODE fill)
				3257	{
				3258	PyUnicodeObject *u;
				3259
				3260	if (left < 0)
				3261	left = 0;
				3262	if (right < 0)
				3263	right = 0;
				3264
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3265	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3266	Py_INCREF(self);
				3267	return self;
				3268	}
				3269
				3270	u = _PyUnicode_New(left + self->length + right);
				3271	if (u) {
				3272	if (left)
				3273	Py_UNICODE_FILL(u->str, fill, left);
				3274	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3275	if (right)
				3276	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3277	}
				3278
				3279	return u;
				3280	}
				3281
				3282	#define SPLIT_APPEND(data, left, right) \
				3283	str = PyUnicode_FromUnicode(data + left, right - left); \
				3284	if (!str) \
				3285	goto onError; \
				3286	if (PyList_Append(list, str)) { \
				3287	Py_DECREF(str); \
				3288	goto onError; \
				3289	} \
				3290	else \
				3291	Py_DECREF(str);
				3292
				3293	static
				3294	PyObject split_whitespace(PyUnicodeObject self,
				3295	PyObject *list,
				3296	int maxcount)
				3297	{
				3298	register int i;
				3299	register int j;
				3300	int len = self->length;
				3301	PyObject *str;
				3302
				3303	for (i = j = 0; i < len; ) {
				3304	/* find a token */
				3305	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3306	i++;
				3307	j = i;
				3308	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3309	i++;
				3310	if (j < i) {
				3311	if (maxcount-- <= 0)
				3312	break;
				3313	SPLIT_APPEND(self->str, j, i);
				3314	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3315	i++;
				3316	j = i;
				3317	}
				3318	}
				3319	if (j < len) {
				3320	SPLIT_APPEND(self->str, j, len);
				3321	}
				3322	return list;
				3323
				3324	onError:
				3325	Py_DECREF(list);
				3326	return NULL;
				3327	}
				3328
				3329	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3330	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3331	{
				3332	register int i;
				3333	register int j;
				3334	int len;
				3335	PyObject *list;
				3336	PyObject *str;
				3337	Py_UNICODE *data;
				3338
				3339	string = PyUnicode_FromObject(string);
				3340	if (string == NULL)
				3341	return NULL;
				3342	data = PyUnicode_AS_UNICODE(string);
				3343	len = PyUnicode_GET_SIZE(string);
				3344
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3345	list = PyList_New(0);
				3346	if (!list)
				3347	goto onError;
				3348
				3349	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3350	int eol;
				3351
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3352	/* Find a line and append it */
				3353	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3354	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3355
				3356	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3357	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3358	if (i < len) {
				3359	if (data[i] == '\r' && i + 1 < len &&
				3360	data[i+1] == '\n')
				3361	i += 2;
				3362	else
				3363	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3364	if (keepends)
				3365	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3366	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3367	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3368	j = i;
				3369	}
				3370	if (j < len) {
				3371	SPLIT_APPEND(data, j, len);
				3372	}
				3373
				3374	Py_DECREF(string);
				3375	return list;
				3376
				3377	onError:
				3378	Py_DECREF(list);
				3379	Py_DECREF(string);
				3380	return NULL;
				3381	}
				3382
				3383	static
				3384	PyObject split_char(PyUnicodeObject self,
				3385	PyObject *list,
				3386	Py_UNICODE ch,
				3387	int maxcount)
				3388	{
				3389	register int i;
				3390	register int j;
				3391	int len = self->length;
				3392	PyObject *str;
				3393
				3394	for (i = j = 0; i < len; ) {
				3395	if (self->str[i] == ch) {
				3396	if (maxcount-- <= 0)
				3397	break;
				3398	SPLIT_APPEND(self->str, j, i);
				3399	i = j = i + 1;
				3400	} else
				3401	i++;
				3402	}
				3403	if (j <= len) {
				3404	SPLIT_APPEND(self->str, j, len);
				3405	}
				3406	return list;
				3407
				3408	onError:
				3409	Py_DECREF(list);
				3410	return NULL;
				3411	}
				3412
				3413	static
				3414	PyObject split_substring(PyUnicodeObject self,
				3415	PyObject *list,
				3416	PyUnicodeObject *substring,
				3417	int maxcount)
				3418	{
				3419	register int i;
				3420	register int j;
				3421	int len = self->length;
				3422	int sublen = substring->length;
				3423	PyObject *str;
				3424
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3425	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3426	if (Py_UNICODE_MATCH(self, i, substring)) {
				3427	if (maxcount-- <= 0)
				3428	break;
				3429	SPLIT_APPEND(self->str, j, i);
				3430	i = j = i + sublen;
				3431	} else
				3432	i++;
				3433	}
				3434	if (j <= len) {
				3435	SPLIT_APPEND(self->str, j, len);
				3436	}
				3437	return list;
				3438
				3439	onError:
				3440	Py_DECREF(list);
				3441	return NULL;
				3442	}
				3443
				3444	#undef SPLIT_APPEND
				3445
				3446	static
				3447	PyObject split(PyUnicodeObject self,
				3448	PyUnicodeObject *substring,
				3449	int maxcount)
				3450	{
				3451	PyObject *list;
				3452
				3453	if (maxcount < 0)
				3454	maxcount = INT_MAX;
				3455
				3456	list = PyList_New(0);
				3457	if (!list)
				3458	return NULL;
				3459
				3460	if (substring == NULL)
				3461	return split_whitespace(self,list,maxcount);
				3462
				3463	else if (substring->length == 1)
				3464	return split_char(self,list,substring->str[0],maxcount);
				3465
				3466	else if (substring->length == 0) {
				3467	Py_DECREF(list);
				3468	PyErr_SetString(PyExc_ValueError, "empty separator");
				3469	return NULL;
				3470	}
				3471	else
				3472	return split_substring(self,list,substring,maxcount);
				3473	}
				3474
				3475	static
				3476	PyObject strip(PyUnicodeObject self,
				3477	int left,
				3478	int right)
				3479	{
				3480	Py_UNICODE *p = self->str;
				3481	int start = 0;
				3482	int end = self->length;
				3483
				3484	if (left)
				3485	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3486	start++;
				3487
				3488	if (right)
				3489	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3490	end--;
				3491
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3492	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3493	/* couldn't strip anything off, return original string */
				3494	Py_INCREF(self);
				3495	return (PyObject*) self;
				3496	}
				3497
				3498	return (PyObject*) PyUnicode_FromUnicode(
				3499	self->str + start,
				3500	end - start
				3501	);
				3502	}
				3503
				3504	static
				3505	PyObject replace(PyUnicodeObject self,
				3506	PyUnicodeObject *str1,
				3507	PyUnicodeObject *str2,
				3508	int maxcount)
				3509	{
				3510	PyUnicodeObject *u;
				3511
				3512	if (maxcount < 0)
				3513	maxcount = INT_MAX;
				3514
				3515	if (str1->length == 1 && str2->length == 1) {
				3516	int i;
				3517
				3518	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3519	if (!findchar(self->str, self->length, str1->str[0]) &&
				3520	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3521	/* nothing to replace, return original string */
				3522	Py_INCREF(self);
				3523	u = self;
				3524	} else {
				3525	Py_UNICODE u1 = str1->str[0];
				3526	Py_UNICODE u2 = str2->str[0];
				3527
				3528	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3529	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3530	self->length
				3531	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3532	if (u != NULL) {
				3533	Py_UNICODE_COPY(u->str, self->str,
				3534	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3535	for (i = 0; i < u->length; i++)
				3536	if (u->str[i] == u1) {
				3537	if (--maxcount < 0)
				3538	break;
				3539	u->str[i] = u2;
				3540	}
				3541	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3542	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3543
				3544	} else {
				3545	int n, i;
				3546	Py_UNICODE *p;
				3547
				3548	/* replace strings */
				3549	n = count(self, 0, self->length, str1);
				3550	if (n > maxcount)
				3551	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3552	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3553	/* nothing to replace, return original string */
				3554	Py_INCREF(self);
				3555	u = self;
				3556	} else {
				3557	u = _PyUnicode_New(
				3558	self->length + n * (str2->length - str1->length));
				3559	if (u) {
				3560	i = 0;
				3561	p = u->str;
				3562	while (i <= self->length - str1->length)
				3563	if (Py_UNICODE_MATCH(self, i, str1)) {
				3564	/* replace string segment */
				3565	Py_UNICODE_COPY(p, str2->str, str2->length);
				3566	p += str2->length;
				3567	i += str1->length;
				3568	if (--n <= 0) {
				3569	/* copy remaining part */
				3570	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3571	break;
				3572	}
				3573	} else
				3574	*p++ = self->str[i++];
				3575	}
				3576	}
				3577	}
				3578
				3579	return (PyObject *) u;
				3580	}
				3581
				3582	/* --- Unicode Object Methods --------------------------------------------- */
				3583
				3584	static char title__doc__[] =
				3585	"S.title() -> unicode\n\
				3586	\n\
				3587	Return a titlecased version of S, i.e. words start with title case\n\
				3588	characters, all remaining cased characters have lower case.";
				3589
				3590	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3591	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3592	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3593	return fixup(self, fixtitle);
				3594	}
				3595
				3596	static char capitalize__doc__[] =
				3597	"S.capitalize() -> unicode\n\
				3598	\n\
				3599	Return a capitalized version of S, i.e. make the first character\n\
				3600	have upper case.";
				3601
				3602	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3603	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3604	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3605	return fixup(self, fixcapitalize);
				3606	}
				3607
				3608	#if 0
				3609	static char capwords__doc__[] =
				3610	"S.capwords() -> unicode\n\
				3611	\n\
				3612	Apply .capitalize() to all words in S and return the result with\n\
				3613	normalized whitespace (all whitespace strings are replaced by ' ').";
				3614
				3615	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3616	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3617	{
				3618	PyObject *list;
				3619	PyObject *item;
				3620	int i;
				3621
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3622	/* Split into words */
				3623	list = split(self, NULL, -1);
				3624	if (!list)
				3625	return NULL;
				3626
				3627	/* Capitalize each word */
				3628	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3629	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3630	fixcapitalize);
				3631	if (item == NULL)
				3632	goto onError;
				3633	Py_DECREF(PyList_GET_ITEM(list, i));
				3634	PyList_SET_ITEM(list, i, item);
				3635	}
				3636
				3637	/* Join the words to form a new string */
				3638	item = PyUnicode_Join(NULL, list);
				3639
				3640	onError:
				3641	Py_DECREF(list);
				3642	return (PyObject *)item;
				3643	}
				3644	#endif
				3645
				3646	static char center__doc__[] =
				3647	"S.center(width) -> unicode\n\
				3648	\n\
				3649	Return S centered in a Unicode string of length width. Padding is done\n\
				3650	using spaces.";
				3651
				3652	static PyObject *
				3653	unicode_center(PyUnicodeObject self, PyObject args)
				3654	{
				3655	int marg, left;
				3656	int width;
				3657
				3658	if (!PyArg_ParseTuple(args, "i:center", &width))
				3659	return NULL;
				3660
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3661	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3662	Py_INCREF(self);
				3663	return (PyObject*) self;
				3664	}
				3665
				3666	marg = width - self->length;
				3667	left = marg / 2 + (marg & width & 1);
				3668
				3669	return (PyObject*) pad(self, left, marg - left, ' ');
				3670	}
				3671
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3672	#if 0
				3673
				3674	/* This code should go into some future Unicode collation support
				3675	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3676	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3677
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3678	/* speedy UTF-16 code point order comparison */
				3679	/* gleaned from: */
				3680	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3681
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3682	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3683	{
				3684	0, 0, 0, 0, 0, 0, 0, 0,
				3685	0, 0, 0, 0, 0, 0, 0, 0,
				3686	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3687	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3688	};
				3689
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3690	static int
				3691	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3692	{
				3693	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3694
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3695	Py_UNICODE *s1 = str1->str;
				3696	Py_UNICODE *s2 = str2->str;
				3697
				3698	len1 = str1->length;
				3699	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3700
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3701	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3702	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3703
				3704	c1 = *s1++;
				3705	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3706
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3707	if (c1 > (1<<11) * 26)
				3708	c1 += utf16Fixup[c1>>11];
				3709	if (c2 > (1<<11) * 26)
				3710	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3711	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3712
				3713	if (c1 != c2)
				3714	return (c1 < c2) ? -1 : 1;
				3715
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3716	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3717	}
				3718
				3719	return (len1 < len2) ? -1 : (len1 != len2);
				3720	}
				3721
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3722	#else
				3723
				3724	static int
				3725	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3726	{
				3727	register int len1, len2;
				3728
				3729	Py_UNICODE *s1 = str1->str;
				3730	Py_UNICODE *s2 = str2->str;
				3731
				3732	len1 = str1->length;
				3733	len2 = str2->length;
				3734
				3735	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3736	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3737
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3738	c1 = *s1++;
				3739	c2 = *s2++;
				3740
				3741	if (c1 != c2)
				3742	return (c1 < c2) ? -1 : 1;
				3743
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3744	len1--; len2--;
				3745	}
				3746
				3747	return (len1 < len2) ? -1 : (len1 != len2);
				3748	}
				3749
				3750	#endif
				3751
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3752	int PyUnicode_Compare(PyObject *left,
				3753	PyObject *right)
				3754	{
				3755	PyUnicodeObject u = NULL, v = NULL;
				3756	int result;
				3757
				3758	/* Coerce the two arguments */
				3759	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3760	if (u == NULL)
				3761	goto onError;
				3762	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3763	if (v == NULL)
				3764	goto onError;
				3765
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3766	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3767	if (v == u) {
				3768	Py_DECREF(u);
				3769	Py_DECREF(v);
				3770	return 0;
				3771	}
				3772
				3773	result = unicode_compare(u, v);
				3774
				3775	Py_DECREF(u);
				3776	Py_DECREF(v);
				3777	return result;
				3778
				3779	onError:
				3780	Py_XDECREF(u);
				3781	Py_XDECREF(v);
				3782	return -1;
				3783	}
				3784
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3785	int PyUnicode_Contains(PyObject *container,
				3786	PyObject *element)
				3787	{
				3788	PyUnicodeObject u = NULL, v = NULL;
				3789	int result;
				3790	register const Py_UNICODE p, e;
				3791	register Py_UNICODE ch;
				3792
				3793	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3794	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3795	if (v == NULL) {
				3796	PyErr_SetString(PyExc_TypeError,
				3797	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3798	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3799	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3800	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3801	if (u == NULL) {
				3802	Py_DECREF(v);
				3803	goto onError;
				3804	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3805
				3806	/* Check v in u */
				3807	if (PyUnicode_GET_SIZE(v) != 1) {
				3808	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3809	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3810	goto onError;
				3811	}
				3812	ch = *PyUnicode_AS_UNICODE(v);
				3813	p = PyUnicode_AS_UNICODE(u);
				3814	e = p + PyUnicode_GET_SIZE(u);
				3815	result = 0;
				3816	while (p < e) {
				3817	if (*p++ == ch) {
				3818	result = 1;
				3819	break;
				3820	}
				3821	}
				3822
				3823	Py_DECREF(u);
				3824	Py_DECREF(v);
				3825	return result;
				3826
				3827	onError:
				3828	Py_XDECREF(u);
				3829	Py_XDECREF(v);
				3830	return -1;
				3831	}
				3832
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3833	/* Concat to string or Unicode object giving a new Unicode object. */
				3834
				3835	PyObject PyUnicode_Concat(PyObject left,
				3836	PyObject *right)
				3837	{
				3838	PyUnicodeObject u = NULL, v = NULL, *w;
				3839
				3840	/* Coerce the two arguments */
				3841	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3842	if (u == NULL)
				3843	goto onError;
				3844	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3845	if (v == NULL)
				3846	goto onError;
				3847
				3848	/* Shortcuts */
				3849	if (v == unicode_empty) {
				3850	Py_DECREF(v);
				3851	return (PyObject *)u;
				3852	}
				3853	if (u == unicode_empty) {
				3854	Py_DECREF(u);
				3855	return (PyObject *)v;
				3856	}
				3857
				3858	/* Concat the two Unicode strings */
				3859	w = _PyUnicode_New(u->length + v->length);
				3860	if (w == NULL)
				3861	goto onError;
				3862	Py_UNICODE_COPY(w->str, u->str, u->length);
				3863	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3864
				3865	Py_DECREF(u);
				3866	Py_DECREF(v);
				3867	return (PyObject *)w;
				3868
				3869	onError:
				3870	Py_XDECREF(u);
				3871	Py_XDECREF(v);
				3872	return NULL;
				3873	}
				3874
				3875	static char count__doc__[] =
				3876	"S.count(sub[, start[, end]]) -> int\n\
				3877	\n\
				3878	Return the number of occurrences of substring sub in Unicode string\n\
				3879	S[start:end]. Optional arguments start and end are\n\
				3880	interpreted as in slice notation.";
				3881
				3882	static PyObject *
				3883	unicode_count(PyUnicodeObject self, PyObject args)
				3884	{
				3885	PyUnicodeObject *substring;
				3886	int start = 0;
				3887	int end = INT_MAX;
				3888	PyObject *result;
				3889
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3890	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3891	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3892	return NULL;
				3893
				3894	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3895	(PyObject *)substring);
				3896	if (substring == NULL)
				3897	return NULL;
				3898
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3899	if (start < 0)
				3900	start += self->length;
				3901	if (start < 0)
				3902	start = 0;
				3903	if (end > self->length)
				3904	end = self->length;
				3905	if (end < 0)
				3906	end += self->length;
				3907	if (end < 0)
				3908	end = 0;
				3909
				3910	result = PyInt_FromLong((long) count(self, start, end, substring));
				3911
				3912	Py_DECREF(substring);
				3913	return result;
				3914	}
				3915
				3916	static char encode__doc__[] =
				3917	"S.encode([encoding[,errors]]) -> string\n\
				3918	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3919	Return an encoded string version of S. Default encoding is the current\n\
				3920	default string encoding. errors may be given to set a different error\n\
				3921	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3922	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3923
				3924	static PyObject *
				3925	unicode_encode(PyUnicodeObject self, PyObject args)
				3926	{
				3927	char *encoding = NULL;
				3928	char *errors = NULL;
				3929	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3930	return NULL;
				3931	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3932	}
				3933
				3934	static char expandtabs__doc__[] =
				3935	"S.expandtabs([tabsize]) -> unicode\n\
				3936	\n\
				3937	Return a copy of S where all tab characters are expanded using spaces.\n\
				3938	If tabsize is not given, a tab size of 8 characters is assumed.";
				3939
				3940	static PyObject*
				3941	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3942	{
				3943	Py_UNICODE *e;
				3944	Py_UNICODE *p;
				3945	Py_UNICODE *q;
				3946	int i, j;
				3947	PyUnicodeObject *u;
				3948	int tabsize = 8;
				3949
				3950	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3951	return NULL;
				3952
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3953	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3954	i = j = 0;
				3955	e = self->str + self->length;
				3956	for (p = self->str; p < e; p++)
				3957	if (*p == '\t') {
				3958	if (tabsize > 0)
				3959	j += tabsize - (j % tabsize);
				3960	}
				3961	else {
				3962	j++;
				3963	if (p == '\n' \|\| p == '\r') {
				3964	i += j;
				3965	j = 0;
				3966	}
				3967	}
				3968
				3969	/* Second pass: create output string and fill it */
				3970	u = _PyUnicode_New(i + j);
				3971	if (!u)
				3972	return NULL;
				3973
				3974	j = 0;
				3975	q = u->str;
				3976
				3977	for (p = self->str; p < e; p++)
				3978	if (*p == '\t') {
				3979	if (tabsize > 0) {
				3980	i = tabsize - (j % tabsize);
				3981	j += i;
				3982	while (i--)
				3983	*q++ = ' ';
				3984	}
				3985	}
				3986	else {
				3987	j++;
				3988	q++ = p;
				3989	if (p == '\n' \|\| p == '\r')
				3990	j = 0;
				3991	}
				3992
				3993	return (PyObject*) u;
				3994	}
				3995
				3996	static char find__doc__[] =
				3997	"S.find(sub [,start [,end]]) -> int\n\
				3998	\n\
				3999	Return the lowest index in S where substring sub is found,\n\
				4000	such that sub is contained within s[start,end]. Optional\n\
				4001	arguments start and end are interpreted as in slice notation.\n\
				4002	\n\
				4003	Return -1 on failure.";
				4004
				4005	static PyObject *
				4006	unicode_find(PyUnicodeObject self, PyObject args)
				4007	{
				4008	PyUnicodeObject *substring;
				4009	int start = 0;
				4010	int end = INT_MAX;
				4011	PyObject *result;
				4012
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4013	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				4014	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4015	return NULL;
				4016	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4017	(PyObject *)substring);
				4018	if (substring == NULL)
				4019	return NULL;
				4020
				4021	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				4022
				4023	Py_DECREF(substring);
				4024	return result;
				4025	}
				4026
				4027	static PyObject *
				4028	unicode_getitem(PyUnicodeObject *self, int index)
				4029	{
				4030	if (index < 0 \|\| index >= self->length) {
				4031	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4032	return NULL;
				4033	}
				4034
				4035	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4036	}
				4037
				4038	static long
				4039	unicode_hash(PyUnicodeObject *self)
				4040	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4041	/* Since Unicode objects compare equal to their ASCII string
				4042	counterparts, they should use the individual character values
				4043	as basis for their hash value. This is needed to assure that
				4044	strings and Unicode objects behave in the same way as
				4045	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4046
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4047	register int len;
				4048	register Py_UNICODE *p;
				4049	register long x;
				4050
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4051	if (self->hash != -1)
				4052	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4053	len = PyUnicode_GET_SIZE(self);
				4054	p = PyUnicode_AS_UNICODE(self);
				4055	x = *p << 7;
				4056	while (--len >= 0)
				4057	x = (1000003x) ^ p++;
				4058	x ^= PyUnicode_GET_SIZE(self);
				4059	if (x == -1)
				4060	x = -2;
				4061	self->hash = x;
				4062	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4063	}
				4064
				4065	static char index__doc__[] =
				4066	"S.index(sub [,start [,end]]) -> int\n\
				4067	\n\
				4068	Like S.find() but raise ValueError when the substring is not found.";
				4069
				4070	static PyObject *
				4071	unicode_index(PyUnicodeObject self, PyObject args)
				4072	{
				4073	int result;
				4074	PyUnicodeObject *substring;
				4075	int start = 0;
				4076	int end = INT_MAX;
				4077
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4078	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				4079	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4080	return NULL;
				4081
				4082	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4083	(PyObject *)substring);
				4084	if (substring == NULL)
				4085	return NULL;
				4086
				4087	result = findstring(self, substring, start, end, 1);
				4088
				4089	Py_DECREF(substring);
				4090	if (result < 0) {
				4091	PyErr_SetString(PyExc_ValueError, "substring not found");
				4092	return NULL;
				4093	}
				4094	return PyInt_FromLong(result);
				4095	}
				4096
				4097	static char islower__doc__[] =
				4098	"S.islower() -> int\n\
				4099	\n\
				4100	Return 1 if all cased characters in S are lowercase and there is\n\
				4101	at least one cased character in S, 0 otherwise.";
				4102
				4103	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4104	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4105	{
				4106	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4107	register const Py_UNICODE *e;
				4108	int cased;
				4109
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4110	/* Shortcut for single character strings */
				4111	if (PyUnicode_GET_SIZE(self) == 1)
				4112	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				4113
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4114	/* Special case for empty strings */
				4115	if (PyString_GET_SIZE(self) == 0)
				4116	return PyInt_FromLong(0);
				4117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4118	e = p + PyUnicode_GET_SIZE(self);
				4119	cased = 0;
				4120	for (; p < e; p++) {
				4121	register const Py_UNICODE ch = *p;
				4122
				4123	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4124	return PyInt_FromLong(0);
				4125	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4126	cased = 1;
				4127	}
				4128	return PyInt_FromLong(cased);
				4129	}
				4130
				4131	static char isupper__doc__[] =
				4132	"S.isupper() -> int\n\
				4133	\n\
				4134	Return 1 if all cased characters in S are uppercase and there is\n\
				4135	at least one cased character in S, 0 otherwise.";
				4136
				4137	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4138	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4139	{
				4140	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4141	register const Py_UNICODE *e;
				4142	int cased;
				4143
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4144	/* Shortcut for single character strings */
				4145	if (PyUnicode_GET_SIZE(self) == 1)
				4146	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				4147
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4148	/* Special case for empty strings */
				4149	if (PyString_GET_SIZE(self) == 0)
				4150	return PyInt_FromLong(0);
				4151
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4152	e = p + PyUnicode_GET_SIZE(self);
				4153	cased = 0;
				4154	for (; p < e; p++) {
				4155	register const Py_UNICODE ch = *p;
				4156
				4157	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4158	return PyInt_FromLong(0);
				4159	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4160	cased = 1;
				4161	}
				4162	return PyInt_FromLong(cased);
				4163	}
				4164
				4165	static char istitle__doc__[] =
				4166	"S.istitle() -> int\n\
				4167	\n\
				4168	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				4169	may only follow uncased characters and lowercase characters only cased\n\
				4170	ones. Return 0 otherwise.";
				4171
				4172	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4173	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4174	{
				4175	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4176	register const Py_UNICODE *e;
				4177	int cased, previous_is_cased;
				4178
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4179	/* Shortcut for single character strings */
				4180	if (PyUnicode_GET_SIZE(self) == 1)
				4181	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4182	(Py_UNICODE_ISUPPER(*p) != 0));
				4183
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4184	/* Special case for empty strings */
				4185	if (PyString_GET_SIZE(self) == 0)
				4186	return PyInt_FromLong(0);
				4187
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4188	e = p + PyUnicode_GET_SIZE(self);
				4189	cased = 0;
				4190	previous_is_cased = 0;
				4191	for (; p < e; p++) {
				4192	register const Py_UNICODE ch = *p;
				4193
				4194	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4195	if (previous_is_cased)
				4196	return PyInt_FromLong(0);
				4197	previous_is_cased = 1;
				4198	cased = 1;
				4199	}
				4200	else if (Py_UNICODE_ISLOWER(ch)) {
				4201	if (!previous_is_cased)
				4202	return PyInt_FromLong(0);
				4203	previous_is_cased = 1;
				4204	cased = 1;
				4205	}
				4206	else
				4207	previous_is_cased = 0;
				4208	}
				4209	return PyInt_FromLong(cased);
				4210	}
				4211
				4212	static char isspace__doc__[] =
				4213	"S.isspace() -> int\n\
				4214	\n\
				4215	Return 1 if there are only whitespace characters in S,\n\
				4216	0 otherwise.";
				4217
				4218	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4219	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4220	{
				4221	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4222	register const Py_UNICODE *e;
				4223
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4224	/* Shortcut for single character strings */
				4225	if (PyUnicode_GET_SIZE(self) == 1 &&
				4226	Py_UNICODE_ISSPACE(*p))
				4227	return PyInt_FromLong(1);
				4228
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4229	/* Special case for empty strings */
				4230	if (PyString_GET_SIZE(self) == 0)
				4231	return PyInt_FromLong(0);
				4232
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4233	e = p + PyUnicode_GET_SIZE(self);
				4234	for (; p < e; p++) {
				4235	if (!Py_UNICODE_ISSPACE(*p))
				4236	return PyInt_FromLong(0);
				4237	}
				4238	return PyInt_FromLong(1);
				4239	}
				4240
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4241	static char isalpha__doc__[] =
				4242	"S.isalpha() -> int\n\
				4243	\n\
				4244	Return 1 if all characters in S are alphabetic\n\
				4245	and there is at least one character in S, 0 otherwise.";
				4246
				4247	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4248	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4249	{
				4250	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4251	register const Py_UNICODE *e;
				4252
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4253	/* Shortcut for single character strings */
				4254	if (PyUnicode_GET_SIZE(self) == 1 &&
				4255	Py_UNICODE_ISALPHA(*p))
				4256	return PyInt_FromLong(1);
				4257
				4258	/* Special case for empty strings */
				4259	if (PyString_GET_SIZE(self) == 0)
				4260	return PyInt_FromLong(0);
				4261
				4262	e = p + PyUnicode_GET_SIZE(self);
				4263	for (; p < e; p++) {
				4264	if (!Py_UNICODE_ISALPHA(*p))
				4265	return PyInt_FromLong(0);
				4266	}
				4267	return PyInt_FromLong(1);
				4268	}
				4269
				4270	static char isalnum__doc__[] =
				4271	"S.isalnum() -> int\n\
				4272	\n\
				4273	Return 1 if all characters in S are alphanumeric\n\
				4274	and there is at least one character in S, 0 otherwise.";
				4275
				4276	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4277	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4278	{
				4279	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4280	register const Py_UNICODE *e;
				4281
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4282	/* Shortcut for single character strings */
				4283	if (PyUnicode_GET_SIZE(self) == 1 &&
				4284	Py_UNICODE_ISALNUM(*p))
				4285	return PyInt_FromLong(1);
				4286
				4287	/* Special case for empty strings */
				4288	if (PyString_GET_SIZE(self) == 0)
				4289	return PyInt_FromLong(0);
				4290
				4291	e = p + PyUnicode_GET_SIZE(self);
				4292	for (; p < e; p++) {
				4293	if (!Py_UNICODE_ISALNUM(*p))
				4294	return PyInt_FromLong(0);
				4295	}
				4296	return PyInt_FromLong(1);
				4297	}
				4298
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4299	static char isdecimal__doc__[] =
				4300	"S.isdecimal() -> int\n\
				4301	\n\
				4302	Return 1 if there are only decimal characters in S,\n\
				4303	0 otherwise.";
				4304
				4305	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4306	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4307	{
				4308	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4309	register const Py_UNICODE *e;
				4310
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4311	/* Shortcut for single character strings */
				4312	if (PyUnicode_GET_SIZE(self) == 1 &&
				4313	Py_UNICODE_ISDECIMAL(*p))
				4314	return PyInt_FromLong(1);
				4315
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4316	/* Special case for empty strings */
				4317	if (PyString_GET_SIZE(self) == 0)
				4318	return PyInt_FromLong(0);
				4319
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4320	e = p + PyUnicode_GET_SIZE(self);
				4321	for (; p < e; p++) {
				4322	if (!Py_UNICODE_ISDECIMAL(*p))
				4323	return PyInt_FromLong(0);
				4324	}
				4325	return PyInt_FromLong(1);
				4326	}
				4327
				4328	static char isdigit__doc__[] =
				4329	"S.isdigit() -> int\n\
				4330	\n\
				4331	Return 1 if there are only digit characters in S,\n\
				4332	0 otherwise.";
				4333
				4334	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4335	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4336	{
				4337	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4338	register const Py_UNICODE *e;
				4339
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4340	/* Shortcut for single character strings */
				4341	if (PyUnicode_GET_SIZE(self) == 1 &&
				4342	Py_UNICODE_ISDIGIT(*p))
				4343	return PyInt_FromLong(1);
				4344
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4345	/* Special case for empty strings */
				4346	if (PyString_GET_SIZE(self) == 0)
				4347	return PyInt_FromLong(0);
				4348
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4349	e = p + PyUnicode_GET_SIZE(self);
				4350	for (; p < e; p++) {
				4351	if (!Py_UNICODE_ISDIGIT(*p))
				4352	return PyInt_FromLong(0);
				4353	}
				4354	return PyInt_FromLong(1);
				4355	}
				4356
				4357	static char isnumeric__doc__[] =
				4358	"S.isnumeric() -> int\n\
				4359	\n\
				4360	Return 1 if there are only numeric characters in S,\n\
				4361	0 otherwise.";
				4362
				4363	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4364	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4365	{
				4366	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4367	register const Py_UNICODE *e;
				4368
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4369	/* Shortcut for single character strings */
				4370	if (PyUnicode_GET_SIZE(self) == 1 &&
				4371	Py_UNICODE_ISNUMERIC(*p))
				4372	return PyInt_FromLong(1);
				4373
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4374	/* Special case for empty strings */
				4375	if (PyString_GET_SIZE(self) == 0)
				4376	return PyInt_FromLong(0);
				4377
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4378	e = p + PyUnicode_GET_SIZE(self);
				4379	for (; p < e; p++) {
				4380	if (!Py_UNICODE_ISNUMERIC(*p))
				4381	return PyInt_FromLong(0);
				4382	}
				4383	return PyInt_FromLong(1);
				4384	}
				4385
				4386	static char join__doc__[] =
				4387	"S.join(sequence) -> unicode\n\
				4388	\n\
				4389	Return a string which is the concatenation of the strings in the\n\
				4390	sequence. The separator between elements is S.";
				4391
				4392	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4393	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4394	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4395	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4396	}
				4397
				4398	static int
				4399	unicode_length(PyUnicodeObject *self)
				4400	{
				4401	return self->length;
				4402	}
				4403
				4404	static char ljust__doc__[] =
				4405	"S.ljust(width) -> unicode\n\
				4406	\n\
				4407	Return S left justified in a Unicode string of length width. Padding is\n\
				4408	done using spaces.";
				4409
				4410	static PyObject *
				4411	unicode_ljust(PyUnicodeObject self, PyObject args)
				4412	{
				4413	int width;
				4414	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4415	return NULL;
				4416
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4417	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4418	Py_INCREF(self);
				4419	return (PyObject*) self;
				4420	}
				4421
				4422	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4423	}
				4424
				4425	static char lower__doc__[] =
				4426	"S.lower() -> unicode\n\
				4427	\n\
				4428	Return a copy of the string S converted to lowercase.";
				4429
				4430	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4431	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4432	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4433	return fixup(self, fixlower);
				4434	}
				4435
				4436	static char lstrip__doc__[] =
				4437	"S.lstrip() -> unicode\n\
				4438	\n\
				4439	Return a copy of the string S with leading whitespace removed.";
				4440
				4441	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4442	unicode_lstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4443	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4444	return strip(self, 1, 0);
				4445	}
				4446
				4447	static PyObject*
				4448	unicode_repeat(PyUnicodeObject *str, int len)
				4449	{
				4450	PyUnicodeObject *u;
				4451	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4452	int nchars;
				4453	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4454
				4455	if (len < 0)
				4456	len = 0;
				4457
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4458	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4459	/* no repeat, return original string */
				4460	Py_INCREF(str);
				4461	return (PyObject*) str;
				4462	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4463
				4464	/* ensure # of chars needed doesn't overflow int and # of bytes
				4465	* needed doesn't overflow size_t
				4466	*/
				4467	nchars = len * str->length;
				4468	if (len && nchars / len != str->length) {
				4469	PyErr_SetString(PyExc_OverflowError,
				4470	"repeated string is too long");
				4471	return NULL;
				4472	}
				4473	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4474	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4475	PyErr_SetString(PyExc_OverflowError,
				4476	"repeated string is too long");
				4477	return NULL;
				4478	}
				4479	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4480	if (!u)
				4481	return NULL;
				4482
				4483	p = u->str;
				4484
				4485	while (len-- > 0) {
				4486	Py_UNICODE_COPY(p, str->str, str->length);
				4487	p += str->length;
				4488	}
				4489
				4490	return (PyObject*) u;
				4491	}
				4492
				4493	PyObject PyUnicode_Replace(PyObject obj,
				4494	PyObject *subobj,
				4495	PyObject *replobj,
				4496	int maxcount)
				4497	{
				4498	PyObject *self;
				4499	PyObject *str1;
				4500	PyObject *str2;
				4501	PyObject *result;
				4502
				4503	self = PyUnicode_FromObject(obj);
				4504	if (self == NULL)
				4505	return NULL;
				4506	str1 = PyUnicode_FromObject(subobj);
				4507	if (str1 == NULL) {
				4508	Py_DECREF(self);
				4509	return NULL;
				4510	}
				4511	str2 = PyUnicode_FromObject(replobj);
				4512	if (str2 == NULL) {
				4513	Py_DECREF(self);
				4514	Py_DECREF(str1);
				4515	return NULL;
				4516	}
				4517	result = replace((PyUnicodeObject *)self,
				4518	(PyUnicodeObject *)str1,
				4519	(PyUnicodeObject *)str2,
				4520	maxcount);
				4521	Py_DECREF(self);
				4522	Py_DECREF(str1);
				4523	Py_DECREF(str2);
				4524	return result;
				4525	}
				4526
				4527	static char replace__doc__[] =
				4528	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4529	\n\
				4530	Return a copy of S with all occurrences of substring\n\
				4531	old replaced by new. If the optional argument maxsplit is\n\
				4532	given, only the first maxsplit occurrences are replaced.";
				4533
				4534	static PyObject*
				4535	unicode_replace(PyUnicodeObject self, PyObject args)
				4536	{
				4537	PyUnicodeObject *str1;
				4538	PyUnicodeObject *str2;
				4539	int maxcount = -1;
				4540	PyObject *result;
				4541
				4542	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4543	return NULL;
				4544	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4545	if (str1 == NULL)
				4546	return NULL;
				4547	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4548	if (str2 == NULL)
				4549	return NULL;
				4550
				4551	result = replace(self, str1, str2, maxcount);
				4552
				4553	Py_DECREF(str1);
				4554	Py_DECREF(str2);
				4555	return result;
				4556	}
				4557
				4558	static
				4559	PyObject unicode_repr(PyObject unicode)
				4560	{
				4561	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4562	PyUnicode_GET_SIZE(unicode),
				4563	1);
				4564	}
				4565
				4566	static char rfind__doc__[] =
				4567	"S.rfind(sub [,start [,end]]) -> int\n\
				4568	\n\
				4569	Return the highest index in S where substring sub is found,\n\
				4570	such that sub is contained within s[start,end]. Optional\n\
				4571	arguments start and end are interpreted as in slice notation.\n\
				4572	\n\
				4573	Return -1 on failure.";
				4574
				4575	static PyObject *
				4576	unicode_rfind(PyUnicodeObject self, PyObject args)
				4577	{
				4578	PyUnicodeObject *substring;
				4579	int start = 0;
				4580	int end = INT_MAX;
				4581	PyObject *result;
				4582
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4583	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4584	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4585	return NULL;
				4586	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4587	(PyObject *)substring);
				4588	if (substring == NULL)
				4589	return NULL;
				4590
				4591	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4592
				4593	Py_DECREF(substring);
				4594	return result;
				4595	}
				4596
				4597	static char rindex__doc__[] =
				4598	"S.rindex(sub [,start [,end]]) -> int\n\
				4599	\n\
				4600	Like S.rfind() but raise ValueError when the substring is not found.";
				4601
				4602	static PyObject *
				4603	unicode_rindex(PyUnicodeObject self, PyObject args)
				4604	{
				4605	int result;
				4606	PyUnicodeObject *substring;
				4607	int start = 0;
				4608	int end = INT_MAX;
				4609
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4610	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4611	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4612	return NULL;
				4613	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4614	(PyObject *)substring);
				4615	if (substring == NULL)
				4616	return NULL;
				4617
				4618	result = findstring(self, substring, start, end, -1);
				4619
				4620	Py_DECREF(substring);
				4621	if (result < 0) {
				4622	PyErr_SetString(PyExc_ValueError, "substring not found");
				4623	return NULL;
				4624	}
				4625	return PyInt_FromLong(result);
				4626	}
				4627
				4628	static char rjust__doc__[] =
				4629	"S.rjust(width) -> unicode\n\
				4630	\n\
				4631	Return S right justified in a Unicode string of length width. Padding is\n\
				4632	done using spaces.";
				4633
				4634	static PyObject *
				4635	unicode_rjust(PyUnicodeObject self, PyObject args)
				4636	{
				4637	int width;
				4638	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4639	return NULL;
				4640
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4641	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4642	Py_INCREF(self);
				4643	return (PyObject*) self;
				4644	}
				4645
				4646	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4647	}
				4648
				4649	static char rstrip__doc__[] =
				4650	"S.rstrip() -> unicode\n\
				4651	\n\
				4652	Return a copy of the string S with trailing whitespace removed.";
				4653
				4654	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4655	unicode_rstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4656	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4657	return strip(self, 0, 1);
				4658	}
				4659
				4660	static PyObject*
				4661	unicode_slice(PyUnicodeObject *self, int start, int end)
				4662	{
				4663	/* standard clamping */
				4664	if (start < 0)
				4665	start = 0;
				4666	if (end < 0)
				4667	end = 0;
				4668	if (end > self->length)
				4669	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4670	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4671	/* full slice, return original string */
				4672	Py_INCREF(self);
				4673	return (PyObject*) self;
				4674	}
				4675	if (start > end)
				4676	start = end;
				4677	/* copy slice */
				4678	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4679	end - start);
				4680	}
				4681
				4682	PyObject PyUnicode_Split(PyObject s,
				4683	PyObject *sep,
				4684	int maxsplit)
				4685	{
				4686	PyObject *result;
				4687
				4688	s = PyUnicode_FromObject(s);
				4689	if (s == NULL)
				4690	return NULL;
				4691	if (sep != NULL) {
				4692	sep = PyUnicode_FromObject(sep);
				4693	if (sep == NULL) {
				4694	Py_DECREF(s);
				4695	return NULL;
				4696	}
				4697	}
				4698
				4699	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4700
				4701	Py_DECREF(s);
				4702	Py_XDECREF(sep);
				4703	return result;
				4704	}
				4705
				4706	static char split__doc__[] =
				4707	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4708	\n\
				4709	Return a list of the words in S, using sep as the\n\
				4710	delimiter string. If maxsplit is given, at most maxsplit\n\
				4711	splits are done. If sep is not specified, any whitespace string\n\
				4712	is a separator.";
				4713
				4714	static PyObject*
				4715	unicode_split(PyUnicodeObject self, PyObject args)
				4716	{
				4717	PyObject *substring = Py_None;
				4718	int maxcount = -1;
				4719
				4720	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4721	return NULL;
				4722
				4723	if (substring == Py_None)
				4724	return split(self, NULL, maxcount);
				4725	else if (PyUnicode_Check(substring))
				4726	return split(self, (PyUnicodeObject *)substring, maxcount);
				4727	else
				4728	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4729	}
				4730
				4731	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4732	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733	\n\
				4734	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4735	Line breaks are not included in the resulting list unless keepends\n\
				4736	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4737
				4738	static PyObject*
				4739	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4740	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4741	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4742
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4743	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4744	return NULL;
				4745
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4746	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4747	}
				4748
				4749	static
				4750	PyObject unicode_str(PyUnicodeObject self)
				4751	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4752	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4753	}
				4754
				4755	static char strip__doc__[] =
				4756	"S.strip() -> unicode\n\
				4757	\n\
				4758	Return a copy of S with leading and trailing whitespace removed.";
				4759
				4760	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4761	unicode_strip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4762	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4763	return strip(self, 1, 1);
				4764	}
				4765
				4766	static char swapcase__doc__[] =
				4767	"S.swapcase() -> unicode\n\
				4768	\n\
				4769	Return a copy of S with uppercase characters converted to lowercase\n\
				4770	and vice versa.";
				4771
				4772	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4773	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4774	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4775	return fixup(self, fixswapcase);
				4776	}
				4777
				4778	static char translate__doc__[] =
				4779	"S.translate(table) -> unicode\n\
				4780	\n\
				4781	Return a copy of the string S, where all characters have been mapped\n\
				4782	through the given translation table, which must be a mapping of\n\
				4783	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4784	are left untouched. Characters mapped to None are deleted.";
				4785
				4786	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4787	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4788	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4789	return PyUnicode_TranslateCharmap(self->str,
				4790	self->length,
				4791	table,
				4792	"ignore");
				4793	}
				4794
				4795	static char upper__doc__[] =
				4796	"S.upper() -> unicode\n\
				4797	\n\
				4798	Return a copy of S converted to uppercase.";
				4799
				4800	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4801	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4802	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4803	return fixup(self, fixupper);
				4804	}
				4805
				4806	#if 0
				4807	static char zfill__doc__[] =
				4808	"S.zfill(width) -> unicode\n\
				4809	\n\
				4810	Pad a numeric string x with zeros on the left, to fill a field\n\
				4811	of the specified width. The string x is never truncated.";
				4812
				4813	static PyObject *
				4814	unicode_zfill(PyUnicodeObject self, PyObject args)
				4815	{
				4816	int fill;
				4817	PyUnicodeObject *u;
				4818
				4819	int width;
				4820	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4821	return NULL;
				4822
				4823	if (self->length >= width) {
				4824	Py_INCREF(self);
				4825	return (PyObject*) self;
				4826	}
				4827
				4828	fill = width - self->length;
				4829
				4830	u = pad(self, fill, 0, '0');
				4831
				4832	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4833	/* move sign to beginning of string */
				4834	u->str[0] = u->str[fill];
				4835	u->str[fill] = '0';
				4836	}
				4837
				4838	return (PyObject*) u;
				4839	}
				4840	#endif
				4841
				4842	#if 0
				4843	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4844	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4845	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4846	return PyInt_FromLong(unicode_freelist_size);
				4847	}
				4848	#endif
				4849
				4850	static char startswith__doc__[] =
				4851	"S.startswith(prefix[, start[, end]]) -> int\n\
				4852	\n\
				4853	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4854	optional start, test S beginning at that position. With optional end, stop\n\
				4855	comparing S at that position.";
				4856
				4857	static PyObject *
				4858	unicode_startswith(PyUnicodeObject *self,
				4859	PyObject *args)
				4860	{
				4861	PyUnicodeObject *substring;
				4862	int start = 0;
				4863	int end = INT_MAX;
				4864	PyObject *result;
				4865
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4866	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4867	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4868	return NULL;
				4869	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4870	(PyObject *)substring);
				4871	if (substring == NULL)
				4872	return NULL;
				4873
				4874	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4875
				4876	Py_DECREF(substring);
				4877	return result;
				4878	}
				4879
				4880
				4881	static char endswith__doc__[] =
				4882	"S.endswith(suffix[, start[, end]]) -> int\n\
				4883	\n\
				4884	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4885	optional start, test S beginning at that position. With optional end, stop\n\
				4886	comparing S at that position.";
				4887
				4888	static PyObject *
				4889	unicode_endswith(PyUnicodeObject *self,
				4890	PyObject *args)
				4891	{
				4892	PyUnicodeObject *substring;
				4893	int start = 0;
				4894	int end = INT_MAX;
				4895	PyObject *result;
				4896
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4897	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4898	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4899	return NULL;
				4900	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4901	(PyObject *)substring);
				4902	if (substring == NULL)
				4903	return NULL;
				4904
				4905	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4906
				4907	Py_DECREF(substring);
				4908	return result;
				4909	}
				4910
				4911
				4912	static PyMethodDef unicode_methods[] = {
				4913
				4914	/* Order is according to common usage: often used methods should
				4915	appear first, since lookup is done sequentially. */
				4916
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4917	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				4918	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				4919	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				4920	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				4921	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				4922	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				4923	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				4924	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				4925	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				4926	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				4927	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				4928	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				4929	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
				4930	{"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
				4931	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				4932	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				4933	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				4934	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
				4935	{"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
				4936	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
				4937	{"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
				4938	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				4939	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				4940	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				4941	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				4942	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				4943	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				4944	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				4945	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				4946	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				4947	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				4948	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				4949	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				4950	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				4951	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4952	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4953	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
				4954	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4955	#endif
				4956
				4957	#if 0
				4958	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4959	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4960	#endif
				4961
				4962	{NULL, NULL}
				4963	};
				4964
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4965	static PySequenceMethods unicode_as_sequence = {
				4966	(inquiry) unicode_length, /* sq_length */
				4967	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4968	(intargfunc) unicode_repeat, /* sq_repeat */
				4969	(intargfunc) unicode_getitem, /* sq_item */
				4970	(intintargfunc) unicode_slice, /* sq_slice */
				4971	0, /* sq_ass_item */
				4972	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4973	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4974	};
				4975
				4976	static int
				4977	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4978	int index,
				4979	const void **ptr)
				4980	{
				4981	if (index != 0) {
				4982	PyErr_SetString(PyExc_SystemError,
				4983	"accessing non-existent unicode segment");
				4984	return -1;
				4985	}
				4986	ptr = (void ) self->str;
				4987	return PyUnicode_GET_DATA_SIZE(self);
				4988	}
				4989
				4990	static int
				4991	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4992	const void **ptr)
				4993	{
				4994	PyErr_SetString(PyExc_TypeError,
				4995	"cannot use unicode as modifyable buffer");
				4996	return -1;
				4997	}
				4998
				4999	static int
				5000	unicode_buffer_getsegcount(PyUnicodeObject *self,
				5001	int *lenp)
				5002	{
				5003	if (lenp)
				5004	*lenp = PyUnicode_GET_DATA_SIZE(self);
				5005	return 1;
				5006	}
				5007
				5008	static int
				5009	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				5010	int index,
				5011	const void **ptr)
				5012	{
				5013	PyObject *str;
				5014
				5015	if (index != 0) {
				5016	PyErr_SetString(PyExc_SystemError,
				5017	"accessing non-existent unicode segment");
				5018	return -1;
				5019	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5020	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5021	if (str == NULL)
				5022	return -1;
				5023	ptr = (void ) PyString_AS_STRING(str);
				5024	return PyString_GET_SIZE(str);
				5025	}
				5026
				5027	/* Helpers for PyUnicode_Format() */
				5028
				5029	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5030	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5031	{
				5032	int argidx = *p_argidx;
				5033	if (argidx < arglen) {
				5034	(*p_argidx)++;
				5035	if (arglen < 0)
				5036	return args;
				5037	else
				5038	return PyTuple_GetItem(args, argidx);
				5039	}
				5040	PyErr_SetString(PyExc_TypeError,
				5041	"not enough arguments for format string");
				5042	return NULL;
				5043	}
				5044
				5045	#define F_LJUST (1<<0)
				5046	#define F_SIGN (1<<1)
				5047	#define F_BLANK (1<<2)
				5048	#define F_ALT (1<<3)
				5049	#define F_ZERO (1<<4)
				5050
				5051	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5052	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5053	{
				5054	register int i;
				5055	int len;
				5056	va_list va;
				5057	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5058	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5059
				5060	/* First, format the string as char array, then expand to Py_UNICODE
				5061	array. */
				5062	charbuffer = (char *)buffer;
				5063	len = vsprintf(charbuffer, format, va);
				5064	for (i = len - 1; i >= 0; i--)
				5065	buffer[i] = (Py_UNICODE) charbuffer[i];
				5066
				5067	va_end(va);
				5068	return len;
				5069	}
				5070
				5071	static int
				5072	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5073	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5074	int flags,
				5075	int prec,
				5076	int type,
				5077	PyObject *v)
				5078	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5079	/* fmt = '%#.' + `prec` + `type`
				5080	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5081	char fmt[20];
				5082	double x;
				5083
				5084	x = PyFloat_AsDouble(v);
				5085	if (x == -1.0 && PyErr_Occurred())
				5086	return -1;
				5087	if (prec < 0)
				5088	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5089	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5090	type = 'g';
				5091	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5092	/* worst case length calc to ensure no buffer overrun:
				5093	fmt = %#.<prec>g
				5094	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5095	for any double rep.)
				5096	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5097	If prec=0 the effective precision is 1 (the leading digit is
				5098	always given), therefore increase by one to 10+prec. */
				5099	if (buflen <= (size_t)10 + (size_t)prec) {
				5100	PyErr_SetString(PyExc_OverflowError,
				5101	"formatted float is too long (precision too long?)");
				5102	return -1;
				5103	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5104	return usprintf(buf, fmt, x);
				5105	}
				5106
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5107	static PyObject*
				5108	formatlong(PyObject *val, int flags, int prec, int type)
				5109	{
				5110	char *buf;
				5111	int i, len;
				5112	PyObject str; / temporary string object. */
				5113	PyUnicodeObject *result;
				5114
				5115	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5116	if (!str)
				5117	return NULL;
				5118	result = _PyUnicode_New(len);
				5119	for (i = 0; i < len; i++)
				5120	result->str[i] = buf[i];
				5121	result->str[len] = 0;
				5122	Py_DECREF(str);
				5123	return (PyObject*)result;
				5124	}
				5125
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5126	static int
				5127	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5128	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5129	int flags,
				5130	int prec,
				5131	int type,
				5132	PyObject *v)
				5133	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5134	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5135	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5136	+ 1 + 1 = 24*/
				5137	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5138	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5139	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5140
				5141	x = PyInt_AsLong(v);
				5142	if (x == -1 && PyErr_Occurred())
				5143	return -1;
				5144	if (prec < 0)
				5145	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5146	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				5147	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				5148	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				5149	PyErr_SetString(PyExc_OverflowError,
				5150	"formatted integer is too long (precision too long?)");
				5151	return -1;
				5152	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5153	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				5154	* but we want it (for consistency with other %#x conversions, and
				5155	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5156	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				5157	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				5158	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5159	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5160	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				5161	/* Only way to know what the platform does is to try it. */
				5162	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				5163	if (fmt[1] != (char)type) {
				5164	/* Supply our own leading 0x/0X -- needed under std C */
				5165	use_native_c_format = 0;
				5166	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				5167	}
				5168	}
				5169	if (use_native_c_format)
				5170	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5171	return usprintf(buf, fmt, x);
				5172	}
				5173
				5174	static int
				5175	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5176	size_t buflen,
				5177	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5178	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5179	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5180	if (PyUnicode_Check(v)) {
				5181	if (PyUnicode_GET_SIZE(v) != 1)
				5182	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5183	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5184	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5185
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5186	else if (PyString_Check(v)) {
				5187	if (PyString_GET_SIZE(v) != 1)
				5188	goto onError;
				5189	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5190	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5191
				5192	else {
				5193	/* Integer input truncated to a character */
				5194	long x;
				5195	x = PyInt_AsLong(v);
				5196	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5197	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5198	buf[0] = (char) x;
				5199	}
				5200	buf[1] = '\0';
				5201	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5202
				5203	onError:
				5204	PyErr_SetString(PyExc_TypeError,
				5205	"%c requires int or char");
				5206	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5207	}
				5208
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of slots in the scratch buffer used to
   format floats, ints and chars.  XXX It is a magic number.  Every
   formatting routine performs its own bounds check against it, though
   allocating a buffer sized for each individual format might be the
   better solution.  For now, the fixed size is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
				5218
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5219	PyObject PyUnicode_Format(PyObject format,
				5220	PyObject *args)
				5221	{
				5222	Py_UNICODE fmt, res;
				5223	int fmtcnt, rescnt, reslen, arglen, argidx;
				5224	int args_owned = 0;
				5225	PyUnicodeObject *result = NULL;
				5226	PyObject *dict = NULL;
				5227	PyObject *uformat;
				5228
				5229	if (format == NULL \|\| args == NULL) {
				5230	PyErr_BadInternalCall();
				5231	return NULL;
				5232	}
				5233	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5234	if (uformat == NULL)
				5235	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5236	fmt = PyUnicode_AS_UNICODE(uformat);
				5237	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5238
				5239	reslen = rescnt = fmtcnt + 100;
				5240	result = _PyUnicode_New(reslen);
				5241	if (result == NULL)
				5242	goto onError;
				5243	res = PyUnicode_AS_UNICODE(result);
				5244
				5245	if (PyTuple_Check(args)) {
				5246	arglen = PyTuple_Size(args);
				5247	argidx = 0;
				5248	}
				5249	else {
				5250	arglen = -1;
				5251	argidx = -2;
				5252	}
				5253	if (args->ob_type->tp_as_mapping)
				5254	dict = args;
				5255
				5256	while (--fmtcnt >= 0) {
				5257	if (*fmt != '%') {
				5258	if (--rescnt < 0) {
				5259	rescnt = fmtcnt + 100;
				5260	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5261	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5262	return NULL;
				5263	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5264	--rescnt;
				5265	}
				5266	res++ = fmt++;
				5267	}
				5268	else {
				5269	/* Got a format specifier */
				5270	int flags = 0;
				5271	int width = -1;
				5272	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5273	Py_UNICODE c = '\0';
				5274	Py_UNICODE fill;
				5275	PyObject *v = NULL;
				5276	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5277	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5278	Py_UNICODE sign;
				5279	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5280	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5281
				5282	fmt++;
				5283	if (*fmt == '(') {
				5284	Py_UNICODE *keystart;
				5285	int keylen;
				5286	PyObject *key;
				5287	int pcount = 1;
				5288
				5289	if (dict == NULL) {
				5290	PyErr_SetString(PyExc_TypeError,
				5291	"format requires a mapping");
				5292	goto onError;
				5293	}
				5294	++fmt;
				5295	--fmtcnt;
				5296	keystart = fmt;
				5297	/* Skip over balanced parentheses */
				5298	while (pcount > 0 && --fmtcnt >= 0) {
				5299	if (*fmt == ')')
				5300	--pcount;
				5301	else if (*fmt == '(')
				5302	++pcount;
				5303	fmt++;
				5304	}
				5305	keylen = fmt - keystart - 1;
				5306	if (fmtcnt < 0 \|\| pcount > 0) {
				5307	PyErr_SetString(PyExc_ValueError,
				5308	"incomplete format key");
				5309	goto onError;
				5310	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5311	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5312	then looked up since Python uses strings to hold
				5313	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5314	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5315	key = PyUnicode_EncodeUTF8(keystart,
				5316	keylen,
				5317	NULL);
				5318	if (key == NULL)
				5319	goto onError;
				5320	if (args_owned) {
				5321	Py_DECREF(args);
				5322	args_owned = 0;
				5323	}
				5324	args = PyObject_GetItem(dict, key);
				5325	Py_DECREF(key);
				5326	if (args == NULL) {
				5327	goto onError;
				5328	}
				5329	args_owned = 1;
				5330	arglen = -1;
				5331	argidx = -2;
				5332	}
				5333	while (--fmtcnt >= 0) {
				5334	switch (c = *fmt++) {
				5335	case '-': flags \|= F_LJUST; continue;
				5336	case '+': flags \|= F_SIGN; continue;
				5337	case ' ': flags \|= F_BLANK; continue;
				5338	case '#': flags \|= F_ALT; continue;
				5339	case '0': flags \|= F_ZERO; continue;
				5340	}
				5341	break;
				5342	}
				5343	if (c == '*') {
				5344	v = getnextarg(args, arglen, &argidx);
				5345	if (v == NULL)
				5346	goto onError;
				5347	if (!PyInt_Check(v)) {
				5348	PyErr_SetString(PyExc_TypeError,
				5349	"* wants int");
				5350	goto onError;
				5351	}
				5352	width = PyInt_AsLong(v);
				5353	if (width < 0) {
				5354	flags \|= F_LJUST;
				5355	width = -width;
				5356	}
				5357	if (--fmtcnt >= 0)
				5358	c = *fmt++;
				5359	}
				5360	else if (c >= '0' && c <= '9') {
				5361	width = c - '0';
				5362	while (--fmtcnt >= 0) {
				5363	c = *fmt++;
				5364	if (c < '0' \|\| c > '9')
				5365	break;
				5366	if ((width*10) / 10 != width) {
				5367	PyErr_SetString(PyExc_ValueError,
				5368	"width too big");
				5369	goto onError;
				5370	}
				5371	width = width*10 + (c - '0');
				5372	}
				5373	}
				5374	if (c == '.') {
				5375	prec = 0;
				5376	if (--fmtcnt >= 0)
				5377	c = *fmt++;
				5378	if (c == '*') {
				5379	v = getnextarg(args, arglen, &argidx);
				5380	if (v == NULL)
				5381	goto onError;
				5382	if (!PyInt_Check(v)) {
				5383	PyErr_SetString(PyExc_TypeError,
				5384	"* wants int");
				5385	goto onError;
				5386	}
				5387	prec = PyInt_AsLong(v);
				5388	if (prec < 0)
				5389	prec = 0;
				5390	if (--fmtcnt >= 0)
				5391	c = *fmt++;
				5392	}
				5393	else if (c >= '0' && c <= '9') {
				5394	prec = c - '0';
				5395	while (--fmtcnt >= 0) {
				5396	c = Py_CHARMASK(*fmt++);
				5397	if (c < '0' \|\| c > '9')
				5398	break;
				5399	if ((prec*10) / 10 != prec) {
				5400	PyErr_SetString(PyExc_ValueError,
				5401	"prec too big");
				5402	goto onError;
				5403	}
				5404	prec = prec*10 + (c - '0');
				5405	}
				5406	}
				5407	} /* prec */
				5408	if (fmtcnt >= 0) {
				5409	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5410	if (--fmtcnt >= 0)
				5411	c = *fmt++;
				5412	}
				5413	}
				5414	if (fmtcnt < 0) {
				5415	PyErr_SetString(PyExc_ValueError,
				5416	"incomplete format");
				5417	goto onError;
				5418	}
				5419	if (c != '%') {
				5420	v = getnextarg(args, arglen, &argidx);
				5421	if (v == NULL)
				5422	goto onError;
				5423	}
				5424	sign = 0;
				5425	fill = ' ';
				5426	switch (c) {
				5427
				5428	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5429	pbuf = formatbuf;
				5430	/* presume that buffer length is at least 1 */
				5431	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5432	len = 1;
				5433	break;
				5434
				5435	case 's':
				5436	case 'r':
				5437	if (PyUnicode_Check(v) && c == 's') {
				5438	temp = v;
				5439	Py_INCREF(temp);
				5440	}
				5441	else {
				5442	PyObject *unicode;
				5443	if (c == 's')
				5444	temp = PyObject_Str(v);
				5445	else
				5446	temp = PyObject_Repr(v);
				5447	if (temp == NULL)
				5448	goto onError;
				5449	if (!PyString_Check(temp)) {
				5450	/* XXX Note: this should never happen, since
				5451	PyObject_Repr() and PyObject_Str() assure
				5452	this */
				5453	Py_DECREF(temp);
				5454	PyErr_SetString(PyExc_TypeError,
				5455	"%s argument has non-string str()");
				5456	goto onError;
				5457	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5458	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5459	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5460	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5461	"strict");
				5462	Py_DECREF(temp);
				5463	temp = unicode;
				5464	if (temp == NULL)
				5465	goto onError;
				5466	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5467	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5468	len = PyUnicode_GET_SIZE(temp);
				5469	if (prec >= 0 && len > prec)
				5470	len = prec;
				5471	break;
				5472
				5473	case 'i':
				5474	case 'd':
				5475	case 'u':
				5476	case 'o':
				5477	case 'x':
				5478	case 'X':
				5479	if (c == 'i')
				5480	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5481	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5482	temp = formatlong(v, flags, prec, c);
				5483	if (!temp)
				5484	goto onError;
				5485	pbuf = PyUnicode_AS_UNICODE(temp);
				5486	len = PyUnicode_GET_SIZE(temp);
				5487	/* unbounded ints can always produce
				5488	a sign character! */
				5489	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5490	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5491	else {
				5492	pbuf = formatbuf;
				5493	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5494	flags, prec, c, v);
				5495	if (len < 0)
				5496	goto onError;
				5497	/* only d conversion is signed */
				5498	sign = c == 'd';
				5499	}
				5500	if (flags & F_ZERO)
				5501	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5502	break;
				5503
				5504	case 'e':
				5505	case 'E':
				5506	case 'f':
				5507	case 'g':
				5508	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5509	pbuf = formatbuf;
				5510	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5511	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5512	if (len < 0)
				5513	goto onError;
				5514	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5515	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5516	fill = '0';
				5517	break;
				5518
				5519	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5520	pbuf = formatbuf;
				5521	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5522	if (len < 0)
				5523	goto onError;
				5524	break;
				5525
				5526	default:
				5527	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5528	"unsupported format character '%c' (0x%x) "
				5529	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5530	(31<=c && c<=126) ? c : '?',
				5531	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5532	goto onError;
				5533	}
				5534	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5535	if (pbuf == '-' \|\| pbuf == '+') {
				5536	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5537	len--;
				5538	}
				5539	else if (flags & F_SIGN)
				5540	sign = '+';
				5541	else if (flags & F_BLANK)
				5542	sign = ' ';
				5543	else
				5544	sign = 0;
				5545	}
				5546	if (width < len)
				5547	width = len;
				5548	if (rescnt < width + (sign != 0)) {
				5549	reslen -= rescnt;
				5550	rescnt = width + fmtcnt + 100;
				5551	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5552	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5553	return NULL;
				5554	res = PyUnicode_AS_UNICODE(result)
				5555	+ reslen - rescnt;
				5556	}
				5557	if (sign) {
				5558	if (fill != ' ')
				5559	*res++ = sign;
				5560	rescnt--;
				5561	if (width > len)
				5562	width--;
				5563	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5564	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5565	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5566	assert(pbuf[1] == c);
				5567	if (fill != ' ') {
				5568	res++ = pbuf++;
				5569	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5570	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5571	rescnt -= 2;
				5572	width -= 2;
				5573	if (width < 0)
				5574	width = 0;
				5575	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5576	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5577	if (width > len && !(flags & F_LJUST)) {
				5578	do {
				5579	--rescnt;
				5580	*res++ = fill;
				5581	} while (--width > len);
				5582	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5583	if (fill == ' ') {
				5584	if (sign)
				5585	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5586	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5587	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5588	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5589	res++ = pbuf++;
				5590	res++ = pbuf++;
				5591	}
				5592	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5593	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5594	res += len;
				5595	rescnt -= len;
				5596	while (--width >= len) {
				5597	--rescnt;
				5598	*res++ = ' ';
				5599	}
				5600	if (dict && (argidx < arglen) && c != '%') {
				5601	PyErr_SetString(PyExc_TypeError,
				5602	"not all arguments converted");
				5603	goto onError;
				5604	}
				5605	Py_XDECREF(temp);
				5606	} /* '%' */
				5607	} /* until end */
				5608	if (argidx < arglen && !dict) {
				5609	PyErr_SetString(PyExc_TypeError,
				5610	"not all arguments converted");
				5611	goto onError;
				5612	}
				5613
				5614	if (args_owned) {
				5615	Py_DECREF(args);
				5616	}
				5617	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5618	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5619	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5620	return (PyObject *)result;
				5621
				5622	onError:
				5623	Py_XDECREF(result);
				5624	Py_DECREF(uformat);
				5625	if (args_owned) {
				5626	Py_DECREF(args);
				5627	}
				5628	return NULL;
				5629	}
				5630
				5631	static PyBufferProcs unicode_as_buffer = {
				5632	(getreadbufferproc) unicode_buffer_getreadbuf,
				5633	(getwritebufferproc) unicode_buffer_getwritebuf,
				5634	(getsegcountproc) unicode_buffer_getsegcount,
				5635	(getcharbufferproc) unicode_buffer_getcharbuf,
				5636	};
				5637
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5638	staticforward PyObject *
				5639	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5640
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5641	static PyObject *
				5642	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5643	{
				5644	PyObject *x = NULL;
				5645	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5646	char *encoding = NULL;
				5647	char *errors = NULL;
				5648
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5649	if (type != &PyUnicode_Type)
				5650	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5651	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5652	kwlist, &x, &encoding, &errors))
				5653	return NULL;
				5654	if (x == NULL)
				5655	return (PyObject *)_PyUnicode_New(0);
				5656	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5657	}
				5658
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5659	static PyObject *
				5660	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5661	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5662	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5663	int n;
				5664
				5665	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5666	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5667	if (tmp == NULL)
				5668	return NULL;
				5669	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5670	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5671	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5672	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5673	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5674	if (pnew->str == NULL) {
				5675	_Py_ForgetReference((PyObject *)pnew);
				5676	PyObject_DEL(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5677	return NULL;
				5678	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5679	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5680	pnew->length = n;
				5681	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5682	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5683	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5684	}
				5685
/* Docstring of the unicode type (installed as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
				5692
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5693	PyTypeObject PyUnicode_Type = {
				5694	PyObject_HEAD_INIT(&PyType_Type)
				5695	0, /* ob_size */
				5696	"unicode", /* tp_name */
				5697	sizeof(PyUnicodeObject), /* tp_size */
				5698	0, /* tp_itemsize */
				5699	/* Slots */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5700	(destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5701	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5702	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5703	0, /* tp_setattr */
				5704	(cmpfunc) unicode_compare, /* tp_compare */
				5705	(reprfunc) unicode_repr, /* tp_repr */
				5706	0, /* tp_as_number */
				5707	&unicode_as_sequence, /* tp_as_sequence */
				5708	0, /* tp_as_mapping */
				5709	(hashfunc) unicode_hash, /* tp_hash*/
				5710	0, /* tp_call*/
				5711	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5712	PyObject_GenericGetAttr, /* tp_getattro */
				5713	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5714	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5715	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5716	unicode_doc, /* tp_doc */
				5717	0, /* tp_traverse */
				5718	0, /* tp_clear */
				5719	0, /* tp_richcompare */
				5720	0, /* tp_weaklistoffset */
				5721	0, /* tp_iter */
				5722	0, /* tp_iternext */
				5723	unicode_methods, /* tp_methods */
				5724	0, /* tp_members */
				5725	0, /* tp_getset */
				5726	0, /* tp_base */
				5727	0, /* tp_dict */
				5728	0, /* tp_descr_get */
				5729	0, /* tp_descr_set */
				5730	0, /* tp_dictoffset */
				5731	0, /* tp_init */
				5732	0, /* tp_alloc */
				5733	unicode_new, /* tp_new */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5734	_PyObject_Del, /* tp_free */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5735	};
				5736
				5737	/* Initialize the Unicode implementation */
				5738
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5739	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5740	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5741	int i;
				5742
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5743	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5744	unicode_freelist = NULL;
				5745	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5746	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5747	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5748	for (i = 0; i < 256; i++)
				5749	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5750	}
				5751
				5752	/* Finalize the Unicode implementation */
				5753
				5754	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5755	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5756	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5757	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5758	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5759
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5760	Py_XDECREF(unicode_empty);
				5761	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5762
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5763	for (i = 0; i < 256; i++) {
				5764	if (unicode_latin1[i]) {
				5765	Py_DECREF(unicode_latin1[i]);
				5766	unicode_latin1[i] = NULL;
				5767	}
				5768	}
				5769
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5770	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5771	PyUnicodeObject *v = u;
				5772	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5773	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5774	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5775	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5776	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5777	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5778	unicode_freelist = NULL;
				5779	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5780	}