Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 68afaa05c85fb25c5e3c1a7932b0878d36f2ee9f [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for the free list.

   When an object is put on the free list, its character buffer is kept
   allocated as long as the object's length is below this limit; longer
   buffers are released.  This cuts down on malloc() traffic for small
   Unicode objects.

   In the worst case the free list holds on to
       MAX_UNICODE_FREELIST_SIZE *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
            malloc()-overhead)
   bytes of unused memory.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is set. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	227	void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	{
Guido van Rossum	604ddf8	2001-12-06 20:03:56 +0000	[diff] [blame]	229	if (PyUnicode_CheckExact(unicode) &&
				230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	231	/* Keep-Alive optimization */
				232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	233	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	234	unicode->str = NULL;
				235	unicode->length = 0;
				236	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	237	if (unicode->defenc) {
				238	Py_DECREF(unicode->defenc);
				239	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	240	}
				241	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	242	(PyUnicodeObject *)unicode = unicode_freelist;
				243	unicode_freelist = unicode;
				244	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	245	}
				246	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	247	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	248	Py_XDECREF(unicode->defenc);
Guido van Rossum	604ddf8	2001-12-06 20:03:56 +0000	[diff] [blame]	249	unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	250	}
				251	}
				252
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	253	int PyUnicode_Resize(PyObject **unicode,
				254	int length)
				255	{
				256	register PyUnicodeObject *v;
				257
				258	/* Argument checks */
				259	if (unicode == NULL) {
				260	PyErr_BadInternalCall();
				261	return -1;
				262	}
				263	v = (PyUnicodeObject )unicode;
				264	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				265	PyErr_BadInternalCall();
				266	return -1;
				267	}
				268
				269	/* Resizing unicode_empty and single character objects is not
				270	possible since these are being shared. We simply return a fresh
				271	copy with the same Unicode content. */
				272	if (v->length != length &&
				273	(v == unicode_empty \|\| v->length == 1)) {
				274	PyUnicodeObject *w = _PyUnicode_New(length);
				275	if (w == NULL)
				276	return -1;
				277	Py_UNICODE_COPY(w->str, v->str,
				278	length < v->length ? length : v->length);
				279	unicode = (PyObject )w;
				280	return 0;
				281	}
				282
				283	/* Note that we don't have to modify *unicode for unshared Unicode
				284	objects, since we can modify them in-place. */
				285	return unicode_resize(v, length);
				286	}
				287
				288	/* Internal API for use in unicodeobject.c only ! */
				289	#define _PyUnicode_Resize(unicodevar, length) \
				290	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				291
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	292	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				293	int size)
				294	{
				295	PyUnicodeObject *unicode;
				296
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	297	/* If the Unicode data is known at construction time, we can apply
				298	some optimizations which share commonly used objects. */
				299	if (u != NULL) {
				300
				301	/* Optimization for empty strings */
				302	if (size == 0 && unicode_empty != NULL) {
				303	Py_INCREF(unicode_empty);
				304	return (PyObject *)unicode_empty;
				305	}
				306
				307	/* Single character Unicode objects in the Latin-1 range are
				308	shared when using this constructor */
				309	if (size == 1 && *u < 256) {
				310	unicode = unicode_latin1[*u];
				311	if (!unicode) {
				312	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	313	if (!unicode)
				314	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	315	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	316	unicode_latin1[*u] = unicode;
				317	}
				318	Py_INCREF(unicode);
				319	return (PyObject *)unicode;
				320	}
				321	}
				322
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	323	unicode = _PyUnicode_New(size);
				324	if (!unicode)
				325	return NULL;
				326
				327	/* Copy the Unicode data into the new object */
				328	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	329	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	330
				331	return (PyObject *)unicode;
				332	}
				333
				334	#ifdef HAVE_WCHAR_H
				335
				336	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				337	int size)
				338	{
				339	PyUnicodeObject *unicode;
				340
				341	if (w == NULL) {
				342	PyErr_BadInternalCall();
				343	return NULL;
				344	}
				345
				346	unicode = _PyUnicode_New(size);
				347	if (!unicode)
				348	return NULL;
				349
				350	/* Copy the wchar_t data into the new object */
				351	#ifdef HAVE_USABLE_WCHAR_T
				352	memcpy(unicode->str, w, size * sizeof(wchar_t));
				353	#else
				354	{
				355	register Py_UNICODE *u;
				356	register int i;
				357	u = PyUnicode_AS_UNICODE(unicode);
				358	for (i = size; i >= 0; i--)
				359	u++ = w++;
				360	}
				361	#endif
				362
				363	return (PyObject *)unicode;
				364	}
				365
				366	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				367	register wchar_t *w,
				368	int size)
				369	{
				370	if (unicode == NULL) {
				371	PyErr_BadInternalCall();
				372	return -1;
				373	}
				374	if (size > PyUnicode_GET_SIZE(unicode))
				375	size = PyUnicode_GET_SIZE(unicode);
				376	#ifdef HAVE_USABLE_WCHAR_T
				377	memcpy(w, unicode->str, size * sizeof(wchar_t));
				378	#else
				379	{
				380	register Py_UNICODE *u;
				381	register int i;
				382	u = PyUnicode_AS_UNICODE(unicode);
				383	for (i = size; i >= 0; i--)
				384	w++ = u++;
				385	}
				386	#endif
				387
				388	return size;
				389	}
				390
				391	#endif
				392
				393	PyObject PyUnicode_FromObject(register PyObject obj)
				394	{
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	395	/* XXX Perhaps we should make this API an alias of
				396	PyObject_Unicode() instead ?! */
				397	if (PyUnicode_CheckExact(obj)) {
				398	Py_INCREF(obj);
				399	return obj;
				400	}
				401	if (PyUnicode_Check(obj)) {
				402	/* For a Unicode subtype that's not a Unicode object,
				403	return a true Unicode object with the same data. */
				404	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				405	PyUnicode_GET_SIZE(obj));
				406	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	407	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				408	}
				409
				410	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				411	const char *encoding,
				412	const char *errors)
				413	{
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	414	const char *s = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	415	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	416	int owned = 0;
				417	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	418
				419	if (obj == NULL) {
				420	PyErr_BadInternalCall();
				421	return NULL;
				422	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	423
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	424	#if 0
				425	/* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburg	b5507ec	2001-10-19 12:02:29 +0000	[diff] [blame]	426	that no encodings is given and then redirect to
				427	PyObject_Unicode() which then applies the additional logic for
				428	Unicode subclasses.
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	429
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	430	NOTE: This API should really only be used for object which
				431	represent encoded Unicode !
				432
				433	*/
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	434	if (PyUnicode_Check(obj)) {
				435	if (encoding) {
				436	PyErr_SetString(PyExc_TypeError,
				437	"decoding Unicode is not supported");
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	438	return NULL;
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	439	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	440	return PyObject_Unicode(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	441	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	442	#else
				443	if (PyUnicode_Check(obj)) {
				444	PyErr_SetString(PyExc_TypeError,
				445	"decoding Unicode is not supported");
				446	return NULL;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	447	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	448	#endif
				449
				450	/* Coerce object */
				451	if (PyString_Check(obj)) {
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	452	s = PyString_AS_STRING(obj);
				453	len = PyString_GET_SIZE(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	454	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	455	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				456	/* Overwrite the error message with something more useful in
				457	case of a TypeError. */
				458	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	459	PyErr_Format(PyExc_TypeError,
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	460	"coercing to Unicode: need string or buffer, "
				461	"%.80s found",
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	462	obj->ob_type->tp_name);
				463	goto onError;
				464	}
				465
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	466	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	467	if (len == 0) {
				468	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	471	else
				472	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	473
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	474	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	475	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	476	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	477	return v;
				478
				479	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	480	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	481	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	482	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	483	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	}
				485
				486	PyObject PyUnicode_Decode(const char s,
				487	int size,
				488	const char *encoding,
				489	const char *errors)
				490	{
				491	PyObject buffer = NULL, unicode;
				492
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	493	if (encoding == NULL)
				494	encoding = PyUnicode_GetDefaultEncoding();
				495
				496	/* Shortcuts for common default encodings */
				497	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	498	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	499	else if (strcmp(encoding, "latin-1") == 0)
				500	return PyUnicode_DecodeLatin1(s, size, errors);
				501	else if (strcmp(encoding, "ascii") == 0)
				502	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	503
				504	/* Decode via the codec registry */
				505	buffer = PyBuffer_FromMemory((void *)s, size);
				506	if (buffer == NULL)
				507	goto onError;
				508	unicode = PyCodec_Decode(buffer, encoding, errors);
				509	if (unicode == NULL)
				510	goto onError;
				511	if (!PyUnicode_Check(unicode)) {
				512	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	513	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	514	unicode->ob_type->tp_name);
				515	Py_DECREF(unicode);
				516	goto onError;
				517	}
				518	Py_DECREF(buffer);
				519	return unicode;
				520
				521	onError:
				522	Py_XDECREF(buffer);
				523	return NULL;
				524	}
				525
				526	PyObject PyUnicode_Encode(const Py_UNICODE s,
				527	int size,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject v, unicode;
				532
				533	unicode = PyUnicode_FromUnicode(s, size);
				534	if (unicode == NULL)
				535	return NULL;
				536	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				537	Py_DECREF(unicode);
				538	return v;
				539	}
				540
				541	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				542	const char *encoding,
				543	const char *errors)
				544	{
				545	PyObject *v;
				546
				547	if (!PyUnicode_Check(unicode)) {
				548	PyErr_BadArgument();
				549	goto onError;
				550	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	551
				552	if (encoding == NULL)
				553	encoding = PyUnicode_GetDefaultEncoding();
				554
				555	/* Shortcuts for common default encodings */
				556	if (errors == NULL) {
				557	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	558	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	559	else if (strcmp(encoding, "latin-1") == 0)
				560	return PyUnicode_AsLatin1String(unicode);
				561	else if (strcmp(encoding, "ascii") == 0)
				562	return PyUnicode_AsASCIIString(unicode);
				563	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	564
				565	/* Encode via the codec registry */
				566	v = PyCodec_Encode(unicode, encoding, errors);
				567	if (v == NULL)
				568	goto onError;
				569	/* XXX Should we really enforce this ? */
				570	if (!PyString_Check(v)) {
				571	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	572	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	573	v->ob_type->tp_name);
				574	Py_DECREF(v);
				575	goto onError;
				576	}
				577	return v;
				578
				579	onError:
				580	return NULL;
				581	}
				582
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	583	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				584	const char *errors)
				585	{
				586	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				587
				588	if (v)
				589	return v;
				590	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				591	if (v && errors == NULL)
				592	((PyUnicodeObject *)unicode)->defenc = v;
				593	return v;
				594	}
				595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	596	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_AS_UNICODE(unicode);
				603
				604	onError:
				605	return NULL;
				606	}
				607
				608	int PyUnicode_GetSize(PyObject *unicode)
				609	{
				610	if (!PyUnicode_Check(unicode)) {
				611	PyErr_BadArgument();
				612	goto onError;
				613	}
				614	return PyUnicode_GET_SIZE(unicode);
				615
				616	onError:
				617	return -1;
				618	}
				619
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	620	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	621	{
				622	return unicode_default_encoding;
				623	}
				624
				625	int PyUnicode_SetDefaultEncoding(const char *encoding)
				626	{
				627	PyObject *v;
				628
				629	/* Make sure the encoding is valid. As side effect, this also
				630	loads the encoding into the codec registry cache. */
				631	v = _PyCodec_Lookup(encoding);
				632	if (v == NULL)
				633	goto onError;
				634	Py_DECREF(v);
				635	strncpy(unicode_default_encoding,
				636	encoding,
				637	sizeof(unicode_default_encoding));
				638	return 0;
				639
				640	onError:
				641	return -1;
				642	}
				643
Marc-André Lemburg	c60e6f7	2001-09-20 10:35:46 +0000	[diff] [blame]	644	/* --- UTF-7 Codec -------------------------------------------------------- */
				645
				646	/* see RFC2152 for details */
				647
				648	static
				649	char utf7_special[128] = {
				650	/* indicate whether a UTF-7 character is special i.e. cannot be directly
				651	encoded:
				652	0 - not special
				653	1 - special
				654	2 - whitespace (optional)
				655	3 - RFC2152 Set O (optional) */
				656	1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
				657	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				658	2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
				659	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
				660	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				661	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
				662	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				663	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
				664
				665	};
				666
				667	#define SPECIAL(c, encodeO, encodeWS) \
				668	(((c)>127 \|\| utf7_special[(c)] == 1) \|\| \
				669	(encodeWS && (utf7_special[(c)] == 2)) \|\| \
				670	(encodeO && (utf7_special[(c)] == 3)))
				671
				672	#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
				673	#define B64CHAR(c) (isalnum(c) \|\| (c) == '+' \|\| (c) == '/')
				674	#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
				675	(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
				676
				677	#define ENCODE(out, ch, bits) \
				678	while (bits >= 6) { \
				679	*out++ = B64(ch >> (bits-6)); \
				680	bits -= 6; \
				681	}
				682
				683	#define DECODE(out, ch, bits, surrogate) \
				684	while (bits >= 16) { \
				685	Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
				686	bits -= 16; \
				687	if (surrogate) { \
				688	/* We have already generated an error for the high surrogate
				689	so let's not bother seeing if the low surrogate is correct or not */\
				690	surrogate = 0; \
				691	} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
				692	/* This is a surrogate pair. Unfortunately we can't represent \
				693	it in a 16-bit character */ \
				694	surrogate = 1; \
				695	errmsg = "code pairs are not supported"; \
				696	goto utf7Error; \
				697	} else { \
				698	*out++ = outCh; \
				699	} \
				700	} \
				701
				702	static
				703	int utf7_decoding_error(Py_UNICODE **dest,
				704	const char *errors,
				705	const char *details)
				706	{
				707	if ((errors == NULL) \|\|
				708	(strcmp(errors,"strict") == 0)) {
				709	PyErr_Format(PyExc_UnicodeError,
				710	"UTF-7 decoding error: %.400s",
				711	details);
				712	return -1;
				713	}
				714	else if (strcmp(errors,"ignore") == 0) {
				715	return 0;
				716	}
				717	else if (strcmp(errors,"replace") == 0) {
				718	if (dest != NULL) {
				719	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				720	(*dest)++;
				721	}
				722	return 0;
				723	}
				724	else {
				725	PyErr_Format(PyExc_ValueError,
				726	"UTF-7 decoding error; unknown error handling code: %.400s",
				727	errors);
				728	return -1;
				729	}
				730	}
				731
				732	PyObject PyUnicode_DecodeUTF7(const char s,
				733	int size,
				734	const char *errors)
				735	{
				736	const char *e;
				737	PyUnicodeObject *unicode;
				738	Py_UNICODE *p;
				739	const char *errmsg = "";
				740	int inShift = 0;
				741	unsigned int bitsleft = 0;
				742	unsigned long charsleft = 0;
				743	int surrogate = 0;
				744
				745	unicode = _PyUnicode_New(size);
				746	if (!unicode)
				747	return NULL;
				748	if (size == 0)
				749	return (PyObject *)unicode;
				750
				751	p = unicode->str;
				752	e = s + size;
				753
				754	while (s < e) {
				755	Py_UNICODE ch = *s;
				756
				757	if (inShift) {
				758	if ((ch == '-') \|\| !B64CHAR(ch)) {
				759	inShift = 0;
				760	s++;
				761
				762	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				763	if (bitsleft >= 6) {
				764	/* The shift sequence has a partial character in it. If
				765	bitsleft < 6 then we could just classify it as padding
				766	but that is not the case here */
				767
				768	errmsg = "partial character in shift sequence";
				769	goto utf7Error;
				770	}
				771	/* According to RFC2152 the remaining bits should be zero. We
				772	choose to signal an error/insert a replacement character
				773	here so indicate the potential of a misencoded character. */
				774
				775	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				776	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				777	errmsg = "non-zero padding bits in shift sequence";
				778	goto utf7Error;
				779	}
				780
				781	if (ch == '-') {
				782	if ((s < e) && (*(s) == '-')) {
				783	*p++ = '-';
				784	inShift = 1;
				785	}
				786	} else if (SPECIAL(ch,0,0)) {
				787	errmsg = "unexpected special character";
				788	goto utf7Error;
				789	} else {
				790	*p++ = ch;
				791	}
				792	} else {
				793	charsleft = (charsleft << 6) \| UB64(ch);
				794	bitsleft += 6;
				795	s++;
				796	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				797	}
				798	}
				799	else if ( ch == '+' ) {
				800	s++;
				801	if (s < e && *s == '-') {
				802	s++;
				803	*p++ = '+';
				804	} else
				805	{
				806	inShift = 1;
				807	bitsleft = 0;
				808	}
				809	}
				810	else if (SPECIAL(ch,0,0)) {
				811	errmsg = "unexpected special character";
				812	s++;
				813	goto utf7Error;
				814	}
				815	else {
				816	*p++ = ch;
				817	s++;
				818	}
				819	continue;
				820	utf7Error:
				821	if (utf7_decoding_error(&p, errors, errmsg))
				822	goto onError;
				823	}
				824
				825	if (inShift) {
				826	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				827	goto onError;
				828	}
				829
				830	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				831	goto onError;
				832
				833	return (PyObject *)unicode;
				834
				835	onError:
				836	Py_DECREF(unicode);
				837	return NULL;
				838	}
				839
				840
				841	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				842	int size,
				843	int encodeSetO,
				844	int encodeWhiteSpace,
				845	const char *errors)
				846	{
				847	PyObject *v;
				848	/* It might be possible to tighten this worst case */
				849	unsigned int cbAllocated = 5 * size;
				850	int inShift = 0;
				851	int i = 0;
				852	unsigned int bitsleft = 0;
				853	unsigned long charsleft = 0;
				854	char * out;
				855	char * start;
				856
				857	if (size == 0)
				858	return PyString_FromStringAndSize(NULL, 0);
				859
				860	v = PyString_FromStringAndSize(NULL, cbAllocated);
				861	if (v == NULL)
				862	return NULL;
				863
				864	start = out = PyString_AS_STRING(v);
				865	for (;i < size; ++i) {
				866	Py_UNICODE ch = s[i];
				867
				868	if (!inShift) {
				869	if (ch == '+') {
				870	*out++ = '+';
				871	*out++ = '-';
				872	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				873	charsleft = ch;
				874	bitsleft = 16;
				875	*out++ = '+';
				876	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				877	inShift = bitsleft > 0;
				878	} else {
				879	*out++ = (char) ch;
				880	}
				881	} else {
				882	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				883	*out++ = B64(charsleft << (6-bitsleft));
				884	charsleft = 0;
				885	bitsleft = 0;
				886	/* Characters not in the BASE64 set implicitly unshift the sequence
				887	so no '-' is required, except if the character is itself a '-' */
				888	if (B64CHAR(ch) \|\| ch == '-') {
				889	*out++ = '-';
				890	}
				891	inShift = 0;
				892	*out++ = (char) ch;
				893	} else {
				894	bitsleft += 16;
				895	charsleft = (charsleft << 16) \| ch;
				896	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				897
				898	/* If the next character is special then we dont' need to terminate
				899	the shift sequence. If the next character is not a BASE64 character
				900	or '-' then the shift sequence will be terminated implicitly and we
				901	don't have to insert a '-'. */
				902
				903	if (bitsleft == 0) {
				904	if (i + 1 < size) {
				905	Py_UNICODE ch2 = s[i+1];
				906
				907	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				908
				909	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				910	*out++ = '-';
				911	inShift = 0;
				912	} else {
				913	inShift = 0;
				914	}
				915
				916	}
				917	else {
				918	*out++ = '-';
				919	inShift = 0;
				920	}
				921	}
				922	}
				923	}
				924	}
				925	if (bitsleft) {
				926	*out++= B64(charsleft << (6-bitsleft) );
				927	*out++ = '-';
				928	}
				929
				930	if (_PyString_Resize(&v, out - start)) {
				931	Py_DECREF(v);
				932	return NULL;
				933	}
				934	return v;
				935	}
				936
				937	#undef SPECIAL
				938	#undef B64
				939	#undef B64CHAR
				940	#undef UB64
				941	#undef ENCODE
				942	#undef DECODE
				943
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	944	/* --- UTF-8 Codec -------------------------------------------------------- */
				945
				946	static
				947	char utf8_code_length[256] = {
				948	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				949	illegal prefix. see RFC 2279 for details */
				950	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				951	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				952	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				953	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				954	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				955	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				956	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				957	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				958	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				959	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				960	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				961	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				962	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				963	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				964	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				965	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				966	};
				967
				968	static
				969	int utf8_decoding_error(const char **source,
				970	Py_UNICODE **dest,
				971	const char *errors,
				972	const char *details)
				973	{
				974	if ((errors == NULL) \|\|
				975	(strcmp(errors,"strict") == 0)) {
				976	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	977	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	978	details);
				979	return -1;
				980	}
				981	else if (strcmp(errors,"ignore") == 0) {
				982	(*source)++;
				983	return 0;
				984	}
				985	else if (strcmp(errors,"replace") == 0) {
				986	(*source)++;
				987	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				988	(*dest)++;
				989	return 0;
				990	}
				991	else {
				992	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	993	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	994	errors);
				995	return -1;
				996	}
				997	}
				998
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	PyObject PyUnicode_DecodeUTF8(const char s,
				1000	int size,
				1001	const char *errors)
				1002	{
				1003	int n;
				1004	const char *e;
				1005	PyUnicodeObject *unicode;
				1006	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1007	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008
				1009	/* Note: size will always be longer than the resulting Unicode
				1010	character count */
				1011	unicode = _PyUnicode_New(size);
				1012	if (!unicode)
				1013	return NULL;
				1014	if (size == 0)
				1015	return (PyObject *)unicode;
				1016
				1017	/* Unpack UTF-8 encoded data */
				1018	p = unicode->str;
				1019	e = s + size;
				1020
				1021	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1022	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023
				1024	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1025	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1026	s++;
				1027	continue;
				1028	}
				1029
				1030	n = utf8_code_length[ch];
				1031
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1032	if (s + n > e) {
				1033	errmsg = "unexpected end of data";
				1034	goto utf8Error;
				1035	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1036
				1037	switch (n) {
				1038
				1039	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1040	errmsg = "unexpected code byte";
				1041	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1042
				1043	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1044	errmsg = "internal error";
				1045	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1046
				1047	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1048	if ((s[1] & 0xc0) != 0x80) {
				1049	errmsg = "invalid data";
				1050	goto utf8Error;
				1051	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1052	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1053	if (ch < 0x80) {
				1054	errmsg = "illegal encoding";
				1055	goto utf8Error;
				1056	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1058	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1059	break;
				1060
				1061	case 3:
				1062	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1063	(s[2] & 0xc0) != 0x80) {
				1064	errmsg = "invalid data";
				1065	goto utf8Error;
				1066	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1067	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1068	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				1069	errmsg = "illegal encoding";
				1070	goto utf8Error;
				1071	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1072	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1073	*p++ = (Py_UNICODE)ch;
				1074	break;
				1075
				1076	case 4:
				1077	if ((s[1] & 0xc0) != 0x80 \|\|
				1078	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1079	(s[3] & 0xc0) != 0x80) {
				1080	errmsg = "invalid data";
				1081	goto utf8Error;
				1082	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1083	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1084	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1085	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1086	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1087	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1088	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1089	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1090	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1091	errmsg = "illegal encoding";
				1092	goto utf8Error;
				1093	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1094	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1095	*p++ = (Py_UNICODE)ch;
				1096	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1097	/* compute and append the two surrogates: */
				1098
				1099	/* translate from 10000..10FFFF to 0..FFFF */
				1100	ch -= 0x10000;
				1101
				1102	/* high surrogate = top 10 bits added to D800 */
				1103	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1104
				1105	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1106	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1107	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	break;
				1109
				1110	default:
				1111	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1112	errmsg = "unsupported Unicode code range";
				1113	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1114	}
				1115	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1116	continue;
				1117
				1118	utf8Error:
				1119	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1120	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1121	}
				1122
				1123	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1124	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1125	goto onError;
				1126
				1127	return (PyObject *)unicode;
				1128
				1129	onError:
				1130	Py_DECREF(unicode);
				1131	return NULL;
				1132	}
				1133
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1134	/* Not used anymore, now that the encoder supports UTF-16
				1135	surrogates. */
#if 0
/* Apply the error handling policy named by `errors` to a UTF-8 encoding
   problem described by `details`.  Compiled out.

   Returns -1 with an exception set for "strict", a NULL policy or an
   unknown policy name, and 0 otherwise.  "replace" stores '?' through
   *dest and advances *dest. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* No policy given means "strict". */
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1167
				1168	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				1169	int size,
				1170	const char *errors)
				1171	{
				1172	PyObject *v;
				1173	char *p;
				1174	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1175	Py_UCS4 ch2;
				1176	unsigned int cbAllocated = 3 * size;
				1177	unsigned int cbWritten = 0;
				1178	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1179
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1180	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1181	if (v == NULL)
				1182	return NULL;
				1183	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1184	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1185
				1186	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1187	while (i < size) {
				1188	Py_UCS4 ch = s[i++];
				1189	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1190	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1191	cbWritten++;
				1192	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1193	else if (ch < 0x0800) {
				1194	*p++ = 0xc0 \| (ch >> 6);
				1195	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1196	cbWritten += 2;
				1197	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1198	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1199	/* Check for high surrogate */
				1200	if (0xD800 <= ch && ch <= 0xDBFF) {
				1201	if (i != size) {
				1202	ch2 = s[i];
				1203	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				1204
				1205	if (cbWritten >= (cbAllocated - 4)) {
				1206	/* Provide enough room for some more
				1207	surrogates */
				1208	cbAllocated += 4*10;
				1209	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1210	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1211	}
				1212
				1213	/* combine the two values */
				1214	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				1215
				1216	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1217	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1218	i++;
				1219	cbWritten += 4;
				1220	}
				1221	}
				1222	}
				1223	else {
				1224	*p++ = (char)(0xe0 \| (ch >> 12));
				1225	cbWritten += 3;
				1226	}
				1227	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1228	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1229	} else {
				1230	*p++ = 0xf0 \| (ch>>18);
				1231	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				1232	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				1233	*p++ = 0x80 \| (ch & 0x3f);
				1234	cbWritten += 4;
				1235	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1236	}
				1237	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1238	if (_PyString_Resize(&v, p - q))
				1239	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1240	return v;
				1241
				1242	onError:
				1243	Py_DECREF(v);
				1244	return NULL;
				1245	}
				1246
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1247	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1248	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1249	if (!PyUnicode_Check(unicode)) {
				1250	PyErr_BadArgument();
				1251	return NULL;
				1252	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1253	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1254	PyUnicode_GET_SIZE(unicode),
				1255	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1256	}
				1257
				1258	/* --- UTF-16 Codec ------------------------------------------------------- */
				1259
				1260	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1261	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1262	const char *errors,
				1263	const char *details)
				1264	{
				1265	if ((errors == NULL) \|\|
				1266	(strcmp(errors,"strict") == 0)) {
				1267	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1268	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1269	details);
				1270	return -1;
				1271	}
				1272	else if (strcmp(errors,"ignore") == 0) {
				1273	return 0;
				1274	}
				1275	else if (strcmp(errors,"replace") == 0) {
				1276	if (dest) {
				1277	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1278	(*dest)++;
				1279	}
				1280	return 0;
				1281	}
				1282	else {
				1283	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1284	"UTF-16 decoding error; "
				1285	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1286	errors);
				1287	return -1;
				1288	}
				1289	}
				1290
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1291	PyObject *
				1292	PyUnicode_DecodeUTF16(const char *s,
				1293	int size,
				1294	const char *errors,
				1295	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1296	{
				1297	PyUnicodeObject *unicode;
				1298	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1299	const unsigned char q, e;
				1300	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1301	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1302	/* Offsets from q for retrieving byte pairs in the right order. */
				1303	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1304	int ihi = 1, ilo = 0;
				1305	#else
				1306	int ihi = 0, ilo = 1;
				1307	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1308
				1309	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1310	if (size & 1) {
				1311	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1312	return NULL;
				1313	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1314	}
				1315
				1316	/* Note: size will always be longer than the resulting Unicode
				1317	character count */
				1318	unicode = _PyUnicode_New(size);
				1319	if (!unicode)
				1320	return NULL;
				1321	if (size == 0)
				1322	return (PyObject *)unicode;
				1323
				1324	/* Unpack UTF-16 encoded data */
				1325	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1326	q = (unsigned char *)s;
				1327	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1328
				1329	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1330	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1331
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1332	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1333	byte order setting accordingly. In native mode, the leading BOM
				1334	mark is skipped, in all other modes, it is copied to the output
				1335	stream as-is (giving a ZWNBSP character). */
				1336	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1337	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1338	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1339	if (bom == 0xFEFF) {
				1340	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1341	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1342	}
				1343	else if (bom == 0xFFFE) {
				1344	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1345	bo = 1;
				1346	}
				1347	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1348	if (bom == 0xFEFF) {
				1349	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1350	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1351	}
				1352	else if (bom == 0xFFFE) {
				1353	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1354	bo = -1;
				1355	}
				1356	#endif
				1357	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1358
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1359	if (bo == -1) {
				1360	/* force LE */
				1361	ihi = 1;
				1362	ilo = 0;
				1363	}
				1364	else if (bo == 1) {
				1365	/* force BE */
				1366	ihi = 0;
				1367	ilo = 1;
				1368	}
				1369
				1370	while (q < e) {
				1371	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1372	q += 2;
				1373
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1374	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1375	*p++ = ch;
				1376	continue;
				1377	}
				1378
				1379	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1380	if (q >= e) {
				1381	errmsg = "unexpected end of data";
				1382	goto utf16Error;
				1383	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1384	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1385	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1386	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1387	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1388	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1389	*p++ = ch;
				1390	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1391	#else
				1392	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1393	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1394	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1395	}
				1396	else {
				1397	errmsg = "illegal UTF-16 surrogate";
				1398	goto utf16Error;
				1399	}
				1400
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1401	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1402	errmsg = "illegal encoding";
				1403	/* Fall through to report the error */
				1404
				1405	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1406	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1407	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1408	}
				1409
				1410	if (byteorder)
				1411	*byteorder = bo;
				1412
				1413	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1414	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	goto onError;
				1416
				1417	return (PyObject *)unicode;
				1418
				1419	onError:
				1420	Py_DECREF(unicode);
				1421	return NULL;
				1422	}
				1423
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1424	PyObject *
				1425	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1426	int size,
				1427	const char *errors,
				1428	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1429	{
				1430	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1431	unsigned char *p;
				1432	int i, pairs;
				1433	/* Offsets from p for storing byte pairs in the right order. */
				1434	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1435	int ihi = 1, ilo = 0;
				1436	#else
				1437	int ihi = 0, ilo = 1;
				1438	#endif
				1439
				1440	#define STORECHAR(CH) \
				1441	do { \
				1442	p[ihi] = ((CH) >> 8) & 0xff; \
				1443	p[ilo] = (CH) & 0xff; \
				1444	p += 2; \
				1445	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1446
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1447	for (i = pairs = 0; i < size; i++)
				1448	if (s[i] >= 0x10000)
				1449	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1450	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1451	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1452	if (v == NULL)
				1453	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1454
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1455	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1456	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1457	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1458	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1459	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1460
				1461	if (byteorder == -1) {
				1462	/* force LE */
				1463	ihi = 1;
				1464	ilo = 0;
				1465	}
				1466	else if (byteorder == 1) {
				1467	/* force BE */
				1468	ihi = 0;
				1469	ilo = 1;
				1470	}
				1471
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1472	while (size-- > 0) {
				1473	Py_UNICODE ch = *s++;
				1474	Py_UNICODE ch2 = 0;
				1475	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1476	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1477	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1478	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1479	STORECHAR(ch);
				1480	if (ch2)
				1481	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1482	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1483	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1484	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	}
				1486
				1487	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1488	{
				1489	if (!PyUnicode_Check(unicode)) {
				1490	PyErr_BadArgument();
				1491	return NULL;
				1492	}
				1493	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1494	PyUnicode_GET_SIZE(unicode),
				1495	NULL,
				1496	0);
				1497	}
				1498
				1499	/* --- Unicode Escape Codec ----------------------------------------------- */
				1500
				1501	static
				1502	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1503	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1504	const char *errors,
				1505	const char *details)
				1506	{
				1507	if ((errors == NULL) \|\|
				1508	(strcmp(errors,"strict") == 0)) {
				1509	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1510	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1511	details);
				1512	return -1;
				1513	}
				1514	else if (strcmp(errors,"ignore") == 0) {
				1515	return 0;
				1516	}
				1517	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1518	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1519	return 0;
				1520	}
				1521	else {
				1522	PyErr_Format(PyExc_ValueError,
				1523	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1524	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1525	errors);
				1526	return -1;
				1527	}
				1528	}
				1529
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1530	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1531
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1532	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1533	int size,
				1534	const char *errors)
				1535	{
				1536	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1537	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1538	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1539	char* message;
				1540	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1541
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1542	/* Escaped strings will always be longer than the resulting
				1543	Unicode string, so we start with size here and then reduce the
				1544	length after conversion to the true value. */
				1545	v = _PyUnicode_New(size);
				1546	if (v == NULL)
				1547	goto onError;
				1548	if (size == 0)
				1549	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1550
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1551	p = buf = PyUnicode_AS_UNICODE(v);
				1552	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1553
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1554	while (s < end) {
				1555	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1556	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1557	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1558
				1559	/* Non-escape characters are interpreted as Unicode ordinals */
				1560	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1561	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1562	continue;
				1563	}
				1564
				1565	/* \ - Escapes */
				1566	s++;
				1567	switch (*s++) {
				1568
				1569	/* \x escapes */
				1570	case '\n': break;
				1571	case '\\': *p++ = '\\'; break;
				1572	case '\'': *p++ = '\''; break;
				1573	case '\"': *p++ = '\"'; break;
				1574	case 'b': *p++ = '\b'; break;
				1575	case 'f': p++ = '\014'; break; / FF */
				1576	case 't': *p++ = '\t'; break;
				1577	case 'n': *p++ = '\n'; break;
				1578	case 'r': *p++ = '\r'; break;
				1579	case 'v': p++ = '\013'; break; / VT */
				1580	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1581
				1582	/* \OOO (octal) escapes */
				1583	case '0': case '1': case '2': case '3':
				1584	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1585	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1586	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1587	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1588	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1589	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1591	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1592	break;
				1593
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1594	/* hex escapes */
				1595	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1596	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1597	digits = 2;
				1598	message = "truncated \\xXX escape";
				1599	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1600
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1601	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1602	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1603	digits = 4;
				1604	message = "truncated \\uXXXX escape";
				1605	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1606
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1607	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1608	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1609	digits = 8;
				1610	message = "truncated \\UXXXXXXXX escape";
				1611	hexescape:
				1612	chr = 0;
				1613	for (i = 0; i < digits; i++) {
				1614	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1615	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1616	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1617	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1618	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1619	i++;
				1620	break;
				1621	}
				1622	chr = (chr<<4) & ~0xF;
				1623	if (c >= '0' && c <= '9')
				1624	chr += c - '0';
				1625	else if (c >= 'a' && c <= 'f')
				1626	chr += 10 + c - 'a';
				1627	else
				1628	chr += 10 + c - 'A';
				1629	}
				1630	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1631	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1632	/* when we get here, chr is a 32-bit unicode character */
				1633	if (chr <= 0xffff)
				1634	/* UCS-2 character */
				1635	*p++ = (Py_UNICODE) chr;
				1636	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1637	/* UCS-4 character. Either store directly, or as
				1638	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1639	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1640	*p++ = chr;
				1641	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1642	chr -= 0x10000L;
				1643	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1644	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1645	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1646	} else {
				1647	if (unicodeescape_decoding_error(
				1648	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1649	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1650	)
				1651	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1652	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1653	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1654	break;
				1655
				1656	/* \N{name} */
				1657	case 'N':
				1658	message = "malformed \\N character escape";
				1659	if (ucnhash_CAPI == NULL) {
				1660	/* load the unicode data module */
				1661	PyObject m, v;
				1662	m = PyImport_ImportModule("unicodedata");
				1663	if (m == NULL)
				1664	goto ucnhashError;
				1665	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1666	Py_DECREF(m);
				1667	if (v == NULL)
				1668	goto ucnhashError;
				1669	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1670	Py_DECREF(v);
				1671	if (ucnhash_CAPI == NULL)
				1672	goto ucnhashError;
				1673	}
				1674	if (*s == '{') {
				1675	const char *start = s+1;
				1676	/* look for the closing brace */
				1677	while (*s != '}' && s < end)
				1678	s++;
				1679	if (s > start && s < end && *s == '}') {
				1680	/* found a name. look it up in the unicode database */
				1681	message = "unknown Unicode character name";
				1682	s++;
				1683	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1684	goto store;
				1685	}
				1686	}
				1687	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1688	goto onError;
				1689	*p++ = x;
				1690	break;
				1691
				1692	default:
				1693	*p++ = '\\';
				1694	*p++ = (unsigned char)s[-1];
				1695	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1696	}
				1697	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1698	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1699	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1700	return (PyObject *)v;
				1701
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1702	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1703	PyErr_SetString(
				1704	PyExc_UnicodeError,
				1705	"\\N escapes not supported (can't load unicodedata module)"
				1706	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1707	return NULL;
				1708
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1709	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1710	Py_XDECREF(v);
				1711	return NULL;
				1712	}
				1713
				1714	/* Return a Unicode-Escape string version of the Unicode object.
				1715
				1716	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1717	appropriate.
				1718
				1719	*/
				1720
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1721	static const Py_UNICODE findchar(const Py_UNICODE s,
				1722	int size,
				1723	Py_UNICODE ch);
				1724
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1725	static
				1726	PyObject unicodeescape_string(const Py_UNICODE s,
				1727	int size,
				1728	int quotes)
				1729	{
				1730	PyObject *repr;
				1731	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1732
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1733	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1734
				1735	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1736	if (repr == NULL)
				1737	return NULL;
				1738
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1739	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1740
				1741	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1742	*p++ = 'u';
				1743	*p++ = (findchar(s, size, '\'') &&
				1744	!findchar(s, size, '"')) ? '"' : '\'';
				1745	}
				1746	while (size-- > 0) {
				1747	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1748
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1749	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1750	if (quotes &&
				1751	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1752	*p++ = '\\';
				1753	*p++ = (char) ch;
Guido van Rossum	ad9744a	2001-09-21 15:38:17 +0000	[diff] [blame]	1754	continue;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1755	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1756
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1757	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1758	/* Map 21-bit characters to '\U00xxxxxx' */
				1759	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1760	int offset = p - PyString_AS_STRING(repr);
				1761
				1762	/* Resize the string if necessary */
				1763	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1764	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1765	goto onError;
				1766	p = PyString_AS_STRING(repr) + offset;
				1767	}
				1768
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1769	*p++ = '\\';
				1770	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1771	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1772	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1773	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1774	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1775	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1776	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1777	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1778	*p++ = hexdigit[ch & 0x0000000F];
				1779	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1780	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1781	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1782	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1783	else if (ch >= 0xD800 && ch < 0xDC00) {
				1784	Py_UNICODE ch2;
				1785	Py_UCS4 ucs;
				1786
				1787	ch2 = *s++;
				1788	size--;
				1789	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1790	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1791	*p++ = '\\';
				1792	*p++ = 'U';
				1793	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1794	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1795	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1796	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1797	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1798	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1799	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1800	*p++ = hexdigit[ucs & 0x0000000F];
				1801	continue;
				1802	}
				1803	/* Fall through: isolated surrogates are copied as-is */
				1804	s--;
				1805	size++;
				1806	}
				1807
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1808	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1809	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1810	*p++ = '\\';
				1811	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1812	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1813	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1814	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1815	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1816	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1817
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1818	/* Map special whitespace to '\t', \n', '\r' */
				1819	else if (ch == '\t') {
				1820	*p++ = '\\';
				1821	*p++ = 't';
				1822	}
				1823	else if (ch == '\n') {
				1824	*p++ = '\\';
				1825	*p++ = 'n';
				1826	}
				1827	else if (ch == '\r') {
				1828	*p++ = '\\';
				1829	*p++ = 'r';
				1830	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1831
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1832	/* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg	11326de	2001-11-28 12:56:20 +0000	[diff] [blame]	1833	else if (ch < ' ' \|\| ch >= 0x7F) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1834	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1835	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1836	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1837	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1838	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1839
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1840	/* Copy everything else as-is */
				1841	else
				1842	*p++ = (char) ch;
				1843	}
				1844	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1845	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1846
				1847	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1848	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1849	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1850
				1851	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1852
				1853	onError:
				1854	Py_DECREF(repr);
				1855	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1856	}
				1857
				1858	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1859	int size)
				1860	{
				1861	return unicodeescape_string(s, size, 0);
				1862	}
				1863
				1864	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1865	{
				1866	if (!PyUnicode_Check(unicode)) {
				1867	PyErr_BadArgument();
				1868	return NULL;
				1869	}
				1870	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1871	PyUnicode_GET_SIZE(unicode));
				1872	}
				1873
				1874	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1875
				1876	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1877	int size,
				1878	const char *errors)
				1879	{
				1880	PyUnicodeObject *v;
				1881	Py_UNICODE p, buf;
				1882	const char *end;
				1883	const char *bs;
				1884
				1885	/* Escaped strings will always be longer than the resulting
				1886	Unicode string, so we start with size here and then reduce the
				1887	length after conversion to the true value. */
				1888	v = _PyUnicode_New(size);
				1889	if (v == NULL)
				1890	goto onError;
				1891	if (size == 0)
				1892	return (PyObject *)v;
				1893	p = buf = PyUnicode_AS_UNICODE(v);
				1894	end = s + size;
				1895	while (s < end) {
				1896	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1897	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1898	int i;
				1899
				1900	/* Non-escape characters are interpreted as Unicode ordinals */
				1901	if (*s != '\\') {
				1902	p++ = (unsigned char)s++;
				1903	continue;
				1904	}
				1905
				1906	/* \u-escapes are only interpreted iff the number of leading
				1907	backslashes if odd */
				1908	bs = s;
				1909	for (;s < end;) {
				1910	if (*s != '\\')
				1911	break;
				1912	p++ = (unsigned char)s++;
				1913	}
				1914	if (((s - bs) & 1) == 0 \|\|
				1915	s >= end \|\|
				1916	*s != 'u') {
				1917	continue;
				1918	}
				1919	p--;
				1920	s++;
				1921
				1922	/* \uXXXX with 4 hex digits */
				1923	for (x = 0, i = 0; i < 4; i++) {
				1924	c = (unsigned char)s[i];
				1925	if (!isxdigit(c)) {
				1926	if (unicodeescape_decoding_error(&s, &x, errors,
				1927	"truncated \\uXXXX"))
				1928	goto onError;
				1929	i++;
				1930	break;
				1931	}
				1932	x = (x<<4) & ~0xF;
				1933	if (c >= '0' && c <= '9')
				1934	x += c - '0';
				1935	else if (c >= 'a' && c <= 'f')
				1936	x += 10 + c - 'a';
				1937	else
				1938	x += 10 + c - 'A';
				1939	}
				1940	s += i;
				1941	*p++ = x;
				1942	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1943	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1944	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1945	return (PyObject *)v;
				1946
				1947	onError:
				1948	Py_XDECREF(v);
				1949	return NULL;
				1950	}
				1951
				1952	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1953	int size)
				1954	{
				1955	PyObject *repr;
				1956	char *p;
				1957	char *q;
				1958
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1959	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1960
				1961	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1962	if (repr == NULL)
				1963	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1964	if (size == 0)
				1965	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1966
				1967	p = q = PyString_AS_STRING(repr);
				1968	while (size-- > 0) {
				1969	Py_UNICODE ch = *s++;
				1970	/* Map 16-bit characters to '\uxxxx' */
				1971	if (ch >= 256) {
				1972	*p++ = '\\';
				1973	*p++ = 'u';
				1974	*p++ = hexdigit[(ch >> 12) & 0xf];
				1975	*p++ = hexdigit[(ch >> 8) & 0xf];
				1976	*p++ = hexdigit[(ch >> 4) & 0xf];
				1977	*p++ = hexdigit[ch & 15];
				1978	}
				1979	/* Copy everything else as-is */
				1980	else
				1981	*p++ = (char) ch;
				1982	}
				1983	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1984	if (_PyString_Resize(&repr, p - q))
				1985	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1986
				1987	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1988
				1989	onError:
				1990	Py_DECREF(repr);
				1991	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1992	}
				1993
				1994	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1995	{
				1996	if (!PyUnicode_Check(unicode)) {
				1997	PyErr_BadArgument();
				1998	return NULL;
				1999	}
				2000	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				2001	PyUnicode_GET_SIZE(unicode));
				2002	}
				2003
				2004	/* --- Latin-1 Codec ------------------------------------------------------ */
				2005
				2006	PyObject PyUnicode_DecodeLatin1(const char s,
				2007	int size,
				2008	const char *errors)
				2009	{
				2010	PyUnicodeObject *v;
				2011	Py_UNICODE *p;
				2012
				2013	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2014	if (size == 1 && (unsigned char)s < 256) {
				2015	Py_UNICODE r = (unsigned char)s;
				2016	return PyUnicode_FromUnicode(&r, 1);
				2017	}
				2018
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2019	v = _PyUnicode_New(size);
				2020	if (v == NULL)
				2021	goto onError;
				2022	if (size == 0)
				2023	return (PyObject *)v;
				2024	p = PyUnicode_AS_UNICODE(v);
				2025	while (size-- > 0)
				2026	p++ = (unsigned char)s++;
				2027	return (PyObject *)v;
				2028
				2029	onError:
				2030	Py_XDECREF(v);
				2031	return NULL;
				2032	}
				2033
				2034	static
				2035	int latin1_encoding_error(const Py_UNICODE **source,
				2036	char **dest,
				2037	const char *errors,
				2038	const char *details)
				2039	{
				2040	if ((errors == NULL) \|\|
				2041	(strcmp(errors,"strict") == 0)) {
				2042	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2043	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2044	details);
				2045	return -1;
				2046	}
				2047	else if (strcmp(errors,"ignore") == 0) {
				2048	return 0;
				2049	}
				2050	else if (strcmp(errors,"replace") == 0) {
				2051	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2052	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2053	return 0;
				2054	}
				2055	else {
				2056	PyErr_Format(PyExc_ValueError,
				2057	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2058	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2059	errors);
				2060	return -1;
				2061	}
				2062	}
				2063
				2064	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2065	int size,
				2066	const char *errors)
				2067	{
				2068	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2069	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2070
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2071	repr = PyString_FromStringAndSize(NULL, size);
				2072	if (repr == NULL)
				2073	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2074	if (size == 0)
				2075	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2076
				2077	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2078	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2079	while (size-- > 0) {
				2080	Py_UNICODE ch = *p++;
				2081	if (ch >= 256) {
				2082	if (latin1_encoding_error(&p, &s, errors,
				2083	"ordinal not in range(256)"))
				2084	goto onError;
				2085	}
				2086	else
				2087	*s++ = (char)ch;
				2088	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2089	/* Resize if error handling skipped some characters */
				2090	if (s - start < PyString_GET_SIZE(repr))
				2091	if (_PyString_Resize(&repr, s - start))
				2092	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2093	return repr;
				2094
				2095	onError:
				2096	Py_DECREF(repr);
				2097	return NULL;
				2098	}
				2099
				2100	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2101	{
				2102	if (!PyUnicode_Check(unicode)) {
				2103	PyErr_BadArgument();
				2104	return NULL;
				2105	}
				2106	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2107	PyUnicode_GET_SIZE(unicode),
				2108	NULL);
				2109	}
				2110
				2111	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2112
				2113	static
				2114	int ascii_decoding_error(const char **source,
				2115	Py_UNICODE **dest,
				2116	const char *errors,
				2117	const char *details)
				2118	{
				2119	if ((errors == NULL) \|\|
				2120	(strcmp(errors,"strict") == 0)) {
				2121	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2122	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2123	details);
				2124	return -1;
				2125	}
				2126	else if (strcmp(errors,"ignore") == 0) {
				2127	return 0;
				2128	}
				2129	else if (strcmp(errors,"replace") == 0) {
				2130	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2131	(*dest)++;
				2132	return 0;
				2133	}
				2134	else {
				2135	PyErr_Format(PyExc_ValueError,
				2136	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2137	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2138	errors);
				2139	return -1;
				2140	}
				2141	}
				2142
				2143	PyObject PyUnicode_DecodeASCII(const char s,
				2144	int size,
				2145	const char *errors)
				2146	{
				2147	PyUnicodeObject *v;
				2148	Py_UNICODE *p;
				2149
				2150	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2151	if (size == 1 && (unsigned char)s < 128) {
				2152	Py_UNICODE r = (unsigned char)s;
				2153	return PyUnicode_FromUnicode(&r, 1);
				2154	}
				2155
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2156	v = _PyUnicode_New(size);
				2157	if (v == NULL)
				2158	goto onError;
				2159	if (size == 0)
				2160	return (PyObject *)v;
				2161	p = PyUnicode_AS_UNICODE(v);
				2162	while (size-- > 0) {
				2163	register unsigned char c;
				2164
				2165	c = (unsigned char)*s++;
				2166	if (c < 128)
				2167	*p++ = c;
				2168	else if (ascii_decoding_error(&s, &p, errors,
				2169	"ordinal not in range(128)"))
				2170	goto onError;
				2171	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2172	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2173	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2174	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2175	return (PyObject *)v;
				2176
				2177	onError:
				2178	Py_XDECREF(v);
				2179	return NULL;
				2180	}
				2181
				2182	static
				2183	int ascii_encoding_error(const Py_UNICODE **source,
				2184	char **dest,
				2185	const char *errors,
				2186	const char *details)
				2187	{
				2188	if ((errors == NULL) \|\|
				2189	(strcmp(errors,"strict") == 0)) {
				2190	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2191	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2192	details);
				2193	return -1;
				2194	}
				2195	else if (strcmp(errors,"ignore") == 0) {
				2196	return 0;
				2197	}
				2198	else if (strcmp(errors,"replace") == 0) {
				2199	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2200	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2201	return 0;
				2202	}
				2203	else {
				2204	PyErr_Format(PyExc_ValueError,
				2205	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2206	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2207	errors);
				2208	return -1;
				2209	}
				2210	}
				2211
				2212	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2213	int size,
				2214	const char *errors)
				2215	{
				2216	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2217	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2218
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2219	repr = PyString_FromStringAndSize(NULL, size);
				2220	if (repr == NULL)
				2221	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2222	if (size == 0)
				2223	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2224
				2225	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2226	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2227	while (size-- > 0) {
				2228	Py_UNICODE ch = *p++;
				2229	if (ch >= 128) {
				2230	if (ascii_encoding_error(&p, &s, errors,
				2231	"ordinal not in range(128)"))
				2232	goto onError;
				2233	}
				2234	else
				2235	*s++ = (char)ch;
				2236	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2237	/* Resize if error handling skipped some characters */
				2238	if (s - start < PyString_GET_SIZE(repr))
				2239	if (_PyString_Resize(&repr, s - start))
				2240	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2241	return repr;
				2242
				2243	onError:
				2244	Py_DECREF(repr);
				2245	return NULL;
				2246	}
				2247
				2248	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2249	{
				2250	if (!PyUnicode_Check(unicode)) {
				2251	PyErr_BadArgument();
				2252	return NULL;
				2253	}
				2254	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2255	PyUnicode_GET_SIZE(unicode),
				2256	NULL);
				2257	}
				2258
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2259	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2260
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2261	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2262
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2263	PyObject PyUnicode_DecodeMBCS(const char s,
				2264	int size,
				2265	const char *errors)
				2266	{
				2267	PyUnicodeObject *v;
				2268	Py_UNICODE *p;
				2269
				2270	/* First get the size of the result */
				2271	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2272	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2273	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2274
				2275	v = _PyUnicode_New(usize);
				2276	if (v == NULL)
				2277	return NULL;
				2278	if (usize == 0)
				2279	return (PyObject *)v;
				2280	p = PyUnicode_AS_UNICODE(v);
				2281	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2282	Py_DECREF(v);
				2283	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2284	}
				2285
				2286	return (PyObject *)v;
				2287	}
				2288
				2289	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2290	int size,
				2291	const char *errors)
				2292	{
				2293	PyObject *repr;
				2294	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2295	DWORD mbcssize;
				2296
				2297	/* If there are no characters, bail now! */
				2298	if (size==0)
				2299	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2300
				2301	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2302	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2303	if (mbcssize==0)
				2304	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2305
				2306	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2307	if (repr == NULL)
				2308	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2309	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2310	return repr;
				2311
				2312	/* Do the conversion */
				2313	s = PyString_AS_STRING(repr);
				2314	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2315	Py_DECREF(repr);
				2316	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2317	}
				2318	return repr;
				2319	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2320
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2321	#endif /* MS_WIN32 */
				2322
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2323	/* --- Character Mapping Codec -------------------------------------------- */
				2324
				2325	static
				2326	int charmap_decoding_error(const char **source,
				2327	Py_UNICODE **dest,
				2328	const char *errors,
				2329	const char *details)
				2330	{
				2331	if ((errors == NULL) \|\|
				2332	(strcmp(errors,"strict") == 0)) {
				2333	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2334	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2335	details);
				2336	return -1;
				2337	}
				2338	else if (strcmp(errors,"ignore") == 0) {
				2339	return 0;
				2340	}
				2341	else if (strcmp(errors,"replace") == 0) {
				2342	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2343	(*dest)++;
				2344	return 0;
				2345	}
				2346	else {
				2347	PyErr_Format(PyExc_ValueError,
				2348	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2349	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2350	errors);
				2351	return -1;
				2352	}
				2353	}
				2354
				2355	PyObject PyUnicode_DecodeCharmap(const char s,
				2356	int size,
				2357	PyObject *mapping,
				2358	const char *errors)
				2359	{
				2360	PyUnicodeObject *v;
				2361	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2362	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2363
				2364	/* Default to Latin-1 */
				2365	if (mapping == NULL)
				2366	return PyUnicode_DecodeLatin1(s, size, errors);
				2367
				2368	v = _PyUnicode_New(size);
				2369	if (v == NULL)
				2370	goto onError;
				2371	if (size == 0)
				2372	return (PyObject *)v;
				2373	p = PyUnicode_AS_UNICODE(v);
				2374	while (size-- > 0) {
				2375	unsigned char ch = *s++;
				2376	PyObject w, x;
				2377
				2378	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2379	w = PyInt_FromLong((long)ch);
				2380	if (w == NULL)
				2381	goto onError;
				2382	x = PyObject_GetItem(mapping, w);
				2383	Py_DECREF(w);
				2384	if (x == NULL) {
				2385	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2386	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2387	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2388	x = Py_None;
				2389	Py_INCREF(x);
				2390	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2391	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2392	}
				2393
				2394	/* Apply mapping */
				2395	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2396	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2397	if (value < 0 \|\| value > 65535) {
				2398	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2399	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2400	Py_DECREF(x);
				2401	goto onError;
				2402	}
				2403	*p++ = (Py_UNICODE)value;
				2404	}
				2405	else if (x == Py_None) {
				2406	/* undefined mapping */
				2407	if (charmap_decoding_error(&s, &p, errors,
				2408	"character maps to <undefined>")) {
				2409	Py_DECREF(x);
				2410	goto onError;
				2411	}
				2412	}
				2413	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2414	int targetsize = PyUnicode_GET_SIZE(x);
				2415
				2416	if (targetsize == 1)
				2417	/* 1-1 mapping */
				2418	p++ = PyUnicode_AS_UNICODE(x);
				2419
				2420	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2421	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2422	if (targetsize > extrachars) {
				2423	/* resize first */
				2424	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2425	int needed = (targetsize - extrachars) + \
				2426	(targetsize << 2);
				2427	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2428	if (_PyUnicode_Resize(&v,
				2429	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2430	Py_DECREF(x);
				2431	goto onError;
				2432	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2433	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2434	}
				2435	Py_UNICODE_COPY(p,
				2436	PyUnicode_AS_UNICODE(x),
				2437	targetsize);
				2438	p += targetsize;
				2439	extrachars -= targetsize;
				2440	}
				2441	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2442	}
				2443	else {
				2444	/* wrong return value */
				2445	PyErr_SetString(PyExc_TypeError,
				2446	"character mapping must return integer, None or unicode");
				2447	Py_DECREF(x);
				2448	goto onError;
				2449	}
				2450	Py_DECREF(x);
				2451	}
				2452	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2453	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2454	goto onError;
				2455	return (PyObject *)v;
				2456
				2457	onError:
				2458	Py_XDECREF(v);
				2459	return NULL;
				2460	}
				2461
				2462	static
				2463	int charmap_encoding_error(const Py_UNICODE **source,
				2464	char **dest,
				2465	const char *errors,
				2466	const char *details)
				2467	{
				2468	if ((errors == NULL) \|\|
				2469	(strcmp(errors,"strict") == 0)) {
				2470	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2471	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2472	details);
				2473	return -1;
				2474	}
				2475	else if (strcmp(errors,"ignore") == 0) {
				2476	return 0;
				2477	}
				2478	else if (strcmp(errors,"replace") == 0) {
				2479	**dest = '?';
				2480	(*dest)++;
				2481	return 0;
				2482	}
				2483	else {
				2484	PyErr_Format(PyExc_ValueError,
				2485	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2486	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2487	errors);
				2488	return -1;
				2489	}
				2490	}
				2491
				2492	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2493	int size,
				2494	PyObject *mapping,
				2495	const char *errors)
				2496	{
				2497	PyObject *v;
				2498	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2499	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2500
				2501	/* Default to Latin-1 */
				2502	if (mapping == NULL)
				2503	return PyUnicode_EncodeLatin1(p, size, errors);
				2504
				2505	v = PyString_FromStringAndSize(NULL, size);
				2506	if (v == NULL)
				2507	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2508	if (size == 0)
				2509	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2510	s = PyString_AS_STRING(v);
				2511	while (size-- > 0) {
				2512	Py_UNICODE ch = *p++;
				2513	PyObject w, x;
				2514
				2515	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2516	w = PyInt_FromLong((long)ch);
				2517	if (w == NULL)
				2518	goto onError;
				2519	x = PyObject_GetItem(mapping, w);
				2520	Py_DECREF(w);
				2521	if (x == NULL) {
				2522	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2523	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2524	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2525	x = Py_None;
				2526	Py_INCREF(x);
				2527	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2528	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2529	}
				2530
				2531	/* Apply mapping */
				2532	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2533	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2534	if (value < 0 \|\| value > 255) {
				2535	PyErr_SetString(PyExc_TypeError,
				2536	"character mapping must be in range(256)");
				2537	Py_DECREF(x);
				2538	goto onError;
				2539	}
				2540	*s++ = (char)value;
				2541	}
				2542	else if (x == Py_None) {
				2543	/* undefined mapping */
				2544	if (charmap_encoding_error(&p, &s, errors,
				2545	"character maps to <undefined>")) {
				2546	Py_DECREF(x);
				2547	goto onError;
				2548	}
				2549	}
				2550	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2551	int targetsize = PyString_GET_SIZE(x);
				2552
				2553	if (targetsize == 1)
				2554	/* 1-1 mapping */
				2555	s++ = PyString_AS_STRING(x);
				2556
				2557	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2558	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2559	if (targetsize > extrachars) {
				2560	/* resize first */
				2561	int oldpos = (int)(s - PyString_AS_STRING(v));
				2562	int needed = (targetsize - extrachars) + \
				2563	(targetsize << 2);
				2564	extrachars += needed;
				2565	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2566	Py_DECREF(x);
				2567	goto onError;
				2568	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2569	s = PyString_AS_STRING(v) + oldpos;
				2570	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2571	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2572	s += targetsize;
				2573	extrachars -= targetsize;
				2574	}
				2575	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2576	}
				2577	else {
				2578	/* wrong return value */
				2579	PyErr_SetString(PyExc_TypeError,
				2580	"character mapping must return integer, None or unicode");
				2581	Py_DECREF(x);
				2582	goto onError;
				2583	}
				2584	Py_DECREF(x);
				2585	}
				2586	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2587	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2588	goto onError;
				2589	return v;
				2590
				2591	onError:
				2592	Py_DECREF(v);
				2593	return NULL;
				2594	}
				2595
				2596	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2597	PyObject *mapping)
				2598	{
				2599	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2600	PyErr_BadArgument();
				2601	return NULL;
				2602	}
				2603	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2604	PyUnicode_GET_SIZE(unicode),
				2605	mapping,
				2606	NULL);
				2607	}
				2608
				2609	static
				2610	int translate_error(const Py_UNICODE **source,
				2611	Py_UNICODE **dest,
				2612	const char *errors,
				2613	const char *details)
				2614	{
				2615	if ((errors == NULL) \|\|
				2616	(strcmp(errors,"strict") == 0)) {
				2617	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2618	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2619	details);
				2620	return -1;
				2621	}
				2622	else if (strcmp(errors,"ignore") == 0) {
				2623	return 0;
				2624	}
				2625	else if (strcmp(errors,"replace") == 0) {
				2626	**dest = '?';
				2627	(*dest)++;
				2628	return 0;
				2629	}
				2630	else {
				2631	PyErr_Format(PyExc_ValueError,
				2632	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2633	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2634	errors);
				2635	return -1;
				2636	}
				2637	}
				2638
				2639	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2640	int size,
				2641	PyObject *mapping,
				2642	const char *errors)
				2643	{
				2644	PyUnicodeObject *v;
				2645	Py_UNICODE *p;
				2646
				2647	if (mapping == NULL) {
				2648	PyErr_BadArgument();
				2649	return NULL;
				2650	}
				2651
				2652	/* Output will never be longer than input */
				2653	v = _PyUnicode_New(size);
				2654	if (v == NULL)
				2655	goto onError;
				2656	if (size == 0)
				2657	goto done;
				2658	p = PyUnicode_AS_UNICODE(v);
				2659	while (size-- > 0) {
				2660	Py_UNICODE ch = *s++;
				2661	PyObject w, x;
				2662
				2663	/* Get mapping */
				2664	w = PyInt_FromLong(ch);
				2665	if (w == NULL)
				2666	goto onError;
				2667	x = PyObject_GetItem(mapping, w);
				2668	Py_DECREF(w);
				2669	if (x == NULL) {
				2670	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2671	/* No mapping found: default to 1-1 mapping */
				2672	PyErr_Clear();
				2673	*p++ = ch;
				2674	continue;
				2675	}
				2676	goto onError;
				2677	}
				2678
				2679	/* Apply mapping */
				2680	if (PyInt_Check(x))
				2681	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2682	else if (x == Py_None) {
				2683	/* undefined mapping */
				2684	if (translate_error(&s, &p, errors,
				2685	"character maps to <undefined>")) {
				2686	Py_DECREF(x);
				2687	goto onError;
				2688	}
				2689	}
				2690	else if (PyUnicode_Check(x)) {
				2691	if (PyUnicode_GET_SIZE(x) != 1) {
				2692	/* 1-n mapping */
				2693	PyErr_SetString(PyExc_NotImplementedError,
				2694	"1-n mappings are currently not implemented");
				2695	Py_DECREF(x);
				2696	goto onError;
				2697	}
				2698	p++ = PyUnicode_AS_UNICODE(x);
				2699	}
				2700	else {
				2701	/* wrong return value */
				2702	PyErr_SetString(PyExc_TypeError,
				2703	"translate mapping must return integer, None or unicode");
				2704	Py_DECREF(x);
				2705	goto onError;
				2706	}
				2707	Py_DECREF(x);
				2708	}
				2709	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2710	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2711	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2712
				2713	done:
				2714	return (PyObject *)v;
				2715
				2716	onError:
				2717	Py_XDECREF(v);
				2718	return NULL;
				2719	}
				2720
				2721	PyObject PyUnicode_Translate(PyObject str,
				2722	PyObject *mapping,
				2723	const char *errors)
				2724	{
				2725	PyObject *result;
				2726
				2727	str = PyUnicode_FromObject(str);
				2728	if (str == NULL)
				2729	goto onError;
				2730	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2731	PyUnicode_GET_SIZE(str),
				2732	mapping,
				2733	errors);
				2734	Py_DECREF(str);
				2735	return result;
				2736
				2737	onError:
				2738	Py_XDECREF(str);
				2739	return NULL;
				2740	}
				2741
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2742	/* --- Decimal Encoder ---------------------------------------------------- */
				2743
				2744	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2745	int length,
				2746	char *output,
				2747	const char *errors)
				2748	{
				2749	Py_UNICODE p, end;
				2750
				2751	if (output == NULL) {
				2752	PyErr_BadArgument();
				2753	return -1;
				2754	}
				2755
				2756	p = s;
				2757	end = s + length;
				2758	while (p < end) {
				2759	register Py_UNICODE ch = *p++;
				2760	int decimal;
				2761
				2762	if (Py_UNICODE_ISSPACE(ch)) {
				2763	*output++ = ' ';
				2764	continue;
				2765	}
				2766	decimal = Py_UNICODE_TODECIMAL(ch);
				2767	if (decimal >= 0) {
				2768	*output++ = '0' + decimal;
				2769	continue;
				2770	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2771	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2772	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2773	continue;
				2774	}
				2775	/* All other characters are considered invalid */
				2776	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2777	PyErr_SetString(PyExc_ValueError,
				2778	"invalid decimal Unicode string");
				2779	goto onError;
				2780	}
				2781	else if (strcmp(errors, "ignore") == 0)
				2782	continue;
				2783	else if (strcmp(errors, "replace") == 0) {
				2784	*output++ = '?';
				2785	continue;
				2786	}
				2787	}
				2788	/* 0-terminate the output string */
				2789	*output++ = '\0';
				2790	return 0;
				2791
				2792	onError:
				2793	return -1;
				2794	}
				2795
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2796	/* --- Helpers ------------------------------------------------------------ */
				2797
				2798	static
				2799	int count(PyUnicodeObject *self,
				2800	int start,
				2801	int end,
				2802	PyUnicodeObject *substring)
				2803	{
				2804	int count = 0;
				2805
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2806	if (start < 0)
				2807	start += self->length;
				2808	if (start < 0)
				2809	start = 0;
				2810	if (end > self->length)
				2811	end = self->length;
				2812	if (end < 0)
				2813	end += self->length;
				2814	if (end < 0)
				2815	end = 0;
				2816
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2817	if (substring->length == 0)
				2818	return (end - start + 1);
				2819
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2820	end -= substring->length;
				2821
				2822	while (start <= end)
				2823	if (Py_UNICODE_MATCH(self, start, substring)) {
				2824	count++;
				2825	start += substring->length;
				2826	} else
				2827	start++;
				2828
				2829	return count;
				2830	}
				2831
				2832	int PyUnicode_Count(PyObject *str,
				2833	PyObject *substr,
				2834	int start,
				2835	int end)
				2836	{
				2837	int result;
				2838
				2839	str = PyUnicode_FromObject(str);
				2840	if (str == NULL)
				2841	return -1;
				2842	substr = PyUnicode_FromObject(substr);
				2843	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2844	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2845	return -1;
				2846	}
				2847
				2848	result = count((PyUnicodeObject *)str,
				2849	start, end,
				2850	(PyUnicodeObject *)substr);
				2851
				2852	Py_DECREF(str);
				2853	Py_DECREF(substr);
				2854	return result;
				2855	}
				2856
				2857	static
				2858	int findstring(PyUnicodeObject *self,
				2859	PyUnicodeObject *substring,
				2860	int start,
				2861	int end,
				2862	int direction)
				2863	{
				2864	if (start < 0)
				2865	start += self->length;
				2866	if (start < 0)
				2867	start = 0;
				2868
				2869	if (substring->length == 0)
				2870	return start;
				2871
				2872	if (end > self->length)
				2873	end = self->length;
				2874	if (end < 0)
				2875	end += self->length;
				2876	if (end < 0)
				2877	end = 0;
				2878
				2879	end -= substring->length;
				2880
				2881	if (direction < 0) {
				2882	for (; end >= start; end--)
				2883	if (Py_UNICODE_MATCH(self, end, substring))
				2884	return end;
				2885	} else {
				2886	for (; start <= end; start++)
				2887	if (Py_UNICODE_MATCH(self, start, substring))
				2888	return start;
				2889	}
				2890
				2891	return -1;
				2892	}
				2893
				2894	int PyUnicode_Find(PyObject *str,
				2895	PyObject *substr,
				2896	int start,
				2897	int end,
				2898	int direction)
				2899	{
				2900	int result;
				2901
				2902	str = PyUnicode_FromObject(str);
				2903	if (str == NULL)
				2904	return -1;
				2905	substr = PyUnicode_FromObject(substr);
				2906	if (substr == NULL) {
				2907	Py_DECREF(substr);
				2908	return -1;
				2909	}
				2910
				2911	result = findstring((PyUnicodeObject *)str,
				2912	(PyUnicodeObject *)substr,
				2913	start, end, direction);
				2914	Py_DECREF(str);
				2915	Py_DECREF(substr);
				2916	return result;
				2917	}
				2918
				2919	static
				2920	int tailmatch(PyUnicodeObject *self,
				2921	PyUnicodeObject *substring,
				2922	int start,
				2923	int end,
				2924	int direction)
				2925	{
				2926	if (start < 0)
				2927	start += self->length;
				2928	if (start < 0)
				2929	start = 0;
				2930
				2931	if (substring->length == 0)
				2932	return 1;
				2933
				2934	if (end > self->length)
				2935	end = self->length;
				2936	if (end < 0)
				2937	end += self->length;
				2938	if (end < 0)
				2939	end = 0;
				2940
				2941	end -= substring->length;
				2942	if (end < start)
				2943	return 0;
				2944
				2945	if (direction > 0) {
				2946	if (Py_UNICODE_MATCH(self, end, substring))
				2947	return 1;
				2948	} else {
				2949	if (Py_UNICODE_MATCH(self, start, substring))
				2950	return 1;
				2951	}
				2952
				2953	return 0;
				2954	}
				2955
				2956	int PyUnicode_Tailmatch(PyObject *str,
				2957	PyObject *substr,
				2958	int start,
				2959	int end,
				2960	int direction)
				2961	{
				2962	int result;
				2963
				2964	str = PyUnicode_FromObject(str);
				2965	if (str == NULL)
				2966	return -1;
				2967	substr = PyUnicode_FromObject(substr);
				2968	if (substr == NULL) {
				2969	Py_DECREF(substr);
				2970	return -1;
				2971	}
				2972
				2973	result = tailmatch((PyUnicodeObject *)str,
				2974	(PyUnicodeObject *)substr,
				2975	start, end, direction);
				2976	Py_DECREF(str);
				2977	Py_DECREF(substr);
				2978	return result;
				2979	}
				2980
				2981	static
				2982	const Py_UNICODE findchar(const Py_UNICODE s,
				2983	int size,
				2984	Py_UNICODE ch)
				2985	{
				2986	/* like wcschr, but doesn't stop at NULL characters */
				2987
				2988	while (size-- > 0) {
				2989	if (*s == ch)
				2990	return s;
				2991	s++;
				2992	}
				2993
				2994	return NULL;
				2995	}
				2996
				2997	/* Apply fixfct filter to the Unicode object self and return a
				2998	reference to the modified object */
				2999
				3000	static
				3001	PyObject fixup(PyUnicodeObject self,
				3002	int (fixfct)(PyUnicodeObject s))
				3003	{
				3004
				3005	PyUnicodeObject *u;
				3006
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3007	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3008	if (u == NULL)
				3009	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3010
				3011	Py_UNICODE_COPY(u->str, self->str, self->length);
				3012
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3013	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3014	/* fixfct should return TRUE if it modified the buffer. If
				3015	FALSE, return a reference to the original buffer instead
				3016	(to save space, not time) */
				3017	Py_INCREF(self);
				3018	Py_DECREF(u);
				3019	return (PyObject*) self;
				3020	}
				3021	return (PyObject*) u;
				3022	}
				3023
				3024	static
				3025	int fixupper(PyUnicodeObject *self)
				3026	{
				3027	int len = self->length;
				3028	Py_UNICODE *s = self->str;
				3029	int status = 0;
				3030
				3031	while (len-- > 0) {
				3032	register Py_UNICODE ch;
				3033
				3034	ch = Py_UNICODE_TOUPPER(*s);
				3035	if (ch != *s) {
				3036	status = 1;
				3037	*s = ch;
				3038	}
				3039	s++;
				3040	}
				3041
				3042	return status;
				3043	}
				3044
				3045	static
				3046	int fixlower(PyUnicodeObject *self)
				3047	{
				3048	int len = self->length;
				3049	Py_UNICODE *s = self->str;
				3050	int status = 0;
				3051
				3052	while (len-- > 0) {
				3053	register Py_UNICODE ch;
				3054
				3055	ch = Py_UNICODE_TOLOWER(*s);
				3056	if (ch != *s) {
				3057	status = 1;
				3058	*s = ch;
				3059	}
				3060	s++;
				3061	}
				3062
				3063	return status;
				3064	}
				3065
				3066	static
				3067	int fixswapcase(PyUnicodeObject *self)
				3068	{
				3069	int len = self->length;
				3070	Py_UNICODE *s = self->str;
				3071	int status = 0;
				3072
				3073	while (len-- > 0) {
				3074	if (Py_UNICODE_ISUPPER(*s)) {
				3075	s = Py_UNICODE_TOLOWER(s);
				3076	status = 1;
				3077	} else if (Py_UNICODE_ISLOWER(*s)) {
				3078	s = Py_UNICODE_TOUPPER(s);
				3079	status = 1;
				3080	}
				3081	s++;
				3082	}
				3083
				3084	return status;
				3085	}
				3086
				3087	static
				3088	int fixcapitalize(PyUnicodeObject *self)
				3089	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3090	int len = self->length;
				3091	Py_UNICODE *s = self->str;
				3092	int status = 0;
				3093
				3094	if (len == 0)
				3095	return 0;
				3096	if (Py_UNICODE_ISLOWER(*s)) {
				3097	s = Py_UNICODE_TOUPPER(s);
				3098	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3099	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3100	s++;
				3101	while (--len > 0) {
				3102	if (Py_UNICODE_ISUPPER(*s)) {
				3103	s = Py_UNICODE_TOLOWER(s);
				3104	status = 1;
				3105	}
				3106	s++;
				3107	}
				3108	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3109	}
				3110
				3111	static
				3112	int fixtitle(PyUnicodeObject *self)
				3113	{
				3114	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3115	register Py_UNICODE *e;
				3116	int previous_is_cased;
				3117
				3118	/* Shortcut for single character strings */
				3119	if (PyUnicode_GET_SIZE(self) == 1) {
				3120	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3121	if (*p != ch) {
				3122	*p = ch;
				3123	return 1;
				3124	}
				3125	else
				3126	return 0;
				3127	}
				3128
				3129	e = p + PyUnicode_GET_SIZE(self);
				3130	previous_is_cased = 0;
				3131	for (; p < e; p++) {
				3132	register const Py_UNICODE ch = *p;
				3133
				3134	if (previous_is_cased)
				3135	*p = Py_UNICODE_TOLOWER(ch);
				3136	else
				3137	*p = Py_UNICODE_TOTITLE(ch);
				3138
				3139	if (Py_UNICODE_ISLOWER(ch) \|\|
				3140	Py_UNICODE_ISUPPER(ch) \|\|
				3141	Py_UNICODE_ISTITLE(ch))
				3142	previous_is_cased = 1;
				3143	else
				3144	previous_is_cased = 0;
				3145	}
				3146	return 1;
				3147	}
				3148
				3149	PyObject PyUnicode_Join(PyObject separator,
				3150	PyObject *seq)
				3151	{
				3152	Py_UNICODE *sep;
				3153	int seplen;
				3154	PyUnicodeObject *res = NULL;
				3155	int reslen = 0;
				3156	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3157	int sz = 100;
				3158	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3159	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3160
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3161	it = PyObject_GetIter(seq);
				3162	if (it == NULL)
				3163	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3164
				3165	if (separator == NULL) {
				3166	Py_UNICODE blank = ' ';
				3167	sep = &blank;
				3168	seplen = 1;
				3169	}
				3170	else {
				3171	separator = PyUnicode_FromObject(separator);
				3172	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3173	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3174	sep = PyUnicode_AS_UNICODE(separator);
				3175	seplen = PyUnicode_GET_SIZE(separator);
				3176	}
				3177
				3178	res = _PyUnicode_New(sz);
				3179	if (res == NULL)
				3180	goto onError;
				3181	p = PyUnicode_AS_UNICODE(res);
				3182	reslen = 0;
				3183
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3184	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3185	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3186	PyObject *item = PyIter_Next(it);
				3187	if (item == NULL) {
				3188	if (PyErr_Occurred())
				3189	goto onError;
				3190	break;
				3191	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3192	if (!PyUnicode_Check(item)) {
				3193	PyObject *v;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3194	if (!PyString_Check(item)) {
				3195	PyErr_Format(PyExc_TypeError,
				3196	"sequence item %i: expected string or Unicode,"
				3197	" %.80s found",
				3198	i, item->ob_type->tp_name);
				3199	Py_DECREF(item);
				3200	goto onError;
				3201	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3202	v = PyUnicode_FromObject(item);
				3203	Py_DECREF(item);
				3204	item = v;
				3205	if (item == NULL)
				3206	goto onError;
				3207	}
				3208	itemlen = PyUnicode_GET_SIZE(item);
				3209	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3210	if (_PyUnicode_Resize(&res, sz*2)) {
				3211	Py_DECREF(item);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3212	goto onError;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3214	sz *= 2;
				3215	p = PyUnicode_AS_UNICODE(res) + reslen;
				3216	}
				3217	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3218	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3219	p += seplen;
				3220	reslen += seplen;
				3221	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3222	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3223	p += itemlen;
				3224	reslen += itemlen;
				3225	Py_DECREF(item);
				3226	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3227	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3228	goto onError;
				3229
				3230	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3231	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3232	return (PyObject *)res;
				3233
				3234	onError:
				3235	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3236	Py_XDECREF(res);
				3237	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3238	return NULL;
				3239	}
				3240
				3241	static
				3242	PyUnicodeObject pad(PyUnicodeObject self,
				3243	int left,
				3244	int right,
				3245	Py_UNICODE fill)
				3246	{
				3247	PyUnicodeObject *u;
				3248
				3249	if (left < 0)
				3250	left = 0;
				3251	if (right < 0)
				3252	right = 0;
				3253
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3254	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3255	Py_INCREF(self);
				3256	return self;
				3257	}
				3258
				3259	u = _PyUnicode_New(left + self->length + right);
				3260	if (u) {
				3261	if (left)
				3262	Py_UNICODE_FILL(u->str, fill, left);
				3263	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3264	if (right)
				3265	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3266	}
				3267
				3268	return u;
				3269	}
				3270
				3271	#define SPLIT_APPEND(data, left, right) \
				3272	str = PyUnicode_FromUnicode(data + left, right - left); \
				3273	if (!str) \
				3274	goto onError; \
				3275	if (PyList_Append(list, str)) { \
				3276	Py_DECREF(str); \
				3277	goto onError; \
				3278	} \
				3279	else \
				3280	Py_DECREF(str);
				3281
				3282	static
				3283	PyObject split_whitespace(PyUnicodeObject self,
				3284	PyObject *list,
				3285	int maxcount)
				3286	{
				3287	register int i;
				3288	register int j;
				3289	int len = self->length;
				3290	PyObject *str;
				3291
				3292	for (i = j = 0; i < len; ) {
				3293	/* find a token */
				3294	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3295	i++;
				3296	j = i;
				3297	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3298	i++;
				3299	if (j < i) {
				3300	if (maxcount-- <= 0)
				3301	break;
				3302	SPLIT_APPEND(self->str, j, i);
				3303	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3304	i++;
				3305	j = i;
				3306	}
				3307	}
				3308	if (j < len) {
				3309	SPLIT_APPEND(self->str, j, len);
				3310	}
				3311	return list;
				3312
				3313	onError:
				3314	Py_DECREF(list);
				3315	return NULL;
				3316	}
				3317
				3318	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3319	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3320	{
				3321	register int i;
				3322	register int j;
				3323	int len;
				3324	PyObject *list;
				3325	PyObject *str;
				3326	Py_UNICODE *data;
				3327
				3328	string = PyUnicode_FromObject(string);
				3329	if (string == NULL)
				3330	return NULL;
				3331	data = PyUnicode_AS_UNICODE(string);
				3332	len = PyUnicode_GET_SIZE(string);
				3333
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3334	list = PyList_New(0);
				3335	if (!list)
				3336	goto onError;
				3337
				3338	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3339	int eol;
				3340
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3341	/* Find a line and append it */
				3342	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3343	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3344
				3345	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3346	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3347	if (i < len) {
				3348	if (data[i] == '\r' && i + 1 < len &&
				3349	data[i+1] == '\n')
				3350	i += 2;
				3351	else
				3352	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3353	if (keepends)
				3354	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3355	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3356	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3357	j = i;
				3358	}
				3359	if (j < len) {
				3360	SPLIT_APPEND(data, j, len);
				3361	}
				3362
				3363	Py_DECREF(string);
				3364	return list;
				3365
				3366	onError:
				3367	Py_DECREF(list);
				3368	Py_DECREF(string);
				3369	return NULL;
				3370	}
				3371
				3372	static
				3373	PyObject split_char(PyUnicodeObject self,
				3374	PyObject *list,
				3375	Py_UNICODE ch,
				3376	int maxcount)
				3377	{
				3378	register int i;
				3379	register int j;
				3380	int len = self->length;
				3381	PyObject *str;
				3382
				3383	for (i = j = 0; i < len; ) {
				3384	if (self->str[i] == ch) {
				3385	if (maxcount-- <= 0)
				3386	break;
				3387	SPLIT_APPEND(self->str, j, i);
				3388	i = j = i + 1;
				3389	} else
				3390	i++;
				3391	}
				3392	if (j <= len) {
				3393	SPLIT_APPEND(self->str, j, len);
				3394	}
				3395	return list;
				3396
				3397	onError:
				3398	Py_DECREF(list);
				3399	return NULL;
				3400	}
				3401
				3402	static
				3403	PyObject split_substring(PyUnicodeObject self,
				3404	PyObject *list,
				3405	PyUnicodeObject *substring,
				3406	int maxcount)
				3407	{
				3408	register int i;
				3409	register int j;
				3410	int len = self->length;
				3411	int sublen = substring->length;
				3412	PyObject *str;
				3413
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3414	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3415	if (Py_UNICODE_MATCH(self, i, substring)) {
				3416	if (maxcount-- <= 0)
				3417	break;
				3418	SPLIT_APPEND(self->str, j, i);
				3419	i = j = i + sublen;
				3420	} else
				3421	i++;
				3422	}
				3423	if (j <= len) {
				3424	SPLIT_APPEND(self->str, j, len);
				3425	}
				3426	return list;
				3427
				3428	onError:
				3429	Py_DECREF(list);
				3430	return NULL;
				3431	}
				3432
				3433	#undef SPLIT_APPEND
				3434
				3435	static
				3436	PyObject split(PyUnicodeObject self,
				3437	PyUnicodeObject *substring,
				3438	int maxcount)
				3439	{
				3440	PyObject *list;
				3441
				3442	if (maxcount < 0)
				3443	maxcount = INT_MAX;
				3444
				3445	list = PyList_New(0);
				3446	if (!list)
				3447	return NULL;
				3448
				3449	if (substring == NULL)
				3450	return split_whitespace(self,list,maxcount);
				3451
				3452	else if (substring->length == 1)
				3453	return split_char(self,list,substring->str[0],maxcount);
				3454
				3455	else if (substring->length == 0) {
				3456	Py_DECREF(list);
				3457	PyErr_SetString(PyExc_ValueError, "empty separator");
				3458	return NULL;
				3459	}
				3460	else
				3461	return split_substring(self,list,substring,maxcount);
				3462	}
				3463
				3464	static
				3465	PyObject strip(PyUnicodeObject self,
				3466	int left,
				3467	int right)
				3468	{
				3469	Py_UNICODE *p = self->str;
				3470	int start = 0;
				3471	int end = self->length;
				3472
				3473	if (left)
				3474	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3475	start++;
				3476
				3477	if (right)
				3478	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3479	end--;
				3480
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3481	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3482	/* couldn't strip anything off, return original string */
				3483	Py_INCREF(self);
				3484	return (PyObject*) self;
				3485	}
				3486
				3487	return (PyObject*) PyUnicode_FromUnicode(
				3488	self->str + start,
				3489	end - start
				3490	);
				3491	}
				3492
				3493	static
				3494	PyObject replace(PyUnicodeObject self,
				3495	PyUnicodeObject *str1,
				3496	PyUnicodeObject *str2,
				3497	int maxcount)
				3498	{
				3499	PyUnicodeObject *u;
				3500
				3501	if (maxcount < 0)
				3502	maxcount = INT_MAX;
				3503
				3504	if (str1->length == 1 && str2->length == 1) {
				3505	int i;
				3506
				3507	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3508	if (!findchar(self->str, self->length, str1->str[0]) &&
				3509	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3510	/* nothing to replace, return original string */
				3511	Py_INCREF(self);
				3512	u = self;
				3513	} else {
				3514	Py_UNICODE u1 = str1->str[0];
				3515	Py_UNICODE u2 = str2->str[0];
				3516
				3517	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3518	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3519	self->length
				3520	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3521	if (u != NULL) {
				3522	Py_UNICODE_COPY(u->str, self->str,
				3523	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3524	for (i = 0; i < u->length; i++)
				3525	if (u->str[i] == u1) {
				3526	if (--maxcount < 0)
				3527	break;
				3528	u->str[i] = u2;
				3529	}
				3530	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3531	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3532
				3533	} else {
				3534	int n, i;
				3535	Py_UNICODE *p;
				3536
				3537	/* replace strings */
				3538	n = count(self, 0, self->length, str1);
				3539	if (n > maxcount)
				3540	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3541	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3542	/* nothing to replace, return original string */
				3543	Py_INCREF(self);
				3544	u = self;
				3545	} else {
				3546	u = _PyUnicode_New(
				3547	self->length + n * (str2->length - str1->length));
				3548	if (u) {
				3549	i = 0;
				3550	p = u->str;
				3551	while (i <= self->length - str1->length)
				3552	if (Py_UNICODE_MATCH(self, i, str1)) {
				3553	/* replace string segment */
				3554	Py_UNICODE_COPY(p, str2->str, str2->length);
				3555	p += str2->length;
				3556	i += str1->length;
				3557	if (--n <= 0) {
				3558	/* copy remaining part */
				3559	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3560	break;
				3561	}
				3562	} else
				3563	*p++ = self->str[i++];
				3564	}
				3565	}
				3566	}
				3567
				3568	return (PyObject *) u;
				3569	}
				3570
				3571	/* --- Unicode Object Methods --------------------------------------------- */
				3572
				3573	static char title__doc__[] =
				3574	"S.title() -> unicode\n\
				3575	\n\
				3576	Return a titlecased version of S, i.e. words start with title case\n\
				3577	characters, all remaining cased characters have lower case.";
				3578
				3579	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3580	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3581	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3582	return fixup(self, fixtitle);
				3583	}
				3584
				3585	static char capitalize__doc__[] =
				3586	"S.capitalize() -> unicode\n\
				3587	\n\
				3588	Return a capitalized version of S, i.e. make the first character\n\
				3589	have upper case.";
				3590
				3591	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3592	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3593	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3594	return fixup(self, fixcapitalize);
				3595	}
				3596
				3597	#if 0
				3598	static char capwords__doc__[] =
				3599	"S.capwords() -> unicode\n\
				3600	\n\
				3601	Apply .capitalize() to all words in S and return the result with\n\
				3602	normalized whitespace (all whitespace strings are replaced by ' ').";
				3603
				3604	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3605	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3606	{
				3607	PyObject *list;
				3608	PyObject *item;
				3609	int i;
				3610
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3611	/* Split into words */
				3612	list = split(self, NULL, -1);
				3613	if (!list)
				3614	return NULL;
				3615
				3616	/* Capitalize each word */
				3617	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3618	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3619	fixcapitalize);
				3620	if (item == NULL)
				3621	goto onError;
				3622	Py_DECREF(PyList_GET_ITEM(list, i));
				3623	PyList_SET_ITEM(list, i, item);
				3624	}
				3625
				3626	/* Join the words to form a new string */
				3627	item = PyUnicode_Join(NULL, list);
				3628
				3629	onError:
				3630	Py_DECREF(list);
				3631	return (PyObject *)item;
				3632	}
				3633	#endif
				3634
				3635	static char center__doc__[] =
				3636	"S.center(width) -> unicode\n\
				3637	\n\
				3638	Return S centered in a Unicode string of length width. Padding is done\n\
				3639	using spaces.";
				3640
				3641	static PyObject *
				3642	unicode_center(PyUnicodeObject self, PyObject args)
				3643	{
				3644	int marg, left;
				3645	int width;
				3646
				3647	if (!PyArg_ParseTuple(args, "i:center", &width))
				3648	return NULL;
				3649
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3650	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3651	Py_INCREF(self);
				3652	return (PyObject*) self;
				3653	}
				3654
				3655	marg = width - self->length;
				3656	left = marg / 2 + (marg & width & 1);
				3657
				3658	return (PyObject*) pad(self, left, marg - left, ' ');
				3659	}
				3660
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3661	#if 0
				3662
				3663	/* This code should go into some future Unicode collation support
				3664	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3665	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3666
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3667	/* speedy UTF-16 code point order comparison */
				3668	/* gleaned from: */
				3669	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3670
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3671	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3672	{
				3673	0, 0, 0, 0, 0, 0, 0, 0,
				3674	0, 0, 0, 0, 0, 0, 0, 0,
				3675	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3676	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3677	};
				3678
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3679	static int
				3680	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3681	{
				3682	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3683
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3684	Py_UNICODE *s1 = str1->str;
				3685	Py_UNICODE *s2 = str2->str;
				3686
				3687	len1 = str1->length;
				3688	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3689
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3690	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3691	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3692
				3693	c1 = *s1++;
				3694	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3695
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3696	if (c1 > (1<<11) * 26)
				3697	c1 += utf16Fixup[c1>>11];
				3698	if (c2 > (1<<11) * 26)
				3699	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3700	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3701
				3702	if (c1 != c2)
				3703	return (c1 < c2) ? -1 : 1;
				3704
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3705	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3706	}
				3707
				3708	return (len1 < len2) ? -1 : (len1 != len2);
				3709	}
				3710
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3711	#else
				3712
				3713	static int
				3714	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3715	{
				3716	register int len1, len2;
				3717
				3718	Py_UNICODE *s1 = str1->str;
				3719	Py_UNICODE *s2 = str2->str;
				3720
				3721	len1 = str1->length;
				3722	len2 = str2->length;
				3723
				3724	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3725	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3726
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3727	c1 = *s1++;
				3728	c2 = *s2++;
				3729
				3730	if (c1 != c2)
				3731	return (c1 < c2) ? -1 : 1;
				3732
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3733	len1--; len2--;
				3734	}
				3735
				3736	return (len1 < len2) ? -1 : (len1 != len2);
				3737	}
				3738
				3739	#endif
				3740
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3741	int PyUnicode_Compare(PyObject *left,
				3742	PyObject *right)
				3743	{
				3744	PyUnicodeObject u = NULL, v = NULL;
				3745	int result;
				3746
				3747	/* Coerce the two arguments */
				3748	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3749	if (u == NULL)
				3750	goto onError;
				3751	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3752	if (v == NULL)
				3753	goto onError;
				3754
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3755	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3756	if (v == u) {
				3757	Py_DECREF(u);
				3758	Py_DECREF(v);
				3759	return 0;
				3760	}
				3761
				3762	result = unicode_compare(u, v);
				3763
				3764	Py_DECREF(u);
				3765	Py_DECREF(v);
				3766	return result;
				3767
				3768	onError:
				3769	Py_XDECREF(u);
				3770	Py_XDECREF(v);
				3771	return -1;
				3772	}
				3773
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3774	int PyUnicode_Contains(PyObject *container,
				3775	PyObject *element)
				3776	{
				3777	PyUnicodeObject u = NULL, v = NULL;
				3778	int result;
				3779	register const Py_UNICODE p, e;
				3780	register Py_UNICODE ch;
				3781
				3782	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3783	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3784	if (v == NULL) {
				3785	PyErr_SetString(PyExc_TypeError,
				3786	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3787	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3788	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3789	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3790	if (u == NULL) {
				3791	Py_DECREF(v);
				3792	goto onError;
				3793	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3794
				3795	/* Check v in u */
				3796	if (PyUnicode_GET_SIZE(v) != 1) {
				3797	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3798	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3799	goto onError;
				3800	}
				3801	ch = *PyUnicode_AS_UNICODE(v);
				3802	p = PyUnicode_AS_UNICODE(u);
				3803	e = p + PyUnicode_GET_SIZE(u);
				3804	result = 0;
				3805	while (p < e) {
				3806	if (*p++ == ch) {
				3807	result = 1;
				3808	break;
				3809	}
				3810	}
				3811
				3812	Py_DECREF(u);
				3813	Py_DECREF(v);
				3814	return result;
				3815
				3816	onError:
				3817	Py_XDECREF(u);
				3818	Py_XDECREF(v);
				3819	return -1;
				3820	}
				3821
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3822	/* Concat to string or Unicode object giving a new Unicode object. */
				3823
				3824	PyObject PyUnicode_Concat(PyObject left,
				3825	PyObject *right)
				3826	{
				3827	PyUnicodeObject u = NULL, v = NULL, *w;
				3828
				3829	/* Coerce the two arguments */
				3830	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3831	if (u == NULL)
				3832	goto onError;
				3833	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3834	if (v == NULL)
				3835	goto onError;
				3836
				3837	/* Shortcuts */
				3838	if (v == unicode_empty) {
				3839	Py_DECREF(v);
				3840	return (PyObject *)u;
				3841	}
				3842	if (u == unicode_empty) {
				3843	Py_DECREF(u);
				3844	return (PyObject *)v;
				3845	}
				3846
				3847	/* Concat the two Unicode strings */
				3848	w = _PyUnicode_New(u->length + v->length);
				3849	if (w == NULL)
				3850	goto onError;
				3851	Py_UNICODE_COPY(w->str, u->str, u->length);
				3852	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3853
				3854	Py_DECREF(u);
				3855	Py_DECREF(v);
				3856	return (PyObject *)w;
				3857
				3858	onError:
				3859	Py_XDECREF(u);
				3860	Py_XDECREF(v);
				3861	return NULL;
				3862	}
				3863
				3864	static char count__doc__[] =
				3865	"S.count(sub[, start[, end]]) -> int\n\
				3866	\n\
				3867	Return the number of occurrences of substring sub in Unicode string\n\
				3868	S[start:end]. Optional arguments start and end are\n\
				3869	interpreted as in slice notation.";
				3870
				3871	static PyObject *
				3872	unicode_count(PyUnicodeObject self, PyObject args)
				3873	{
				3874	PyUnicodeObject *substring;
				3875	int start = 0;
				3876	int end = INT_MAX;
				3877	PyObject *result;
				3878
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3879	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3880	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3881	return NULL;
				3882
				3883	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3884	(PyObject *)substring);
				3885	if (substring == NULL)
				3886	return NULL;
				3887
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3888	if (start < 0)
				3889	start += self->length;
				3890	if (start < 0)
				3891	start = 0;
				3892	if (end > self->length)
				3893	end = self->length;
				3894	if (end < 0)
				3895	end += self->length;
				3896	if (end < 0)
				3897	end = 0;
				3898
				3899	result = PyInt_FromLong((long) count(self, start, end, substring));
				3900
				3901	Py_DECREF(substring);
				3902	return result;
				3903	}
				3904
				3905	static char encode__doc__[] =
				3906	"S.encode([encoding[,errors]]) -> string\n\
				3907	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3908	Return an encoded string version of S. Default encoding is the current\n\
				3909	default string encoding. errors may be given to set a different error\n\
				3910	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3911	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3912
				3913	static PyObject *
				3914	unicode_encode(PyUnicodeObject self, PyObject args)
				3915	{
				3916	char *encoding = NULL;
				3917	char *errors = NULL;
				3918	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3919	return NULL;
				3920	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3921	}
				3922
				3923	static char expandtabs__doc__[] =
				3924	"S.expandtabs([tabsize]) -> unicode\n\
				3925	\n\
				3926	Return a copy of S where all tab characters are expanded using spaces.\n\
				3927	If tabsize is not given, a tab size of 8 characters is assumed.";
				3928
				3929	static PyObject*
				3930	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3931	{
				3932	Py_UNICODE *e;
				3933	Py_UNICODE *p;
				3934	Py_UNICODE *q;
				3935	int i, j;
				3936	PyUnicodeObject *u;
				3937	int tabsize = 8;
				3938
				3939	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3940	return NULL;
				3941
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3942	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3943	i = j = 0;
				3944	e = self->str + self->length;
				3945	for (p = self->str; p < e; p++)
				3946	if (*p == '\t') {
				3947	if (tabsize > 0)
				3948	j += tabsize - (j % tabsize);
				3949	}
				3950	else {
				3951	j++;
				3952	if (p == '\n' \|\| p == '\r') {
				3953	i += j;
				3954	j = 0;
				3955	}
				3956	}
				3957
				3958	/* Second pass: create output string and fill it */
				3959	u = _PyUnicode_New(i + j);
				3960	if (!u)
				3961	return NULL;
				3962
				3963	j = 0;
				3964	q = u->str;
				3965
				3966	for (p = self->str; p < e; p++)
				3967	if (*p == '\t') {
				3968	if (tabsize > 0) {
				3969	i = tabsize - (j % tabsize);
				3970	j += i;
				3971	while (i--)
				3972	*q++ = ' ';
				3973	}
				3974	}
				3975	else {
				3976	j++;
				3977	q++ = p;
				3978	if (p == '\n' \|\| p == '\r')
				3979	j = 0;
				3980	}
				3981
				3982	return (PyObject*) u;
				3983	}
				3984
				3985	static char find__doc__[] =
				3986	"S.find(sub [,start [,end]]) -> int\n\
				3987	\n\
				3988	Return the lowest index in S where substring sub is found,\n\
				3989	such that sub is contained within s[start,end]. Optional\n\
				3990	arguments start and end are interpreted as in slice notation.\n\
				3991	\n\
				3992	Return -1 on failure.";
				3993
				3994	static PyObject *
				3995	unicode_find(PyUnicodeObject self, PyObject args)
				3996	{
				3997	PyUnicodeObject *substring;
				3998	int start = 0;
				3999	int end = INT_MAX;
				4000	PyObject *result;
				4001
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4002	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				4003	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4004	return NULL;
				4005	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4006	(PyObject *)substring);
				4007	if (substring == NULL)
				4008	return NULL;
				4009
				4010	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				4011
				4012	Py_DECREF(substring);
				4013	return result;
				4014	}
				4015
				4016	static PyObject *
				4017	unicode_getitem(PyUnicodeObject *self, int index)
				4018	{
				4019	if (index < 0 \|\| index >= self->length) {
				4020	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4021	return NULL;
				4022	}
				4023
				4024	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4025	}
				4026
				4027	static long
				4028	unicode_hash(PyUnicodeObject *self)
				4029	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4030	/* Since Unicode objects compare equal to their ASCII string
				4031	counterparts, they should use the individual character values
				4032	as basis for their hash value. This is needed to assure that
				4033	strings and Unicode objects behave in the same way as
				4034	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4035
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4036	register int len;
				4037	register Py_UNICODE *p;
				4038	register long x;
				4039
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4040	if (self->hash != -1)
				4041	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4042	len = PyUnicode_GET_SIZE(self);
				4043	p = PyUnicode_AS_UNICODE(self);
				4044	x = *p << 7;
				4045	while (--len >= 0)
				4046	x = (1000003x) ^ p++;
				4047	x ^= PyUnicode_GET_SIZE(self);
				4048	if (x == -1)
				4049	x = -2;
				4050	self->hash = x;
				4051	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4052	}
				4053
				4054	static char index__doc__[] =
				4055	"S.index(sub [,start [,end]]) -> int\n\
				4056	\n\
				4057	Like S.find() but raise ValueError when the substring is not found.";
				4058
				4059	static PyObject *
				4060	unicode_index(PyUnicodeObject self, PyObject args)
				4061	{
				4062	int result;
				4063	PyUnicodeObject *substring;
				4064	int start = 0;
				4065	int end = INT_MAX;
				4066
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4067	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				4068	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4069	return NULL;
				4070
				4071	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4072	(PyObject *)substring);
				4073	if (substring == NULL)
				4074	return NULL;
				4075
				4076	result = findstring(self, substring, start, end, 1);
				4077
				4078	Py_DECREF(substring);
				4079	if (result < 0) {
				4080	PyErr_SetString(PyExc_ValueError, "substring not found");
				4081	return NULL;
				4082	}
				4083	return PyInt_FromLong(result);
				4084	}
				4085
				4086	static char islower__doc__[] =
				4087	"S.islower() -> int\n\
				4088	\n\
				4089	Return 1 if all cased characters in S are lowercase and there is\n\
				4090	at least one cased character in S, 0 otherwise.";
				4091
				4092	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4093	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4094	{
				4095	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4096	register const Py_UNICODE *e;
				4097	int cased;
				4098
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4099	/* Shortcut for single character strings */
				4100	if (PyUnicode_GET_SIZE(self) == 1)
				4101	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				4102
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4103	/* Special case for empty strings */
				4104	if (PyString_GET_SIZE(self) == 0)
				4105	return PyInt_FromLong(0);
				4106
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4107	e = p + PyUnicode_GET_SIZE(self);
				4108	cased = 0;
				4109	for (; p < e; p++) {
				4110	register const Py_UNICODE ch = *p;
				4111
				4112	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4113	return PyInt_FromLong(0);
				4114	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4115	cased = 1;
				4116	}
				4117	return PyInt_FromLong(cased);
				4118	}
				4119
				4120	static char isupper__doc__[] =
				4121	"S.isupper() -> int\n\
				4122	\n\
				4123	Return 1 if all cased characters in S are uppercase and there is\n\
				4124	at least one cased character in S, 0 otherwise.";
				4125
				4126	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4127	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4128	{
				4129	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4130	register const Py_UNICODE *e;
				4131	int cased;
				4132
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4133	/* Shortcut for single character strings */
				4134	if (PyUnicode_GET_SIZE(self) == 1)
				4135	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				4136
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4137	/* Special case for empty strings */
				4138	if (PyString_GET_SIZE(self) == 0)
				4139	return PyInt_FromLong(0);
				4140
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4141	e = p + PyUnicode_GET_SIZE(self);
				4142	cased = 0;
				4143	for (; p < e; p++) {
				4144	register const Py_UNICODE ch = *p;
				4145
				4146	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4147	return PyInt_FromLong(0);
				4148	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4149	cased = 1;
				4150	}
				4151	return PyInt_FromLong(cased);
				4152	}
				4153
				4154	static char istitle__doc__[] =
				4155	"S.istitle() -> int\n\
				4156	\n\
				4157	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				4158	may only follow uncased characters and lowercase characters only cased\n\
				4159	ones. Return 0 otherwise.";
				4160
				4161	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4162	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4163	{
				4164	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4165	register const Py_UNICODE *e;
				4166	int cased, previous_is_cased;
				4167
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4168	/* Shortcut for single character strings */
				4169	if (PyUnicode_GET_SIZE(self) == 1)
				4170	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4171	(Py_UNICODE_ISUPPER(*p) != 0));
				4172
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4173	/* Special case for empty strings */
				4174	if (PyString_GET_SIZE(self) == 0)
				4175	return PyInt_FromLong(0);
				4176
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4177	e = p + PyUnicode_GET_SIZE(self);
				4178	cased = 0;
				4179	previous_is_cased = 0;
				4180	for (; p < e; p++) {
				4181	register const Py_UNICODE ch = *p;
				4182
				4183	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4184	if (previous_is_cased)
				4185	return PyInt_FromLong(0);
				4186	previous_is_cased = 1;
				4187	cased = 1;
				4188	}
				4189	else if (Py_UNICODE_ISLOWER(ch)) {
				4190	if (!previous_is_cased)
				4191	return PyInt_FromLong(0);
				4192	previous_is_cased = 1;
				4193	cased = 1;
				4194	}
				4195	else
				4196	previous_is_cased = 0;
				4197	}
				4198	return PyInt_FromLong(cased);
				4199	}
				4200
				4201	static char isspace__doc__[] =
				4202	"S.isspace() -> int\n\
				4203	\n\
				4204	Return 1 if there are only whitespace characters in S,\n\
				4205	0 otherwise.";
				4206
				4207	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4208	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4209	{
				4210	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4211	register const Py_UNICODE *e;
				4212
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4213	/* Shortcut for single character strings */
				4214	if (PyUnicode_GET_SIZE(self) == 1 &&
				4215	Py_UNICODE_ISSPACE(*p))
				4216	return PyInt_FromLong(1);
				4217
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4218	/* Special case for empty strings */
				4219	if (PyString_GET_SIZE(self) == 0)
				4220	return PyInt_FromLong(0);
				4221
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4222	e = p + PyUnicode_GET_SIZE(self);
				4223	for (; p < e; p++) {
				4224	if (!Py_UNICODE_ISSPACE(*p))
				4225	return PyInt_FromLong(0);
				4226	}
				4227	return PyInt_FromLong(1);
				4228	}
				4229
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4230	static char isalpha__doc__[] =
				4231	"S.isalpha() -> int\n\
				4232	\n\
				4233	Return 1 if all characters in S are alphabetic\n\
				4234	and there is at least one character in S, 0 otherwise.";
				4235
				4236	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4237	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4238	{
				4239	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4240	register const Py_UNICODE *e;
				4241
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4242	/* Shortcut for single character strings */
				4243	if (PyUnicode_GET_SIZE(self) == 1 &&
				4244	Py_UNICODE_ISALPHA(*p))
				4245	return PyInt_FromLong(1);
				4246
				4247	/* Special case for empty strings */
				4248	if (PyString_GET_SIZE(self) == 0)
				4249	return PyInt_FromLong(0);
				4250
				4251	e = p + PyUnicode_GET_SIZE(self);
				4252	for (; p < e; p++) {
				4253	if (!Py_UNICODE_ISALPHA(*p))
				4254	return PyInt_FromLong(0);
				4255	}
				4256	return PyInt_FromLong(1);
				4257	}
				4258
				4259	static char isalnum__doc__[] =
				4260	"S.isalnum() -> int\n\
				4261	\n\
				4262	Return 1 if all characters in S are alphanumeric\n\
				4263	and there is at least one character in S, 0 otherwise.";
				4264
				4265	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4266	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4267	{
				4268	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4269	register const Py_UNICODE *e;
				4270
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4271	/* Shortcut for single character strings */
				4272	if (PyUnicode_GET_SIZE(self) == 1 &&
				4273	Py_UNICODE_ISALNUM(*p))
				4274	return PyInt_FromLong(1);
				4275
				4276	/* Special case for empty strings */
				4277	if (PyString_GET_SIZE(self) == 0)
				4278	return PyInt_FromLong(0);
				4279
				4280	e = p + PyUnicode_GET_SIZE(self);
				4281	for (; p < e; p++) {
				4282	if (!Py_UNICODE_ISALNUM(*p))
				4283	return PyInt_FromLong(0);
				4284	}
				4285	return PyInt_FromLong(1);
				4286	}
				4287
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4288	static char isdecimal__doc__[] =
				4289	"S.isdecimal() -> int\n\
				4290	\n\
				4291	Return 1 if there are only decimal characters in S,\n\
				4292	0 otherwise.";
				4293
				4294	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4295	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4296	{
				4297	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4298	register const Py_UNICODE *e;
				4299
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4300	/* Shortcut for single character strings */
				4301	if (PyUnicode_GET_SIZE(self) == 1 &&
				4302	Py_UNICODE_ISDECIMAL(*p))
				4303	return PyInt_FromLong(1);
				4304
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4305	/* Special case for empty strings */
				4306	if (PyString_GET_SIZE(self) == 0)
				4307	return PyInt_FromLong(0);
				4308
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4309	e = p + PyUnicode_GET_SIZE(self);
				4310	for (; p < e; p++) {
				4311	if (!Py_UNICODE_ISDECIMAL(*p))
				4312	return PyInt_FromLong(0);
				4313	}
				4314	return PyInt_FromLong(1);
				4315	}
				4316
				4317	static char isdigit__doc__[] =
				4318	"S.isdigit() -> int\n\
				4319	\n\
				4320	Return 1 if there are only digit characters in S,\n\
				4321	0 otherwise.";
				4322
				4323	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4324	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4325	{
				4326	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4327	register const Py_UNICODE *e;
				4328
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4329	/* Shortcut for single character strings */
				4330	if (PyUnicode_GET_SIZE(self) == 1 &&
				4331	Py_UNICODE_ISDIGIT(*p))
				4332	return PyInt_FromLong(1);
				4333
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4334	/* Special case for empty strings */
				4335	if (PyString_GET_SIZE(self) == 0)
				4336	return PyInt_FromLong(0);
				4337
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4338	e = p + PyUnicode_GET_SIZE(self);
				4339	for (; p < e; p++) {
				4340	if (!Py_UNICODE_ISDIGIT(*p))
				4341	return PyInt_FromLong(0);
				4342	}
				4343	return PyInt_FromLong(1);
				4344	}
				4345
				4346	static char isnumeric__doc__[] =
				4347	"S.isnumeric() -> int\n\
				4348	\n\
				4349	Return 1 if there are only numeric characters in S,\n\
				4350	0 otherwise.";
				4351
				4352	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4353	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4354	{
				4355	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4356	register const Py_UNICODE *e;
				4357
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4358	/* Shortcut for single character strings */
				4359	if (PyUnicode_GET_SIZE(self) == 1 &&
				4360	Py_UNICODE_ISNUMERIC(*p))
				4361	return PyInt_FromLong(1);
				4362
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4363	/* Special case for empty strings */
				4364	if (PyString_GET_SIZE(self) == 0)
				4365	return PyInt_FromLong(0);
				4366
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4367	e = p + PyUnicode_GET_SIZE(self);
				4368	for (; p < e; p++) {
				4369	if (!Py_UNICODE_ISNUMERIC(*p))
				4370	return PyInt_FromLong(0);
				4371	}
				4372	return PyInt_FromLong(1);
				4373	}
				4374
				4375	static char join__doc__[] =
				4376	"S.join(sequence) -> unicode\n\
				4377	\n\
				4378	Return a string which is the concatenation of the strings in the\n\
				4379	sequence. The separator between elements is S.";
				4380
				4381	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4382	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4383	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4384	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4385	}
				4386
				4387	static int
				4388	unicode_length(PyUnicodeObject *self)
				4389	{
				4390	return self->length;
				4391	}
				4392
				4393	static char ljust__doc__[] =
				4394	"S.ljust(width) -> unicode\n\
				4395	\n\
				4396	Return S left justified in a Unicode string of length width. Padding is\n\
				4397	done using spaces.";
				4398
				4399	static PyObject *
				4400	unicode_ljust(PyUnicodeObject self, PyObject args)
				4401	{
				4402	int width;
				4403	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4404	return NULL;
				4405
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4406	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4407	Py_INCREF(self);
				4408	return (PyObject*) self;
				4409	}
				4410
				4411	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4412	}
				4413
				4414	static char lower__doc__[] =
				4415	"S.lower() -> unicode\n\
				4416	\n\
				4417	Return a copy of the string S converted to lowercase.";
				4418
				4419	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4420	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4421	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4422	return fixup(self, fixlower);
				4423	}
				4424
				4425	static char lstrip__doc__[] =
				4426	"S.lstrip() -> unicode\n\
				4427	\n\
				4428	Return a copy of the string S with leading whitespace removed.";
				4429
				4430	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4431	unicode_lstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4432	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4433	return strip(self, 1, 0);
				4434	}
				4435
				4436	static PyObject*
				4437	unicode_repeat(PyUnicodeObject *str, int len)
				4438	{
				4439	PyUnicodeObject *u;
				4440	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4441	int nchars;
				4442	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4443
				4444	if (len < 0)
				4445	len = 0;
				4446
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4447	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4448	/* no repeat, return original string */
				4449	Py_INCREF(str);
				4450	return (PyObject*) str;
				4451	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4452
				4453	/* ensure # of chars needed doesn't overflow int and # of bytes
				4454	* needed doesn't overflow size_t
				4455	*/
				4456	nchars = len * str->length;
				4457	if (len && nchars / len != str->length) {
				4458	PyErr_SetString(PyExc_OverflowError,
				4459	"repeated string is too long");
				4460	return NULL;
				4461	}
				4462	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4463	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4464	PyErr_SetString(PyExc_OverflowError,
				4465	"repeated string is too long");
				4466	return NULL;
				4467	}
				4468	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4469	if (!u)
				4470	return NULL;
				4471
				4472	p = u->str;
				4473
				4474	while (len-- > 0) {
				4475	Py_UNICODE_COPY(p, str->str, str->length);
				4476	p += str->length;
				4477	}
				4478
				4479	return (PyObject*) u;
				4480	}
				4481
				4482	PyObject PyUnicode_Replace(PyObject obj,
				4483	PyObject *subobj,
				4484	PyObject *replobj,
				4485	int maxcount)
				4486	{
				4487	PyObject *self;
				4488	PyObject *str1;
				4489	PyObject *str2;
				4490	PyObject *result;
				4491
				4492	self = PyUnicode_FromObject(obj);
				4493	if (self == NULL)
				4494	return NULL;
				4495	str1 = PyUnicode_FromObject(subobj);
				4496	if (str1 == NULL) {
				4497	Py_DECREF(self);
				4498	return NULL;
				4499	}
				4500	str2 = PyUnicode_FromObject(replobj);
				4501	if (str2 == NULL) {
				4502	Py_DECREF(self);
				4503	Py_DECREF(str1);
				4504	return NULL;
				4505	}
				4506	result = replace((PyUnicodeObject *)self,
				4507	(PyUnicodeObject *)str1,
				4508	(PyUnicodeObject *)str2,
				4509	maxcount);
				4510	Py_DECREF(self);
				4511	Py_DECREF(str1);
				4512	Py_DECREF(str2);
				4513	return result;
				4514	}
				4515
				4516	static char replace__doc__[] =
				4517	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4518	\n\
				4519	Return a copy of S with all occurrences of substring\n\
				4520	old replaced by new. If the optional argument maxsplit is\n\
				4521	given, only the first maxsplit occurrences are replaced.";
				4522
				4523	static PyObject*
				4524	unicode_replace(PyUnicodeObject self, PyObject args)
				4525	{
				4526	PyUnicodeObject *str1;
				4527	PyUnicodeObject *str2;
				4528	int maxcount = -1;
				4529	PyObject *result;
				4530
				4531	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4532	return NULL;
				4533	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4534	if (str1 == NULL)
				4535	return NULL;
				4536	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4537	if (str2 == NULL)
				4538	return NULL;
				4539
				4540	result = replace(self, str1, str2, maxcount);
				4541
				4542	Py_DECREF(str1);
				4543	Py_DECREF(str2);
				4544	return result;
				4545	}
				4546
				4547	static
				4548	PyObject unicode_repr(PyObject unicode)
				4549	{
				4550	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4551	PyUnicode_GET_SIZE(unicode),
				4552	1);
				4553	}
				4554
				4555	static char rfind__doc__[] =
				4556	"S.rfind(sub [,start [,end]]) -> int\n\
				4557	\n\
				4558	Return the highest index in S where substring sub is found,\n\
				4559	such that sub is contained within s[start,end]. Optional\n\
				4560	arguments start and end are interpreted as in slice notation.\n\
				4561	\n\
				4562	Return -1 on failure.";
				4563
				4564	static PyObject *
				4565	unicode_rfind(PyUnicodeObject self, PyObject args)
				4566	{
				4567	PyUnicodeObject *substring;
				4568	int start = 0;
				4569	int end = INT_MAX;
				4570	PyObject *result;
				4571
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4572	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4573	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4574	return NULL;
				4575	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4576	(PyObject *)substring);
				4577	if (substring == NULL)
				4578	return NULL;
				4579
				4580	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4581
				4582	Py_DECREF(substring);
				4583	return result;
				4584	}
				4585
				4586	static char rindex__doc__[] =
				4587	"S.rindex(sub [,start [,end]]) -> int\n\
				4588	\n\
				4589	Like S.rfind() but raise ValueError when the substring is not found.";
				4590
				4591	static PyObject *
				4592	unicode_rindex(PyUnicodeObject self, PyObject args)
				4593	{
				4594	int result;
				4595	PyUnicodeObject *substring;
				4596	int start = 0;
				4597	int end = INT_MAX;
				4598
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4599	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4600	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4601	return NULL;
				4602	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4603	(PyObject *)substring);
				4604	if (substring == NULL)
				4605	return NULL;
				4606
				4607	result = findstring(self, substring, start, end, -1);
				4608
				4609	Py_DECREF(substring);
				4610	if (result < 0) {
				4611	PyErr_SetString(PyExc_ValueError, "substring not found");
				4612	return NULL;
				4613	}
				4614	return PyInt_FromLong(result);
				4615	}
				4616
				4617	static char rjust__doc__[] =
				4618	"S.rjust(width) -> unicode\n\
				4619	\n\
				4620	Return S right justified in a Unicode string of length width. Padding is\n\
				4621	done using spaces.";
				4622
				4623	static PyObject *
				4624	unicode_rjust(PyUnicodeObject self, PyObject args)
				4625	{
				4626	int width;
				4627	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4628	return NULL;
				4629
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4630	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4631	Py_INCREF(self);
				4632	return (PyObject*) self;
				4633	}
				4634
				4635	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4636	}
				4637
				4638	static char rstrip__doc__[] =
				4639	"S.rstrip() -> unicode\n\
				4640	\n\
				4641	Return a copy of the string S with trailing whitespace removed.";
				4642
				4643	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4644	unicode_rstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4645	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4646	return strip(self, 0, 1);
				4647	}
				4648
				4649	static PyObject*
				4650	unicode_slice(PyUnicodeObject *self, int start, int end)
				4651	{
				4652	/* standard clamping */
				4653	if (start < 0)
				4654	start = 0;
				4655	if (end < 0)
				4656	end = 0;
				4657	if (end > self->length)
				4658	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4659	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4660	/* full slice, return original string */
				4661	Py_INCREF(self);
				4662	return (PyObject*) self;
				4663	}
				4664	if (start > end)
				4665	start = end;
				4666	/* copy slice */
				4667	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4668	end - start);
				4669	}
				4670
				4671	PyObject PyUnicode_Split(PyObject s,
				4672	PyObject *sep,
				4673	int maxsplit)
				4674	{
				4675	PyObject *result;
				4676
				4677	s = PyUnicode_FromObject(s);
				4678	if (s == NULL)
				4679	return NULL;
				4680	if (sep != NULL) {
				4681	sep = PyUnicode_FromObject(sep);
				4682	if (sep == NULL) {
				4683	Py_DECREF(s);
				4684	return NULL;
				4685	}
				4686	}
				4687
				4688	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4689
				4690	Py_DECREF(s);
				4691	Py_XDECREF(sep);
				4692	return result;
				4693	}
				4694
				4695	static char split__doc__[] =
				4696	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4697	\n\
				4698	Return a list of the words in S, using sep as the\n\
				4699	delimiter string. If maxsplit is given, at most maxsplit\n\
				4700	splits are done. If sep is not specified, any whitespace string\n\
				4701	is a separator.";
				4702
				4703	static PyObject*
				4704	unicode_split(PyUnicodeObject self, PyObject args)
				4705	{
				4706	PyObject *substring = Py_None;
				4707	int maxcount = -1;
				4708
				4709	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4710	return NULL;
				4711
				4712	if (substring == Py_None)
				4713	return split(self, NULL, maxcount);
				4714	else if (PyUnicode_Check(substring))
				4715	return split(self, (PyUnicodeObject *)substring, maxcount);
				4716	else
				4717	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4718	}
				4719
				4720	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4721	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4722	\n\
				4723	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4724	Line breaks are not included in the resulting list unless keepends\n\
				4725	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4726
				4727	static PyObject*
				4728	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4729	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4730	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4731
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4732	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4733	return NULL;
				4734
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4735	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4736	}
				4737
				4738	static
				4739	PyObject unicode_str(PyUnicodeObject self)
				4740	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4741	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4742	}
				4743
				4744	static char strip__doc__[] =
				4745	"S.strip() -> unicode\n\
				4746	\n\
				4747	Return a copy of S with leading and trailing whitespace removed.";
				4748
				4749	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4750	unicode_strip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4751	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4752	return strip(self, 1, 1);
				4753	}
				4754
				4755	static char swapcase__doc__[] =
				4756	"S.swapcase() -> unicode\n\
				4757	\n\
				4758	Return a copy of S with uppercase characters converted to lowercase\n\
				4759	and vice versa.";
				4760
				4761	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4762	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4763	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4764	return fixup(self, fixswapcase);
				4765	}
				4766
				4767	static char translate__doc__[] =
				4768	"S.translate(table) -> unicode\n\
				4769	\n\
				4770	Return a copy of the string S, where all characters have been mapped\n\
				4771	through the given translation table, which must be a mapping of\n\
				4772	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4773	are left untouched. Characters mapped to None are deleted.";
				4774
				4775	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4776	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4777	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4778	return PyUnicode_TranslateCharmap(self->str,
				4779	self->length,
				4780	table,
				4781	"ignore");
				4782	}
				4783
				4784	static char upper__doc__[] =
				4785	"S.upper() -> unicode\n\
				4786	\n\
				4787	Return a copy of S converted to uppercase.";
				4788
				4789	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4790	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4791	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4792	return fixup(self, fixupper);
				4793	}
				4794
				4795	#if 0
				4796	static char zfill__doc__[] =
				4797	"S.zfill(width) -> unicode\n\
				4798	\n\
				4799	Pad a numeric string x with zeros on the left, to fill a field\n\
				4800	of the specified width. The string x is never truncated.";
				4801
				4802	static PyObject *
				4803	unicode_zfill(PyUnicodeObject self, PyObject args)
				4804	{
				4805	int fill;
				4806	PyUnicodeObject *u;
				4807
				4808	int width;
				4809	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4810	return NULL;
				4811
				4812	if (self->length >= width) {
				4813	Py_INCREF(self);
				4814	return (PyObject*) self;
				4815	}
				4816
				4817	fill = width - self->length;
				4818
				4819	u = pad(self, fill, 0, '0');
				4820
				4821	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4822	/* move sign to beginning of string */
				4823	u->str[0] = u->str[fill];
				4824	u->str[fill] = '0';
				4825	}
				4826
				4827	return (PyObject*) u;
				4828	}
				4829	#endif
				4830
				4831	#if 0
				4832	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4833	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4834	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4835	return PyInt_FromLong(unicode_freelist_size);
				4836	}
				4837	#endif
				4838
				4839	static char startswith__doc__[] =
				4840	"S.startswith(prefix[, start[, end]]) -> int\n\
				4841	\n\
				4842	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4843	optional start, test S beginning at that position. With optional end, stop\n\
				4844	comparing S at that position.";
				4845
				4846	static PyObject *
				4847	unicode_startswith(PyUnicodeObject *self,
				4848	PyObject *args)
				4849	{
				4850	PyUnicodeObject *substring;
				4851	int start = 0;
				4852	int end = INT_MAX;
				4853	PyObject *result;
				4854
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4855	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4856	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4857	return NULL;
				4858	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4859	(PyObject *)substring);
				4860	if (substring == NULL)
				4861	return NULL;
				4862
				4863	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4864
				4865	Py_DECREF(substring);
				4866	return result;
				4867	}
				4868
				4869
				4870	static char endswith__doc__[] =
				4871	"S.endswith(suffix[, start[, end]]) -> int\n\
				4872	\n\
				4873	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4874	optional start, test S beginning at that position. With optional end, stop\n\
				4875	comparing S at that position.";
				4876
				4877	static PyObject *
				4878	unicode_endswith(PyUnicodeObject *self,
				4879	PyObject *args)
				4880	{
				4881	PyUnicodeObject *substring;
				4882	int start = 0;
				4883	int end = INT_MAX;
				4884	PyObject *result;
				4885
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4886	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4887	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4888	return NULL;
				4889	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4890	(PyObject *)substring);
				4891	if (substring == NULL)
				4892	return NULL;
				4893
				4894	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4895
				4896	Py_DECREF(substring);
				4897	return result;
				4898	}
				4899
				4900
				4901	static PyMethodDef unicode_methods[] = {
				4902
				4903	/* Order is according to common usage: often used methods should
				4904	appear first, since lookup is done sequentially. */
				4905
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4906	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				4907	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				4908	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				4909	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				4910	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				4911	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				4912	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				4913	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				4914	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				4915	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				4916	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				4917	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				4918	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
				4919	{"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
				4920	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				4921	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				4922	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				4923	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
				4924	{"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
				4925	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
				4926	{"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
				4927	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				4928	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				4929	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				4930	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				4931	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				4932	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				4933	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				4934	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				4935	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				4936	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				4937	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				4938	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				4939	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				4940	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4941	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4942	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
				4943	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4944	#endif
				4945
				4946	#if 0
				4947	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4948	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4949	#endif
				4950
				4951	{NULL, NULL}
				4952	};
				4953
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4954	static PySequenceMethods unicode_as_sequence = {
				4955	(inquiry) unicode_length, /* sq_length */
				4956	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4957	(intargfunc) unicode_repeat, /* sq_repeat */
				4958	(intargfunc) unicode_getitem, /* sq_item */
				4959	(intintargfunc) unicode_slice, /* sq_slice */
				4960	0, /* sq_ass_item */
				4961	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4962	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4963	};
				4964
				4965	static int
				4966	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4967	int index,
				4968	const void **ptr)
				4969	{
				4970	if (index != 0) {
				4971	PyErr_SetString(PyExc_SystemError,
				4972	"accessing non-existent unicode segment");
				4973	return -1;
				4974	}
				4975	ptr = (void ) self->str;
				4976	return PyUnicode_GET_DATA_SIZE(self);
				4977	}
				4978
				4979	static int
				4980	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4981	const void **ptr)
				4982	{
				4983	PyErr_SetString(PyExc_TypeError,
				4984	"cannot use unicode as modifyable buffer");
				4985	return -1;
				4986	}
				4987
				4988	static int
				4989	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4990	int *lenp)
				4991	{
				4992	if (lenp)
				4993	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4994	return 1;
				4995	}
				4996
				4997	static int
				4998	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4999	int index,
				5000	const void **ptr)
				5001	{
				5002	PyObject *str;
				5003
				5004	if (index != 0) {
				5005	PyErr_SetString(PyExc_SystemError,
				5006	"accessing non-existent unicode segment");
				5007	return -1;
				5008	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5009	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5010	if (str == NULL)
				5011	return -1;
				5012	ptr = (void ) PyString_AS_STRING(str);
				5013	return PyString_GET_SIZE(str);
				5014	}
				5015
				5016	/* Helpers for PyUnicode_Format() */
				5017
				5018	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5019	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5020	{
				5021	int argidx = *p_argidx;
				5022	if (argidx < arglen) {
				5023	(*p_argidx)++;
				5024	if (arglen < 0)
				5025	return args;
				5026	else
				5027	return PyTuple_GetItem(args, argidx);
				5028	}
				5029	PyErr_SetString(PyExc_TypeError,
				5030	"not enough arguments for format string");
				5031	return NULL;
				5032	}
				5033
				5034	#define F_LJUST (1<<0)
				5035	#define F_SIGN (1<<1)
				5036	#define F_BLANK (1<<2)
				5037	#define F_ALT (1<<3)
				5038	#define F_ZERO (1<<4)
				5039
				5040	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5041	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5042	{
				5043	register int i;
				5044	int len;
				5045	va_list va;
				5046	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5047	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5048
				5049	/* First, format the string as char array, then expand to Py_UNICODE
				5050	array. */
				5051	charbuffer = (char *)buffer;
				5052	len = vsprintf(charbuffer, format, va);
				5053	for (i = len - 1; i >= 0; i--)
				5054	buffer[i] = (Py_UNICODE) charbuffer[i];
				5055
				5056	va_end(va);
				5057	return len;
				5058	}
				5059
				5060	static int
				5061	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5062	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5063	int flags,
				5064	int prec,
				5065	int type,
				5066	PyObject *v)
				5067	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5068	/* fmt = '%#.' + `prec` + `type`
				5069	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5070	char fmt[20];
				5071	double x;
				5072
				5073	x = PyFloat_AsDouble(v);
				5074	if (x == -1.0 && PyErr_Occurred())
				5075	return -1;
				5076	if (prec < 0)
				5077	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5078	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5079	type = 'g';
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5080	PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
				5081	(flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5082	/* worst case length calc to ensure no buffer overrun:
				5083	fmt = %#.<prec>g
				5084	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5085	for any double rep.)
				5086	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5087	If prec=0 the effective precision is 1 (the leading digit is
				5088	always given), therefore increase by one to 10+prec. */
				5089	if (buflen <= (size_t)10 + (size_t)prec) {
				5090	PyErr_SetString(PyExc_OverflowError,
				5091	"formatted float is too long (precision too long?)");
				5092	return -1;
				5093	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5094	return usprintf(buf, fmt, x);
				5095	}
				5096
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5097	static PyObject*
				5098	formatlong(PyObject *val, int flags, int prec, int type)
				5099	{
				5100	char *buf;
				5101	int i, len;
				5102	PyObject str; / temporary string object. */
				5103	PyUnicodeObject *result;
				5104
				5105	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5106	if (!str)
				5107	return NULL;
				5108	result = _PyUnicode_New(len);
				5109	for (i = 0; i < len; i++)
				5110	result->str[i] = buf[i];
				5111	result->str[len] = 0;
				5112	Py_DECREF(str);
				5113	return (PyObject*)result;
				5114	}
				5115
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5116	static int
				5117	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5118	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5119	int flags,
				5120	int prec,
				5121	int type,
				5122	PyObject *v)
				5123	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5124	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5125	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5126	+ 1 + 1 = 24*/
				5127	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5128	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5129	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5130
				5131	x = PyInt_AsLong(v);
				5132	if (x == -1 && PyErr_Occurred())
				5133	return -1;
				5134	if (prec < 0)
				5135	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5136	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				5137	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				5138	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				5139	PyErr_SetString(PyExc_OverflowError,
				5140	"formatted integer is too long (precision too long?)");
				5141	return -1;
				5142	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5143	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				5144	* but we want it (for consistency with other %#x conversions, and
				5145	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5146	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				5147	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				5148	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5149	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5150	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				5151	/* Only way to know what the platform does is to try it. */
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5152	PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5153	if (fmt[1] != (char)type) {
				5154	/* Supply our own leading 0x/0X -- needed under std C */
				5155	use_native_c_format = 0;
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5156	PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5157	}
				5158	}
				5159	if (use_native_c_format)
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5160	PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
				5161	(flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5162	return usprintf(buf, fmt, x);
				5163	}
				5164
				5165	static int
				5166	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5167	size_t buflen,
				5168	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5169	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5170	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5171	if (PyUnicode_Check(v)) {
				5172	if (PyUnicode_GET_SIZE(v) != 1)
				5173	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5174	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5175	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5176
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5177	else if (PyString_Check(v)) {
				5178	if (PyString_GET_SIZE(v) != 1)
				5179	goto onError;
				5180	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5181	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5182
				5183	else {
				5184	/* Integer input truncated to a character */
				5185	long x;
				5186	x = PyInt_AsLong(v);
				5187	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5188	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5189	buf[0] = (char) x;
				5190	}
				5191	buf[1] = '\0';
				5192	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5193
				5194	onError:
				5195	PyErr_SetString(PyExc_TypeError,
				5196	"%c requires int or char");
				5197	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5198	}
				5199
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
				5209
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5210	PyObject PyUnicode_Format(PyObject format,
				5211	PyObject *args)
				5212	{
				5213	Py_UNICODE fmt, res;
				5214	int fmtcnt, rescnt, reslen, arglen, argidx;
				5215	int args_owned = 0;
				5216	PyUnicodeObject *result = NULL;
				5217	PyObject *dict = NULL;
				5218	PyObject *uformat;
				5219
				5220	if (format == NULL \|\| args == NULL) {
				5221	PyErr_BadInternalCall();
				5222	return NULL;
				5223	}
				5224	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5225	if (uformat == NULL)
				5226	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5227	fmt = PyUnicode_AS_UNICODE(uformat);
				5228	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5229
				5230	reslen = rescnt = fmtcnt + 100;
				5231	result = _PyUnicode_New(reslen);
				5232	if (result == NULL)
				5233	goto onError;
				5234	res = PyUnicode_AS_UNICODE(result);
				5235
				5236	if (PyTuple_Check(args)) {
				5237	arglen = PyTuple_Size(args);
				5238	argidx = 0;
				5239	}
				5240	else {
				5241	arglen = -1;
				5242	argidx = -2;
				5243	}
				5244	if (args->ob_type->tp_as_mapping)
				5245	dict = args;
				5246
				5247	while (--fmtcnt >= 0) {
				5248	if (*fmt != '%') {
				5249	if (--rescnt < 0) {
				5250	rescnt = fmtcnt + 100;
				5251	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5252	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5253	return NULL;
				5254	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5255	--rescnt;
				5256	}
				5257	res++ = fmt++;
				5258	}
				5259	else {
				5260	/* Got a format specifier */
				5261	int flags = 0;
				5262	int width = -1;
				5263	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5264	Py_UNICODE c = '\0';
				5265	Py_UNICODE fill;
				5266	PyObject *v = NULL;
				5267	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5268	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5269	Py_UNICODE sign;
				5270	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5271	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5272
				5273	fmt++;
				5274	if (*fmt == '(') {
				5275	Py_UNICODE *keystart;
				5276	int keylen;
				5277	PyObject *key;
				5278	int pcount = 1;
				5279
				5280	if (dict == NULL) {
				5281	PyErr_SetString(PyExc_TypeError,
				5282	"format requires a mapping");
				5283	goto onError;
				5284	}
				5285	++fmt;
				5286	--fmtcnt;
				5287	keystart = fmt;
				5288	/* Skip over balanced parentheses */
				5289	while (pcount > 0 && --fmtcnt >= 0) {
				5290	if (*fmt == ')')
				5291	--pcount;
				5292	else if (*fmt == '(')
				5293	++pcount;
				5294	fmt++;
				5295	}
				5296	keylen = fmt - keystart - 1;
				5297	if (fmtcnt < 0 \|\| pcount > 0) {
				5298	PyErr_SetString(PyExc_ValueError,
				5299	"incomplete format key");
				5300	goto onError;
				5301	}
Marc-André Lemburg	72f8213	2001-11-20 15:18:49 +0000	[diff] [blame]	5302	#if 0
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5303	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5304	then looked up since Python uses strings to hold
				5305	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5306	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5307	key = PyUnicode_EncodeUTF8(keystart,
				5308	keylen,
				5309	NULL);
Marc-André Lemburg	72f8213	2001-11-20 15:18:49 +0000	[diff] [blame]	5310	#else
				5311	key = PyUnicode_FromUnicode(keystart, keylen);
				5312	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5313	if (key == NULL)
				5314	goto onError;
				5315	if (args_owned) {
				5316	Py_DECREF(args);
				5317	args_owned = 0;
				5318	}
				5319	args = PyObject_GetItem(dict, key);
				5320	Py_DECREF(key);
				5321	if (args == NULL) {
				5322	goto onError;
				5323	}
				5324	args_owned = 1;
				5325	arglen = -1;
				5326	argidx = -2;
				5327	}
				5328	while (--fmtcnt >= 0) {
				5329	switch (c = *fmt++) {
				5330	case '-': flags \|= F_LJUST; continue;
				5331	case '+': flags \|= F_SIGN; continue;
				5332	case ' ': flags \|= F_BLANK; continue;
				5333	case '#': flags \|= F_ALT; continue;
				5334	case '0': flags \|= F_ZERO; continue;
				5335	}
				5336	break;
				5337	}
				5338	if (c == '*') {
				5339	v = getnextarg(args, arglen, &argidx);
				5340	if (v == NULL)
				5341	goto onError;
				5342	if (!PyInt_Check(v)) {
				5343	PyErr_SetString(PyExc_TypeError,
				5344	"* wants int");
				5345	goto onError;
				5346	}
				5347	width = PyInt_AsLong(v);
				5348	if (width < 0) {
				5349	flags \|= F_LJUST;
				5350	width = -width;
				5351	}
				5352	if (--fmtcnt >= 0)
				5353	c = *fmt++;
				5354	}
				5355	else if (c >= '0' && c <= '9') {
				5356	width = c - '0';
				5357	while (--fmtcnt >= 0) {
				5358	c = *fmt++;
				5359	if (c < '0' \|\| c > '9')
				5360	break;
				5361	if ((width*10) / 10 != width) {
				5362	PyErr_SetString(PyExc_ValueError,
				5363	"width too big");
				5364	goto onError;
				5365	}
				5366	width = width*10 + (c - '0');
				5367	}
				5368	}
				5369	if (c == '.') {
				5370	prec = 0;
				5371	if (--fmtcnt >= 0)
				5372	c = *fmt++;
				5373	if (c == '*') {
				5374	v = getnextarg(args, arglen, &argidx);
				5375	if (v == NULL)
				5376	goto onError;
				5377	if (!PyInt_Check(v)) {
				5378	PyErr_SetString(PyExc_TypeError,
				5379	"* wants int");
				5380	goto onError;
				5381	}
				5382	prec = PyInt_AsLong(v);
				5383	if (prec < 0)
				5384	prec = 0;
				5385	if (--fmtcnt >= 0)
				5386	c = *fmt++;
				5387	}
				5388	else if (c >= '0' && c <= '9') {
				5389	prec = c - '0';
				5390	while (--fmtcnt >= 0) {
				5391	c = Py_CHARMASK(*fmt++);
				5392	if (c < '0' \|\| c > '9')
				5393	break;
				5394	if ((prec*10) / 10 != prec) {
				5395	PyErr_SetString(PyExc_ValueError,
				5396	"prec too big");
				5397	goto onError;
				5398	}
				5399	prec = prec*10 + (c - '0');
				5400	}
				5401	}
				5402	} /* prec */
				5403	if (fmtcnt >= 0) {
				5404	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5405	if (--fmtcnt >= 0)
				5406	c = *fmt++;
				5407	}
				5408	}
				5409	if (fmtcnt < 0) {
				5410	PyErr_SetString(PyExc_ValueError,
				5411	"incomplete format");
				5412	goto onError;
				5413	}
				5414	if (c != '%') {
				5415	v = getnextarg(args, arglen, &argidx);
				5416	if (v == NULL)
				5417	goto onError;
				5418	}
				5419	sign = 0;
				5420	fill = ' ';
				5421	switch (c) {
				5422
				5423	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5424	pbuf = formatbuf;
				5425	/* presume that buffer length is at least 1 */
				5426	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5427	len = 1;
				5428	break;
				5429
				5430	case 's':
				5431	case 'r':
				5432	if (PyUnicode_Check(v) && c == 's') {
				5433	temp = v;
				5434	Py_INCREF(temp);
				5435	}
				5436	else {
				5437	PyObject *unicode;
				5438	if (c == 's')
				5439	temp = PyObject_Str(v);
				5440	else
				5441	temp = PyObject_Repr(v);
				5442	if (temp == NULL)
				5443	goto onError;
				5444	if (!PyString_Check(temp)) {
				5445	/* XXX Note: this should never happen, since
				5446	PyObject_Repr() and PyObject_Str() assure
				5447	this */
				5448	Py_DECREF(temp);
				5449	PyErr_SetString(PyExc_TypeError,
				5450	"%s argument has non-string str()");
				5451	goto onError;
				5452	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5453	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5454	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5455	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5456	"strict");
				5457	Py_DECREF(temp);
				5458	temp = unicode;
				5459	if (temp == NULL)
				5460	goto onError;
				5461	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5462	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5463	len = PyUnicode_GET_SIZE(temp);
				5464	if (prec >= 0 && len > prec)
				5465	len = prec;
				5466	break;
				5467
				5468	case 'i':
				5469	case 'd':
				5470	case 'u':
				5471	case 'o':
				5472	case 'x':
				5473	case 'X':
				5474	if (c == 'i')
				5475	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5476	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5477	temp = formatlong(v, flags, prec, c);
				5478	if (!temp)
				5479	goto onError;
				5480	pbuf = PyUnicode_AS_UNICODE(temp);
				5481	len = PyUnicode_GET_SIZE(temp);
				5482	/* unbounded ints can always produce
				5483	a sign character! */
				5484	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5485	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5486	else {
				5487	pbuf = formatbuf;
				5488	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5489	flags, prec, c, v);
				5490	if (len < 0)
				5491	goto onError;
				5492	/* only d conversion is signed */
				5493	sign = c == 'd';
				5494	}
				5495	if (flags & F_ZERO)
				5496	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5497	break;
				5498
				5499	case 'e':
				5500	case 'E':
				5501	case 'f':
				5502	case 'g':
				5503	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5504	pbuf = formatbuf;
				5505	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5506	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5507	if (len < 0)
				5508	goto onError;
				5509	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5510	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5511	fill = '0';
				5512	break;
				5513
				5514	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5515	pbuf = formatbuf;
				5516	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5517	if (len < 0)
				5518	goto onError;
				5519	break;
				5520
				5521	default:
				5522	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5523	"unsupported format character '%c' (0x%x) "
				5524	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5525	(31<=c && c<=126) ? c : '?',
				5526	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5527	goto onError;
				5528	}
				5529	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5530	if (pbuf == '-' \|\| pbuf == '+') {
				5531	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5532	len--;
				5533	}
				5534	else if (flags & F_SIGN)
				5535	sign = '+';
				5536	else if (flags & F_BLANK)
				5537	sign = ' ';
				5538	else
				5539	sign = 0;
				5540	}
				5541	if (width < len)
				5542	width = len;
				5543	if (rescnt < width + (sign != 0)) {
				5544	reslen -= rescnt;
				5545	rescnt = width + fmtcnt + 100;
				5546	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5547	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5548	return NULL;
				5549	res = PyUnicode_AS_UNICODE(result)
				5550	+ reslen - rescnt;
				5551	}
				5552	if (sign) {
				5553	if (fill != ' ')
				5554	*res++ = sign;
				5555	rescnt--;
				5556	if (width > len)
				5557	width--;
				5558	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5559	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5560	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5561	assert(pbuf[1] == c);
				5562	if (fill != ' ') {
				5563	res++ = pbuf++;
				5564	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5565	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5566	rescnt -= 2;
				5567	width -= 2;
				5568	if (width < 0)
				5569	width = 0;
				5570	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5571	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5572	if (width > len && !(flags & F_LJUST)) {
				5573	do {
				5574	--rescnt;
				5575	*res++ = fill;
				5576	} while (--width > len);
				5577	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5578	if (fill == ' ') {
				5579	if (sign)
				5580	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5581	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5582	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5583	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5584	res++ = pbuf++;
				5585	res++ = pbuf++;
				5586	}
				5587	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5588	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5589	res += len;
				5590	rescnt -= len;
				5591	while (--width >= len) {
				5592	--rescnt;
				5593	*res++ = ' ';
				5594	}
				5595	if (dict && (argidx < arglen) && c != '%') {
				5596	PyErr_SetString(PyExc_TypeError,
				5597	"not all arguments converted");
				5598	goto onError;
				5599	}
				5600	Py_XDECREF(temp);
				5601	} /* '%' */
				5602	} /* until end */
				5603	if (argidx < arglen && !dict) {
				5604	PyErr_SetString(PyExc_TypeError,
				5605	"not all arguments converted");
				5606	goto onError;
				5607	}
				5608
				5609	if (args_owned) {
				5610	Py_DECREF(args);
				5611	}
				5612	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5613	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5614	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5615	return (PyObject *)result;
				5616
				5617	onError:
				5618	Py_XDECREF(result);
				5619	Py_DECREF(uformat);
				5620	if (args_owned) {
				5621	Py_DECREF(args);
				5622	}
				5623	return NULL;
				5624	}
				5625
				5626	static PyBufferProcs unicode_as_buffer = {
				5627	(getreadbufferproc) unicode_buffer_getreadbuf,
				5628	(getwritebufferproc) unicode_buffer_getwritebuf,
				5629	(getsegcountproc) unicode_buffer_getsegcount,
				5630	(getcharbufferproc) unicode_buffer_getcharbuf,
				5631	};
				5632
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5633	staticforward PyObject *
				5634	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5635
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5636	static PyObject *
				5637	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5638	{
				5639	PyObject *x = NULL;
				5640	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5641	char *encoding = NULL;
				5642	char *errors = NULL;
				5643
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5644	if (type != &PyUnicode_Type)
				5645	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5646	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5647	kwlist, &x, &encoding, &errors))
				5648	return NULL;
				5649	if (x == NULL)
				5650	return (PyObject *)_PyUnicode_New(0);
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	5651	if (encoding == NULL && errors == NULL)
				5652	return PyObject_Unicode(x);
				5653	else
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5654	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5655	}
				5656
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5657	static PyObject *
				5658	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5659	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5660	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5661	int n;
				5662
				5663	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5664	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5665	if (tmp == NULL)
				5666	return NULL;
				5667	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5668	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5669	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5670	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5671	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5672	if (pnew->str == NULL) {
				5673	_Py_ForgetReference((PyObject *)pnew);
				5674	PyObject_DEL(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5675	return NULL;
				5676	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5677	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5678	pnew->length = n;
				5679	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5680	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5681	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5682	}
				5683
/* Docstring of the unicode type (installed as tp_doc). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
				5690
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5691	PyTypeObject PyUnicode_Type = {
				5692	PyObject_HEAD_INIT(&PyType_Type)
				5693	0, /* ob_size */
				5694	"unicode", /* tp_name */
				5695	sizeof(PyUnicodeObject), /* tp_size */
				5696	0, /* tp_itemsize */
				5697	/* Slots */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5698	(destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5699	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5700	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5701	0, /* tp_setattr */
				5702	(cmpfunc) unicode_compare, /* tp_compare */
				5703	(reprfunc) unicode_repr, /* tp_repr */
				5704	0, /* tp_as_number */
				5705	&unicode_as_sequence, /* tp_as_sequence */
				5706	0, /* tp_as_mapping */
				5707	(hashfunc) unicode_hash, /* tp_hash*/
				5708	0, /* tp_call*/
				5709	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5710	PyObject_GenericGetAttr, /* tp_getattro */
				5711	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5712	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5713	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5714	unicode_doc, /* tp_doc */
				5715	0, /* tp_traverse */
				5716	0, /* tp_clear */
				5717	0, /* tp_richcompare */
				5718	0, /* tp_weaklistoffset */
				5719	0, /* tp_iter */
				5720	0, /* tp_iternext */
				5721	unicode_methods, /* tp_methods */
				5722	0, /* tp_members */
				5723	0, /* tp_getset */
				5724	0, /* tp_base */
				5725	0, /* tp_dict */
				5726	0, /* tp_descr_get */
				5727	0, /* tp_descr_set */
				5728	0, /* tp_dictoffset */
				5729	0, /* tp_init */
				5730	0, /* tp_alloc */
				5731	unicode_new, /* tp_new */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5732	_PyObject_Del, /* tp_free */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5733	};
				5734
				5735	/* Initialize the Unicode implementation */
				5736
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5737	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5738	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5739	int i;
				5740
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5741	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5742	unicode_freelist = NULL;
				5743	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5744	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5745	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5746	for (i = 0; i < 256; i++)
				5747	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5748	}
				5749
				5750	/* Finalize the Unicode implementation */
				5751
				5752	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5753	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5754	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5755	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5756	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5757
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5758	Py_XDECREF(unicode_empty);
				5759	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5760
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5761	for (i = 0; i < 256; i++) {
				5762	if (unicode_latin1[i]) {
				5763	Py_DECREF(unicode_latin1[i]);
				5764	unicode_latin1[i] = NULL;
				5765	}
				5766	}
				5767
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5768	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5769	PyUnicodeObject *v = u;
				5770	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5771	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5772	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5773	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5774	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5775	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5776	unicode_freelist = NULL;
				5777	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5778	}