Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: a46df163f6f951cad2a249218812ae64a19343a8 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for the free list.

   Objects that are put on the free list with a length below this
   limit keep their character buffer allocated, which saves a malloc()
   round trip when small Unicode objects are recycled.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
				107	PyUnicode_GetMax()
				108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	401	const char *s;
				402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	405
				406	if (obj == NULL) {
				407	PyErr_BadInternalCall();
				408	return NULL;
				409	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410
				411	/* Coerce object */
				412	if (PyInstance_Check(obj)) {
				413	PyObject *func;
				414	func = PyObject_GetAttrString(obj, "__str__");
				415	if (func == NULL) {
				416	PyErr_SetString(PyExc_TypeError,
				417	"coercing to Unicode: instance doesn't define __str__");
				418	return NULL;
				419	}
				420	obj = PyEval_CallObject(func, NULL);
				421	Py_DECREF(func);
				422	if (obj == NULL)
				423	return NULL;
				424	owned = 1;
				425	}
				426	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	428	v = obj;
				429	if (encoding) {
				430	PyErr_SetString(PyExc_TypeError,
				431	"decoding Unicode is not supported");
				432	return NULL;
				433	}
				434	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	435	}
				436	else if (PyString_Check(obj)) {
				437	s = PyString_AS_STRING(obj);
				438	len = PyString_GET_SIZE(obj);
				439	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	440	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				441	/* Overwrite the error message with something more useful in
				442	case of a TypeError. */
				443	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	444	PyErr_Format(PyExc_TypeError,
				445	"coercing to Unicode: need string or buffer, "
				446	"%.80s found",
				447	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	448	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	449	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	450
				451	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452	if (len == 0) {
				453	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	454	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	455	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	456	else
				457	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	458
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	459	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	460	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	461	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	462	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	463	return v;
				464
				465	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	466	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	467	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	468	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
				471
				472	PyObject PyUnicode_Decode(const char s,
				473	int size,
				474	const char *encoding,
				475	const char *errors)
				476	{
				477	PyObject buffer = NULL, unicode;
				478
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	479	if (encoding == NULL)
				480	encoding = PyUnicode_GetDefaultEncoding();
				481
				482	/* Shortcuts for common default encodings */
				483	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	485	else if (strcmp(encoding, "latin-1") == 0)
				486	return PyUnicode_DecodeLatin1(s, size, errors);
				487	else if (strcmp(encoding, "ascii") == 0)
				488	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	489
				490	/* Decode via the codec registry */
				491	buffer = PyBuffer_FromMemory((void *)s, size);
				492	if (buffer == NULL)
				493	goto onError;
				494	unicode = PyCodec_Decode(buffer, encoding, errors);
				495	if (unicode == NULL)
				496	goto onError;
				497	if (!PyUnicode_Check(unicode)) {
				498	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	499	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	500	unicode->ob_type->tp_name);
				501	Py_DECREF(unicode);
				502	goto onError;
				503	}
				504	Py_DECREF(buffer);
				505	return unicode;
				506
				507	onError:
				508	Py_XDECREF(buffer);
				509	return NULL;
				510	}
				511
				512	PyObject PyUnicode_Encode(const Py_UNICODE s,
				513	int size,
				514	const char *encoding,
				515	const char *errors)
				516	{
				517	PyObject v, unicode;
				518
				519	unicode = PyUnicode_FromUnicode(s, size);
				520	if (unicode == NULL)
				521	return NULL;
				522	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				523	Py_DECREF(unicode);
				524	return v;
				525	}
				526
				527	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject *v;
				532
				533	if (!PyUnicode_Check(unicode)) {
				534	PyErr_BadArgument();
				535	goto onError;
				536	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	537
				538	if (encoding == NULL)
				539	encoding = PyUnicode_GetDefaultEncoding();
				540
				541	/* Shortcuts for common default encodings */
				542	if (errors == NULL) {
				543	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	544	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	545	else if (strcmp(encoding, "latin-1") == 0)
				546	return PyUnicode_AsLatin1String(unicode);
				547	else if (strcmp(encoding, "ascii") == 0)
				548	return PyUnicode_AsASCIIString(unicode);
				549	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	550
				551	/* Encode via the codec registry */
				552	v = PyCodec_Encode(unicode, encoding, errors);
				553	if (v == NULL)
				554	goto onError;
				555	/* XXX Should we really enforce this ? */
				556	if (!PyString_Check(v)) {
				557	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	558	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	v->ob_type->tp_name);
				560	Py_DECREF(v);
				561	goto onError;
				562	}
				563	return v;
				564
				565	onError:
				566	return NULL;
				567	}
				568
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	569	/* Return a Python string holding the default encoded value of the
				570	Unicode object.
				571
				572	The resulting string is cached in the Unicode object for subsequent
				573	usage by this function. The cached version is needed to implement
				574	the character buffer interface and will live (at least) as long as
				575	the Unicode object itself.
				576
				577	The refcount of the string is not incremented.
				578
				579	* Exported for internal use by the interpreter only !!! *
				580
				581	*/
				582
				583	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				584	const char *errors)
				585	{
				586	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				587
				588	if (v)
				589	return v;
				590	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				591	if (v && errors == NULL)
				592	((PyUnicodeObject *)unicode)->defenc = v;
				593	return v;
				594	}
				595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	596	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_AS_UNICODE(unicode);
				603
				604	onError:
				605	return NULL;
				606	}
				607
				608	int PyUnicode_GetSize(PyObject *unicode)
				609	{
				610	if (!PyUnicode_Check(unicode)) {
				611	PyErr_BadArgument();
				612	goto onError;
				613	}
				614	return PyUnicode_GET_SIZE(unicode);
				615
				616	onError:
				617	return -1;
				618	}
				619
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	620	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	621	{
				622	return unicode_default_encoding;
				623	}
				624
				625	int PyUnicode_SetDefaultEncoding(const char *encoding)
				626	{
				627	PyObject *v;
				628
				629	/* Make sure the encoding is valid. As side effect, this also
				630	loads the encoding into the codec registry cache. */
				631	v = _PyCodec_Lookup(encoding);
				632	if (v == NULL)
				633	goto onError;
				634	Py_DECREF(v);
				635	strncpy(unicode_default_encoding,
				636	encoding,
				637	sizeof(unicode_default_encoding));
				638	return 0;
				639
				640	onError:
				641	return -1;
				642	}
				643
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	644	/* --- UTF-8 Codec -------------------------------------------------------- */
				645
				646	static
				647	char utf8_code_length[256] = {
				648	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				649	illegal prefix. see RFC 2279 for details */
				650	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				651	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				652	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				653	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				654	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				655	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				656	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				657	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				658	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				659	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				660	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				661	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				662	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				663	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				664	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				665	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				666	};
				667
				668	static
				669	int utf8_decoding_error(const char **source,
				670	Py_UNICODE **dest,
				671	const char *errors,
				672	const char *details)
				673	{
				674	if ((errors == NULL) \|\|
				675	(strcmp(errors,"strict") == 0)) {
				676	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	677	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	678	details);
				679	return -1;
				680	}
				681	else if (strcmp(errors,"ignore") == 0) {
				682	(*source)++;
				683	return 0;
				684	}
				685	else if (strcmp(errors,"replace") == 0) {
				686	(*source)++;
				687	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				688	(*dest)++;
				689	return 0;
				690	}
				691	else {
				692	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	693	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	694	errors);
				695	return -1;
				696	}
				697	}
				698
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	PyObject PyUnicode_DecodeUTF8(const char s,
				700	int size,
				701	const char *errors)
				702	{
				703	int n;
				704	const char *e;
				705	PyUnicodeObject *unicode;
				706	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	708
				709	/* Note: size will always be longer than the resulting Unicode
				710	character count */
				711	unicode = _PyUnicode_New(size);
				712	if (!unicode)
				713	return NULL;
				714	if (size == 0)
				715	return (PyObject *)unicode;
				716
				717	/* Unpack UTF-8 encoded data */
				718	p = unicode->str;
				719	e = s + size;
				720
				721	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	722	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	723
				724	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	725	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	s++;
				727	continue;
				728	}
				729
				730	n = utf8_code_length[ch];
				731
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	732	if (s + n > e) {
				733	errmsg = "unexpected end of data";
				734	goto utf8Error;
				735	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	736
				737	switch (n) {
				738
				739	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	740	errmsg = "unexpected code byte";
				741	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	742
				743	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	errmsg = "internal error";
				745	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	746
				747	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	748	if ((s[1] & 0xc0) != 0x80) {
				749	errmsg = "invalid data";
				750	goto utf8Error;
				751	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	752	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	753	if (ch < 0x80) {
				754	errmsg = "illegal encoding";
				755	goto utf8Error;
				756	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	757	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	758	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	759	break;
				760
				761	case 3:
				762	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	763	(s[2] & 0xc0) != 0x80) {
				764	errmsg = "invalid data";
				765	goto utf8Error;
				766	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	767	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	768	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				769	errmsg = "illegal encoding";
				770	goto utf8Error;
				771	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	772	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	773	*p++ = (Py_UNICODE)ch;
				774	break;
				775
				776	case 4:
				777	if ((s[1] & 0xc0) != 0x80 \|\|
				778	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	779	(s[3] & 0xc0) != 0x80) {
				780	errmsg = "invalid data";
				781	goto utf8Error;
				782	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	783	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				784	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				785	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	786	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	787	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	788	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	789	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	790	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	791	errmsg = "illegal encoding";
				792	goto utf8Error;
				793	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	794	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	795	*p++ = (Py_UNICODE)ch;
				796	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	797	/* compute and append the two surrogates: */
				798
				799	/* translate from 10000..10FFFF to 0..FFFF */
				800	ch -= 0x10000;
				801
				802	/* high surrogate = top 10 bits added to D800 */
				803	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				804
				805	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	806	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	807	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	808	break;
				809
				810	default:
				811	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	812	errmsg = "unsupported Unicode code range";
				813	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	814	}
				815	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	816	continue;
				817
				818	utf8Error:
				819	if (utf8_decoding_error(&s, &p, errors, errmsg))
				820	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	821	}
				822
				823	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	824	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	825	goto onError;
				826
				827	return (PyObject *)unicode;
				828
				829	onError:
				830	Py_DECREF(unicode);
				831	return NULL;
				832	}
				833
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme named by errors to a UTF-8 encoding
   problem described by details.

   "strict" (or NULL) raises UnicodeError, "ignore" does nothing,
   "replace" appends '?' to *dest, anything else raises ValueError.

   Returns 0 when encoding may continue and -1 when an exception was
   set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* A missing scheme is treated like "strict" */
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	867
				868	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				869	int size,
				870	const char *errors)
				871	{
				872	PyObject *v;
				873	char *p;
				874	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	875	Py_UCS4 ch2;
				876	unsigned int cbAllocated = 3 * size;
				877	unsigned int cbWritten = 0;
				878	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	879
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	880	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	881	if (v == NULL)
				882	return NULL;
				883	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	884	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	885
				886	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	887	while (i < size) {
				888	Py_UCS4 ch = s[i++];
				889	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	891	cbWritten++;
				892	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	893	else if (ch < 0x0800) {
				894	*p++ = 0xc0 \| (ch >> 6);
				895	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	896	cbWritten += 2;
				897	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	898	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	899	/* Check for high surrogate */
				900	if (0xD800 <= ch && ch <= 0xDBFF) {
				901	if (i != size) {
				902	ch2 = s[i];
				903	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				904
				905	if (cbWritten >= (cbAllocated - 4)) {
				906	/* Provide enough room for some more
				907	surrogates */
				908	cbAllocated += 4*10;
				909	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	910	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	911	}
				912
				913	/* combine the two values */
				914	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				915
				916	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	917	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	918	i++;
				919	cbWritten += 4;
				920	}
				921	}
				922	}
				923	else {
				924	*p++ = (char)(0xe0 \| (ch >> 12));
				925	cbWritten += 3;
				926	}
				927	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				928	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	929	} else {
				930	*p++ = 0xf0 \| (ch>>18);
				931	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				932	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				933	*p++ = 0x80 \| (ch & 0x3f);
				934	cbWritten += 4;
				935	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	}
				937	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	938	if (_PyString_Resize(&v, p - q))
				939	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	940	return v;
				941
				942	onError:
				943	Py_DECREF(v);
				944	return NULL;
				945	}
				946
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	947	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				948	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	949	if (!PyUnicode_Check(unicode)) {
				950	PyErr_BadArgument();
				951	return NULL;
				952	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	953	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				954	PyUnicode_GET_SIZE(unicode),
				955	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	956	}
				957
				958	/* --- UTF-16 Codec ------------------------------------------------------- */
				959
				960	static
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	961	int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	962	Py_UNICODE **dest,
				963	const char *errors,
				964	const char *details)
				965	{
				966	if ((errors == NULL) \|\|
				967	(strcmp(errors,"strict") == 0)) {
				968	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	969	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	970	details);
				971	return -1;
				972	}
				973	else if (strcmp(errors,"ignore") == 0) {
				974	return 0;
				975	}
				976	else if (strcmp(errors,"replace") == 0) {
				977	if (dest) {
				978	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				979	(*dest)++;
				980	}
				981	return 0;
				982	}
				983	else {
				984	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	985	"UTF-16 decoding error; "
				986	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	987	errors);
				988	return -1;
				989	}
				990	}
				991
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	PyObject PyUnicode_DecodeUTF16(const char s,
				993	int size,
				994	const char *errors,
				995	int *byteorder)
				996	{
				997	PyUnicodeObject *unicode;
				998	Py_UNICODE *p;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	999	const Py_UCS2 q, e;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1000	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1001	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1002
				1003	/* size should be an even number */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1004	if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1005	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				1006	return NULL;
				1007	/* The remaining input chars are ignored if we fall through
				1008	here... */
				1009	}
				1010
				1011	/* Note: size will always be longer than the resulting Unicode
				1012	character count */
				1013	unicode = _PyUnicode_New(size);
				1014	if (!unicode)
				1015	return NULL;
				1016	if (size == 0)
				1017	return (PyObject *)unicode;
				1018
				1019	/* Unpack UTF-16 encoded data */
				1020	p = unicode->str;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1021	q = (Py_UCS2 *)s;
				1022	e = q + (size / sizeof(Py_UCS2));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023
				1024	if (byteorder)
				1025	bo = *byteorder;
				1026
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1027	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1028	byte order setting accordingly. In native mode, the leading BOM
				1029	mark is skipped, in all other modes, it is copied to the output
				1030	stream as-is (giving a ZWNBSP character). */
				1031	if (bo == 0) {
				1032	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1033	if (*q == 0xFEFF) {
				1034	q++;
				1035	bo = -1;
				1036	} else if (*q == 0xFFFE) {
				1037	q++;
				1038	bo = 1;
				1039	}
				1040	#else
				1041	if (*q == 0xFEFF) {
				1042	q++;
				1043	bo = 1;
				1044	} else if (*q == 0xFFFE) {
				1045	q++;
				1046	bo = -1;
				1047	}
				1048	#endif
				1049	}
				1050
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1051	while (q < e) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1052	register Py_UCS2 ch = *q++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1054	/* Swap input bytes if needed. (This assumes
				1055	sizeof(Py_UNICODE) == 2 !) */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1056	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	if (bo == 1)
				1058	ch = (ch >> 8) \| (ch << 8);
				1059	#else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	if (bo == -1)
				1061	ch = (ch >> 8) \| (ch << 8);
				1062	#endif
				1063	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1064	*p++ = ch;
				1065	continue;
				1066	}
				1067
				1068	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1069	if (q >= e) {
				1070	errmsg = "unexpected end of data";
				1071	goto utf16Error;
				1072	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1073	if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1074	Py_UCS2 ch2 = *q++;
				1075	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1076	if (bo == 1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1077	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1078	#else
				1079	if (bo == -1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1080	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1081	#endif
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1082	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1083	#ifndef Py_UNICODE_WIDE
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1084	/* This is valid data (a UTF-16 surrogate pair), but
				1085	we are not able to store this information since our
				1086	Py_UNICODE type only has 16 bits... this might
				1087	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1088	errmsg = "code pairs are not supported";
				1089	goto utf16Error;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1090	#else
				1091	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1092	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1093	#endif
				1094
				1095	}
				1096	else {
				1097	errmsg = "illegal UTF-16 surrogate";
				1098	goto utf16Error;
				1099	}
				1100
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1101	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1102	errmsg = "illegal encoding";
				1103	/* Fall through to report the error */
				1104
				1105	utf16Error:
				1106	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1107	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	}
				1109
				1110	if (byteorder)
				1111	*byteorder = bo;
				1112
				1113	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1114	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	goto onError;
				1116
				1117	return (PyObject *)unicode;
				1118
				1119	onError:
				1120	Py_DECREF(unicode);
				1121	return NULL;
				1122	}
				1123
				1124	#undef UTF16_ERROR
				1125
				1126	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1127	int size,
				1128	const char *errors,
				1129	int byteorder)
				1130	{
				1131	PyObject *v;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1132	Py_UCS2 *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1133	char *q;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1134	int i, pairs, doswap = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1135
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1136	for (i = pairs = 0; i < size; i++)
				1137	if (s[i] >= 0x10000)
				1138	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1139	v = PyString_FromStringAndSize(NULL,
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1140	sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1141	if (v == NULL)
				1142	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1143
				1144	q = PyString_AS_STRING(v);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1145	p = (Py_UCS2 *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1146	if (byteorder == 0)
				1147	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1148	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1149	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1150	if (byteorder == 0 \|\|
				1151	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1152	byteorder == -1
				1153	#else
				1154	byteorder == 1
				1155	#endif
				1156	)
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1157	doswap = 0;
				1158	while (size-- > 0) {
				1159	Py_UNICODE ch = *s++;
				1160	Py_UNICODE ch2 = 0;
				1161	if (ch >= 0x10000) {
				1162	ch2 = 0xDC00\|((ch-0x10000) & 0x3FF);
				1163	ch = 0xD800\|((ch-0x10000)>>10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1164	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1165	if (doswap){
				1166	*p++ = (ch >> 8) \| (ch << 8);
				1167	if (ch2)
				1168	*p++ = (ch2 >> 8) \| (ch2 << 8);
				1169	}else{
				1170	*p++ = ch;
				1171	if(ch2)
				1172	*p++ = ch2;
				1173	}
				1174	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	return v;
				1176	}
				1177
				1178	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1179	{
				1180	if (!PyUnicode_Check(unicode)) {
				1181	PyErr_BadArgument();
				1182	return NULL;
				1183	}
				1184	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1185	PyUnicode_GET_SIZE(unicode),
				1186	NULL,
				1187	0);
				1188	}
				1189
				1190	/* --- Unicode Escape Codec ----------------------------------------------- */
				1191
				1192	static
				1193	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1194	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1195	const char *errors,
				1196	const char *details)
				1197	{
				1198	if ((errors == NULL) \|\|
				1199	(strcmp(errors,"strict") == 0)) {
				1200	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1201	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1202	details);
				1203	return -1;
				1204	}
				1205	else if (strcmp(errors,"ignore") == 0) {
				1206	return 0;
				1207	}
				1208	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1209	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	return 0;
				1211	}
				1212	else {
				1213	PyErr_Format(PyExc_ValueError,
				1214	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1215	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1216	errors);
				1217	return -1;
				1218	}
				1219	}
				1220
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1221	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1222
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1223	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1224	int size,
				1225	const char *errors)
				1226	{
				1227	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1228	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1229	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1230	char* message;
				1231	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1232
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1233	/* Escaped strings will always be longer than the resulting
				1234	Unicode string, so we start with size here and then reduce the
				1235	length after conversion to the true value. */
				1236	v = _PyUnicode_New(size);
				1237	if (v == NULL)
				1238	goto onError;
				1239	if (size == 0)
				1240	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1241
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1242	p = buf = PyUnicode_AS_UNICODE(v);
				1243	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1244
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1245	while (s < end) {
				1246	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1247	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1248	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1249
				1250	/* Non-escape characters are interpreted as Unicode ordinals */
				1251	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1252	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1253	continue;
				1254	}
				1255
				1256	/* \ - Escapes */
				1257	s++;
				1258	switch (*s++) {
				1259
				1260	/* \x escapes */
				1261	case '\n': break;
				1262	case '\\': *p++ = '\\'; break;
				1263	case '\'': *p++ = '\''; break;
				1264	case '\"': *p++ = '\"'; break;
				1265	case 'b': *p++ = '\b'; break;
				1266	case 'f': p++ = '\014'; break; / FF */
				1267	case 't': *p++ = '\t'; break;
				1268	case 'n': *p++ = '\n'; break;
				1269	case 'r': *p++ = '\r'; break;
				1270	case 'v': p++ = '\013'; break; / VT */
				1271	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1272
				1273	/* \OOO (octal) escapes */
				1274	case '0': case '1': case '2': case '3':
				1275	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1276	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1277	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1278	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1279	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1280	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1281	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1282	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1283	break;
				1284
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1285	/* hex escapes */
				1286	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1287	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1288	digits = 2;
				1289	message = "truncated \\xXX escape";
				1290	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1291
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1292	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1293	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1294	digits = 4;
				1295	message = "truncated \\uXXXX escape";
				1296	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1297
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1298	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1299	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1300	digits = 8;
				1301	message = "truncated \\UXXXXXXXX escape";
				1302	hexescape:
				1303	chr = 0;
				1304	for (i = 0; i < digits; i++) {
				1305	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1306	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1307	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1308	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1309	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1310	i++;
				1311	break;
				1312	}
				1313	chr = (chr<<4) & ~0xF;
				1314	if (c >= '0' && c <= '9')
				1315	chr += c - '0';
				1316	else if (c >= 'a' && c <= 'f')
				1317	chr += 10 + c - 'a';
				1318	else
				1319	chr += 10 + c - 'A';
				1320	}
				1321	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1322	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1323	/* when we get here, chr is a 32-bit unicode character */
				1324	if (chr <= 0xffff)
				1325	/* UCS-2 character */
				1326	*p++ = (Py_UNICODE) chr;
				1327	else if (chr <= 0x10ffff) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1328	/* UCS-4 character. Either store directly, or as surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1329	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1330	*p++ = chr;
				1331	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1332	chr -= 0x10000L;
				1333	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1334	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1335	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1336	} else {
				1337	if (unicodeescape_decoding_error(
				1338	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1339	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1340	)
				1341	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1342	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1343	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1344	break;
				1345
				1346	/* \N{name} */
				1347	case 'N':
				1348	message = "malformed \\N character escape";
				1349	if (ucnhash_CAPI == NULL) {
				1350	/* load the unicode data module */
				1351	PyObject m, v;
				1352	m = PyImport_ImportModule("unicodedata");
				1353	if (m == NULL)
				1354	goto ucnhashError;
				1355	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1356	Py_DECREF(m);
				1357	if (v == NULL)
				1358	goto ucnhashError;
				1359	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1360	Py_DECREF(v);
				1361	if (ucnhash_CAPI == NULL)
				1362	goto ucnhashError;
				1363	}
				1364	if (*s == '{') {
				1365	const char *start = s+1;
				1366	/* look for the closing brace */
				1367	while (*s != '}' && s < end)
				1368	s++;
				1369	if (s > start && s < end && *s == '}') {
				1370	/* found a name. look it up in the unicode database */
				1371	message = "unknown Unicode character name";
				1372	s++;
				1373	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1374	goto store;
				1375	}
				1376	}
				1377	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1378	goto onError;
				1379	*p++ = x;
				1380	break;
				1381
				1382	default:
				1383	*p++ = '\\';
				1384	*p++ = (unsigned char)s[-1];
				1385	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1386	}
				1387	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1388	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1389	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1390	return (PyObject *)v;
				1391
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1392	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1393	PyErr_SetString(
				1394	PyExc_UnicodeError,
				1395	"\\N escapes not supported (can't load unicodedata module)"
				1396	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1397	return NULL;
				1398
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1399	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1400	Py_XDECREF(v);
				1401	return NULL;
				1402	}
				1403
				1404	/* Return a Unicode-Escape string version of the Unicode object.
				1405
				1406	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1407	appropriate.
				1408
				1409	*/
				1410
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1411	static const Py_UNICODE findchar(const Py_UNICODE s,
				1412	int size,
				1413	Py_UNICODE ch);
				1414
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	static
				1416	PyObject unicodeescape_string(const Py_UNICODE s,
				1417	int size,
				1418	int quotes)
				1419	{
				1420	PyObject *repr;
				1421	char *p;
				1422	char *q;
				1423
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1424	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1425
				1426	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1427	if (repr == NULL)
				1428	return NULL;
				1429
				1430	p = q = PyString_AS_STRING(repr);
				1431
				1432	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1433	*p++ = 'u';
				1434	*p++ = (findchar(s, size, '\'') &&
				1435	!findchar(s, size, '"')) ? '"' : '\'';
				1436	}
				1437	while (size-- > 0) {
				1438	Py_UNICODE ch = *s++;
				1439	/* Escape quotes */
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1440	if (quotes && (ch == (Py_UNICODE) q[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1441	*p++ = '\\';
				1442	*p++ = (char) ch;
				1443	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame^]	1444	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1445	/* Map 21-bit characters to '\U00xxxxxx' */
				1446	else if (ch >= 0x10000) {
				1447	*p++ = '\\';
				1448	*p++ = 'U';
				1449	*p++ = hexdigit[(ch >> 28) & 0xf];
				1450	*p++ = hexdigit[(ch >> 24) & 0xf];
				1451	*p++ = hexdigit[(ch >> 20) & 0xf];
				1452	*p++ = hexdigit[(ch >> 16) & 0xf];
				1453	*p++ = hexdigit[(ch >> 12) & 0xf];
				1454	*p++ = hexdigit[(ch >> 8) & 0xf];
				1455	*p++ = hexdigit[(ch >> 4) & 0xf];
				1456	*p++ = hexdigit[ch & 15];
				1457	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame^]	1458	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1459	/* Map 16-bit characters to '\uxxxx' */
				1460	else if (ch >= 256) {
				1461	*p++ = '\\';
				1462	*p++ = 'u';
				1463	*p++ = hexdigit[(ch >> 12) & 0xf];
				1464	*p++ = hexdigit[(ch >> 8) & 0xf];
				1465	*p++ = hexdigit[(ch >> 4) & 0xf];
				1466	*p++ = hexdigit[ch & 15];
				1467	}
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1468	/* Map special whitespace to '\t', \n', '\r' */
				1469	else if (ch == '\t') {
				1470	*p++ = '\\';
				1471	*p++ = 't';
				1472	}
				1473	else if (ch == '\n') {
				1474	*p++ = '\\';
				1475	*p++ = 'n';
				1476	}
				1477	else if (ch == '\r') {
				1478	*p++ = '\\';
				1479	*p++ = 'r';
				1480	}
				1481	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1482	else if (ch < ' ' \|\| ch >= 128) {
				1483	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1484	*p++ = 'x';
				1485	*p++ = hexdigit[(ch >> 4) & 0xf];
				1486	*p++ = hexdigit[ch & 15];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1487	}
				1488	/* Copy everything else as-is */
				1489	else
				1490	*p++ = (char) ch;
				1491	}
				1492	if (quotes)
				1493	*p++ = q[1];
				1494
				1495	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1496	if (_PyString_Resize(&repr, p - q))
				1497	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1498
				1499	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1500
				1501	onError:
				1502	Py_DECREF(repr);
				1503	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1504	}
				1505
				1506	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1507	int size)
				1508	{
				1509	return unicodeescape_string(s, size, 0);
				1510	}
				1511
				1512	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1513	{
				1514	if (!PyUnicode_Check(unicode)) {
				1515	PyErr_BadArgument();
				1516	return NULL;
				1517	}
				1518	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1519	PyUnicode_GET_SIZE(unicode));
				1520	}
				1521
				1522	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1523
				1524	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1525	int size,
				1526	const char *errors)
				1527	{
				1528	PyUnicodeObject *v;
				1529	Py_UNICODE p, buf;
				1530	const char *end;
				1531	const char *bs;
				1532
				1533	/* Escaped strings will always be longer than the resulting
				1534	Unicode string, so we start with size here and then reduce the
				1535	length after conversion to the true value. */
				1536	v = _PyUnicode_New(size);
				1537	if (v == NULL)
				1538	goto onError;
				1539	if (size == 0)
				1540	return (PyObject *)v;
				1541	p = buf = PyUnicode_AS_UNICODE(v);
				1542	end = s + size;
				1543	while (s < end) {
				1544	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1545	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1546	int i;
				1547
				1548	/* Non-escape characters are interpreted as Unicode ordinals */
				1549	if (*s != '\\') {
				1550	p++ = (unsigned char)s++;
				1551	continue;
				1552	}
				1553
				1554	/* \u-escapes are only interpreted iff the number of leading
				1555	backslashes if odd */
				1556	bs = s;
				1557	for (;s < end;) {
				1558	if (*s != '\\')
				1559	break;
				1560	p++ = (unsigned char)s++;
				1561	}
				1562	if (((s - bs) & 1) == 0 \|\|
				1563	s >= end \|\|
				1564	*s != 'u') {
				1565	continue;
				1566	}
				1567	p--;
				1568	s++;
				1569
				1570	/* \uXXXX with 4 hex digits */
				1571	for (x = 0, i = 0; i < 4; i++) {
				1572	c = (unsigned char)s[i];
				1573	if (!isxdigit(c)) {
				1574	if (unicodeescape_decoding_error(&s, &x, errors,
				1575	"truncated \\uXXXX"))
				1576	goto onError;
				1577	i++;
				1578	break;
				1579	}
				1580	x = (x<<4) & ~0xF;
				1581	if (c >= '0' && c <= '9')
				1582	x += c - '0';
				1583	else if (c >= 'a' && c <= 'f')
				1584	x += 10 + c - 'a';
				1585	else
				1586	x += 10 + c - 'A';
				1587	}
				1588	s += i;
				1589	*p++ = x;
				1590	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1591	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1592	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1593	return (PyObject *)v;
				1594
				1595	onError:
				1596	Py_XDECREF(v);
				1597	return NULL;
				1598	}
				1599
				1600	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1601	int size)
				1602	{
				1603	PyObject *repr;
				1604	char *p;
				1605	char *q;
				1606
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1607	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1608
				1609	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1610	if (repr == NULL)
				1611	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1612	if (size == 0)
				1613	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1614
				1615	p = q = PyString_AS_STRING(repr);
				1616	while (size-- > 0) {
				1617	Py_UNICODE ch = *s++;
				1618	/* Map 16-bit characters to '\uxxxx' */
				1619	if (ch >= 256) {
				1620	*p++ = '\\';
				1621	*p++ = 'u';
				1622	*p++ = hexdigit[(ch >> 12) & 0xf];
				1623	*p++ = hexdigit[(ch >> 8) & 0xf];
				1624	*p++ = hexdigit[(ch >> 4) & 0xf];
				1625	*p++ = hexdigit[ch & 15];
				1626	}
				1627	/* Copy everything else as-is */
				1628	else
				1629	*p++ = (char) ch;
				1630	}
				1631	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1632	if (_PyString_Resize(&repr, p - q))
				1633	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1634
				1635	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1636
				1637	onError:
				1638	Py_DECREF(repr);
				1639	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1640	}
				1641
				1642	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1643	{
				1644	if (!PyUnicode_Check(unicode)) {
				1645	PyErr_BadArgument();
				1646	return NULL;
				1647	}
				1648	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1649	PyUnicode_GET_SIZE(unicode));
				1650	}
				1651
				1652	/* --- Latin-1 Codec ------------------------------------------------------ */
				1653
				1654	PyObject PyUnicode_DecodeLatin1(const char s,
				1655	int size,
				1656	const char *errors)
				1657	{
				1658	PyUnicodeObject *v;
				1659	Py_UNICODE *p;
				1660
				1661	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1662	if (size == 1 && (unsigned char)s < 256) {
				1663	Py_UNICODE r = (unsigned char)s;
				1664	return PyUnicode_FromUnicode(&r, 1);
				1665	}
				1666
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1667	v = _PyUnicode_New(size);
				1668	if (v == NULL)
				1669	goto onError;
				1670	if (size == 0)
				1671	return (PyObject *)v;
				1672	p = PyUnicode_AS_UNICODE(v);
				1673	while (size-- > 0)
				1674	p++ = (unsigned char)s++;
				1675	return (PyObject *)v;
				1676
				1677	onError:
				1678	Py_XDECREF(v);
				1679	return NULL;
				1680	}
				1681
				1682	static
				1683	int latin1_encoding_error(const Py_UNICODE **source,
				1684	char **dest,
				1685	const char *errors,
				1686	const char *details)
				1687	{
				1688	if ((errors == NULL) \|\|
				1689	(strcmp(errors,"strict") == 0)) {
				1690	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1691	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1692	details);
				1693	return -1;
				1694	}
				1695	else if (strcmp(errors,"ignore") == 0) {
				1696	return 0;
				1697	}
				1698	else if (strcmp(errors,"replace") == 0) {
				1699	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1700	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1701	return 0;
				1702	}
				1703	else {
				1704	PyErr_Format(PyExc_ValueError,
				1705	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1706	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1707	errors);
				1708	return -1;
				1709	}
				1710	}
				1711
				1712	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1713	int size,
				1714	const char *errors)
				1715	{
				1716	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1717	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1718
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1719	repr = PyString_FromStringAndSize(NULL, size);
				1720	if (repr == NULL)
				1721	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1722	if (size == 0)
				1723	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1724
				1725	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1726	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1727	while (size-- > 0) {
				1728	Py_UNICODE ch = *p++;
				1729	if (ch >= 256) {
				1730	if (latin1_encoding_error(&p, &s, errors,
				1731	"ordinal not in range(256)"))
				1732	goto onError;
				1733	}
				1734	else
				1735	*s++ = (char)ch;
				1736	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1737	/* Resize if error handling skipped some characters */
				1738	if (s - start < PyString_GET_SIZE(repr))
				1739	if (_PyString_Resize(&repr, s - start))
				1740	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1741	return repr;
				1742
				1743	onError:
				1744	Py_DECREF(repr);
				1745	return NULL;
				1746	}
				1747
				1748	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1749	{
				1750	if (!PyUnicode_Check(unicode)) {
				1751	PyErr_BadArgument();
				1752	return NULL;
				1753	}
				1754	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1755	PyUnicode_GET_SIZE(unicode),
				1756	NULL);
				1757	}
				1758
				1759	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1760
				1761	static
				1762	int ascii_decoding_error(const char **source,
				1763	Py_UNICODE **dest,
				1764	const char *errors,
				1765	const char *details)
				1766	{
				1767	if ((errors == NULL) \|\|
				1768	(strcmp(errors,"strict") == 0)) {
				1769	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1770	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1771	details);
				1772	return -1;
				1773	}
				1774	else if (strcmp(errors,"ignore") == 0) {
				1775	return 0;
				1776	}
				1777	else if (strcmp(errors,"replace") == 0) {
				1778	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1779	(*dest)++;
				1780	return 0;
				1781	}
				1782	else {
				1783	PyErr_Format(PyExc_ValueError,
				1784	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1785	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1786	errors);
				1787	return -1;
				1788	}
				1789	}
				1790
				1791	PyObject PyUnicode_DecodeASCII(const char s,
				1792	int size,
				1793	const char *errors)
				1794	{
				1795	PyUnicodeObject *v;
				1796	Py_UNICODE *p;
				1797
				1798	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1799	if (size == 1 && (unsigned char)s < 128) {
				1800	Py_UNICODE r = (unsigned char)s;
				1801	return PyUnicode_FromUnicode(&r, 1);
				1802	}
				1803
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1804	v = _PyUnicode_New(size);
				1805	if (v == NULL)
				1806	goto onError;
				1807	if (size == 0)
				1808	return (PyObject *)v;
				1809	p = PyUnicode_AS_UNICODE(v);
				1810	while (size-- > 0) {
				1811	register unsigned char c;
				1812
				1813	c = (unsigned char)*s++;
				1814	if (c < 128)
				1815	*p++ = c;
				1816	else if (ascii_decoding_error(&s, &p, errors,
				1817	"ordinal not in range(128)"))
				1818	goto onError;
				1819	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1820	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1821	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1822	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1823	return (PyObject *)v;
				1824
				1825	onError:
				1826	Py_XDECREF(v);
				1827	return NULL;
				1828	}
				1829
				1830	static
				1831	int ascii_encoding_error(const Py_UNICODE **source,
				1832	char **dest,
				1833	const char *errors,
				1834	const char *details)
				1835	{
				1836	if ((errors == NULL) \|\|
				1837	(strcmp(errors,"strict") == 0)) {
				1838	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1839	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1840	details);
				1841	return -1;
				1842	}
				1843	else if (strcmp(errors,"ignore") == 0) {
				1844	return 0;
				1845	}
				1846	else if (strcmp(errors,"replace") == 0) {
				1847	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1848	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1849	return 0;
				1850	}
				1851	else {
				1852	PyErr_Format(PyExc_ValueError,
				1853	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1854	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1855	errors);
				1856	return -1;
				1857	}
				1858	}
				1859
				1860	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1861	int size,
				1862	const char *errors)
				1863	{
				1864	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1865	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1866
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1867	repr = PyString_FromStringAndSize(NULL, size);
				1868	if (repr == NULL)
				1869	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1870	if (size == 0)
				1871	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1872
				1873	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1874	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1875	while (size-- > 0) {
				1876	Py_UNICODE ch = *p++;
				1877	if (ch >= 128) {
				1878	if (ascii_encoding_error(&p, &s, errors,
				1879	"ordinal not in range(128)"))
				1880	goto onError;
				1881	}
				1882	else
				1883	*s++ = (char)ch;
				1884	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1885	/* Resize if error handling skipped some characters */
				1886	if (s - start < PyString_GET_SIZE(repr))
				1887	if (_PyString_Resize(&repr, s - start))
				1888	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1889	return repr;
				1890
				1891	onError:
				1892	Py_DECREF(repr);
				1893	return NULL;
				1894	}
				1895
				1896	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1897	{
				1898	if (!PyUnicode_Check(unicode)) {
				1899	PyErr_BadArgument();
				1900	return NULL;
				1901	}
				1902	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1903	PyUnicode_GET_SIZE(unicode),
				1904	NULL);
				1905	}
				1906
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1907	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1908
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1909	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1910
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1911	PyObject PyUnicode_DecodeMBCS(const char s,
				1912	int size,
				1913	const char *errors)
				1914	{
				1915	PyUnicodeObject *v;
				1916	Py_UNICODE *p;
				1917
				1918	/* First get the size of the result */
				1919	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1920	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1921	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1922
				1923	v = _PyUnicode_New(usize);
				1924	if (v == NULL)
				1925	return NULL;
				1926	if (usize == 0)
				1927	return (PyObject *)v;
				1928	p = PyUnicode_AS_UNICODE(v);
				1929	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1930	Py_DECREF(v);
				1931	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1932	}
				1933
				1934	return (PyObject *)v;
				1935	}
				1936
				1937	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1938	int size,
				1939	const char *errors)
				1940	{
				1941	PyObject *repr;
				1942	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1943	DWORD mbcssize;
				1944
				1945	/* If there are no characters, bail now! */
				1946	if (size==0)
				1947	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1948
				1949	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1950	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1951	if (mbcssize==0)
				1952	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1953
				1954	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1955	if (repr == NULL)
				1956	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1957	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1958	return repr;
				1959
				1960	/* Do the conversion */
				1961	s = PyString_AS_STRING(repr);
				1962	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1963	Py_DECREF(repr);
				1964	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1965	}
				1966	return repr;
				1967	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1968
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1969	#endif /* MS_WIN32 */
				1970
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1971	/* --- Character Mapping Codec -------------------------------------------- */
				1972
				1973	static
				1974	int charmap_decoding_error(const char **source,
				1975	Py_UNICODE **dest,
				1976	const char *errors,
				1977	const char *details)
				1978	{
				1979	if ((errors == NULL) \|\|
				1980	(strcmp(errors,"strict") == 0)) {
				1981	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1982	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1983	details);
				1984	return -1;
				1985	}
				1986	else if (strcmp(errors,"ignore") == 0) {
				1987	return 0;
				1988	}
				1989	else if (strcmp(errors,"replace") == 0) {
				1990	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1991	(*dest)++;
				1992	return 0;
				1993	}
				1994	else {
				1995	PyErr_Format(PyExc_ValueError,
				1996	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1997	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1998	errors);
				1999	return -1;
				2000	}
				2001	}
				2002
				2003	PyObject PyUnicode_DecodeCharmap(const char s,
				2004	int size,
				2005	PyObject *mapping,
				2006	const char *errors)
				2007	{
				2008	PyUnicodeObject *v;
				2009	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2010	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2011
				2012	/* Default to Latin-1 */
				2013	if (mapping == NULL)
				2014	return PyUnicode_DecodeLatin1(s, size, errors);
				2015
				2016	v = _PyUnicode_New(size);
				2017	if (v == NULL)
				2018	goto onError;
				2019	if (size == 0)
				2020	return (PyObject *)v;
				2021	p = PyUnicode_AS_UNICODE(v);
				2022	while (size-- > 0) {
				2023	unsigned char ch = *s++;
				2024	PyObject w, x;
				2025
				2026	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2027	w = PyInt_FromLong((long)ch);
				2028	if (w == NULL)
				2029	goto onError;
				2030	x = PyObject_GetItem(mapping, w);
				2031	Py_DECREF(w);
				2032	if (x == NULL) {
				2033	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2034	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2035	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2036	x = Py_None;
				2037	Py_INCREF(x);
				2038	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2039	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2040	}
				2041
				2042	/* Apply mapping */
				2043	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2044	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2045	if (value < 0 \|\| value > 65535) {
				2046	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2047	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2048	Py_DECREF(x);
				2049	goto onError;
				2050	}
				2051	*p++ = (Py_UNICODE)value;
				2052	}
				2053	else if (x == Py_None) {
				2054	/* undefined mapping */
				2055	if (charmap_decoding_error(&s, &p, errors,
				2056	"character maps to <undefined>")) {
				2057	Py_DECREF(x);
				2058	goto onError;
				2059	}
				2060	}
				2061	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2062	int targetsize = PyUnicode_GET_SIZE(x);
				2063
				2064	if (targetsize == 1)
				2065	/* 1-1 mapping */
				2066	p++ = PyUnicode_AS_UNICODE(x);
				2067
				2068	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2069	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2070	if (targetsize > extrachars) {
				2071	/* resize first */
				2072	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2073	int needed = (targetsize - extrachars) + \
				2074	(targetsize << 2);
				2075	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2076	if (_PyUnicode_Resize(&v,
				2077	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2078	Py_DECREF(x);
				2079	goto onError;
				2080	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2081	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2082	}
				2083	Py_UNICODE_COPY(p,
				2084	PyUnicode_AS_UNICODE(x),
				2085	targetsize);
				2086	p += targetsize;
				2087	extrachars -= targetsize;
				2088	}
				2089	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2090	}
				2091	else {
				2092	/* wrong return value */
				2093	PyErr_SetString(PyExc_TypeError,
				2094	"character mapping must return integer, None or unicode");
				2095	Py_DECREF(x);
				2096	goto onError;
				2097	}
				2098	Py_DECREF(x);
				2099	}
				2100	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2101	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2102	goto onError;
				2103	return (PyObject *)v;
				2104
				2105	onError:
				2106	Py_XDECREF(v);
				2107	return NULL;
				2108	}
				2109
				2110	static
				2111	int charmap_encoding_error(const Py_UNICODE **source,
				2112	char **dest,
				2113	const char *errors,
				2114	const char *details)
				2115	{
				2116	if ((errors == NULL) \|\|
				2117	(strcmp(errors,"strict") == 0)) {
				2118	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2119	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2120	details);
				2121	return -1;
				2122	}
				2123	else if (strcmp(errors,"ignore") == 0) {
				2124	return 0;
				2125	}
				2126	else if (strcmp(errors,"replace") == 0) {
				2127	**dest = '?';
				2128	(*dest)++;
				2129	return 0;
				2130	}
				2131	else {
				2132	PyErr_Format(PyExc_ValueError,
				2133	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2134	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2135	errors);
				2136	return -1;
				2137	}
				2138	}
				2139
				2140	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2141	int size,
				2142	PyObject *mapping,
				2143	const char *errors)
				2144	{
				2145	PyObject *v;
				2146	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2147	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2148
				2149	/* Default to Latin-1 */
				2150	if (mapping == NULL)
				2151	return PyUnicode_EncodeLatin1(p, size, errors);
				2152
				2153	v = PyString_FromStringAndSize(NULL, size);
				2154	if (v == NULL)
				2155	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2156	if (size == 0)
				2157	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2158	s = PyString_AS_STRING(v);
				2159	while (size-- > 0) {
				2160	Py_UNICODE ch = *p++;
				2161	PyObject w, x;
				2162
				2163	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2164	w = PyInt_FromLong((long)ch);
				2165	if (w == NULL)
				2166	goto onError;
				2167	x = PyObject_GetItem(mapping, w);
				2168	Py_DECREF(w);
				2169	if (x == NULL) {
				2170	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2171	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2172	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2173	x = Py_None;
				2174	Py_INCREF(x);
				2175	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2176	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2177	}
				2178
				2179	/* Apply mapping */
				2180	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2181	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2182	if (value < 0 \|\| value > 255) {
				2183	PyErr_SetString(PyExc_TypeError,
				2184	"character mapping must be in range(256)");
				2185	Py_DECREF(x);
				2186	goto onError;
				2187	}
				2188	*s++ = (char)value;
				2189	}
				2190	else if (x == Py_None) {
				2191	/* undefined mapping */
				2192	if (charmap_encoding_error(&p, &s, errors,
				2193	"character maps to <undefined>")) {
				2194	Py_DECREF(x);
				2195	goto onError;
				2196	}
				2197	}
				2198	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2199	int targetsize = PyString_GET_SIZE(x);
				2200
				2201	if (targetsize == 1)
				2202	/* 1-1 mapping */
				2203	s++ = PyString_AS_STRING(x);
				2204
				2205	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2206	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2207	if (targetsize > extrachars) {
				2208	/* resize first */
				2209	int oldpos = (int)(s - PyString_AS_STRING(v));
				2210	int needed = (targetsize - extrachars) + \
				2211	(targetsize << 2);
				2212	extrachars += needed;
				2213	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2214	Py_DECREF(x);
				2215	goto onError;
				2216	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2217	s = PyString_AS_STRING(v) + oldpos;
				2218	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2219	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2220	s += targetsize;
				2221	extrachars -= targetsize;
				2222	}
				2223	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2224	}
				2225	else {
				2226	/* wrong return value */
				2227	PyErr_SetString(PyExc_TypeError,
				2228	"character mapping must return integer, None or unicode");
				2229	Py_DECREF(x);
				2230	goto onError;
				2231	}
				2232	Py_DECREF(x);
				2233	}
				2234	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2235	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2236	goto onError;
				2237	return v;
				2238
				2239	onError:
				2240	Py_DECREF(v);
				2241	return NULL;
				2242	}
				2243
				2244	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2245	PyObject *mapping)
				2246	{
				2247	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2248	PyErr_BadArgument();
				2249	return NULL;
				2250	}
				2251	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2252	PyUnicode_GET_SIZE(unicode),
				2253	mapping,
				2254	NULL);
				2255	}
				2256
				2257	static
				2258	int translate_error(const Py_UNICODE **source,
				2259	Py_UNICODE **dest,
				2260	const char *errors,
				2261	const char *details)
				2262	{
				2263	if ((errors == NULL) \|\|
				2264	(strcmp(errors,"strict") == 0)) {
				2265	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2266	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2267	details);
				2268	return -1;
				2269	}
				2270	else if (strcmp(errors,"ignore") == 0) {
				2271	return 0;
				2272	}
				2273	else if (strcmp(errors,"replace") == 0) {
				2274	**dest = '?';
				2275	(*dest)++;
				2276	return 0;
				2277	}
				2278	else {
				2279	PyErr_Format(PyExc_ValueError,
				2280	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2281	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2282	errors);
				2283	return -1;
				2284	}
				2285	}
				2286
				2287	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2288	int size,
				2289	PyObject *mapping,
				2290	const char *errors)
				2291	{
				2292	PyUnicodeObject *v;
				2293	Py_UNICODE *p;
				2294
				2295	if (mapping == NULL) {
				2296	PyErr_BadArgument();
				2297	return NULL;
				2298	}
				2299
				2300	/* Output will never be longer than input */
				2301	v = _PyUnicode_New(size);
				2302	if (v == NULL)
				2303	goto onError;
				2304	if (size == 0)
				2305	goto done;
				2306	p = PyUnicode_AS_UNICODE(v);
				2307	while (size-- > 0) {
				2308	Py_UNICODE ch = *s++;
				2309	PyObject w, x;
				2310
				2311	/* Get mapping */
				2312	w = PyInt_FromLong(ch);
				2313	if (w == NULL)
				2314	goto onError;
				2315	x = PyObject_GetItem(mapping, w);
				2316	Py_DECREF(w);
				2317	if (x == NULL) {
				2318	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2319	/* No mapping found: default to 1-1 mapping */
				2320	PyErr_Clear();
				2321	*p++ = ch;
				2322	continue;
				2323	}
				2324	goto onError;
				2325	}
				2326
				2327	/* Apply mapping */
				2328	if (PyInt_Check(x))
				2329	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2330	else if (x == Py_None) {
				2331	/* undefined mapping */
				2332	if (translate_error(&s, &p, errors,
				2333	"character maps to <undefined>")) {
				2334	Py_DECREF(x);
				2335	goto onError;
				2336	}
				2337	}
				2338	else if (PyUnicode_Check(x)) {
				2339	if (PyUnicode_GET_SIZE(x) != 1) {
				2340	/* 1-n mapping */
				2341	PyErr_SetString(PyExc_NotImplementedError,
				2342	"1-n mappings are currently not implemented");
				2343	Py_DECREF(x);
				2344	goto onError;
				2345	}
				2346	p++ = PyUnicode_AS_UNICODE(x);
				2347	}
				2348	else {
				2349	/* wrong return value */
				2350	PyErr_SetString(PyExc_TypeError,
				2351	"translate mapping must return integer, None or unicode");
				2352	Py_DECREF(x);
				2353	goto onError;
				2354	}
				2355	Py_DECREF(x);
				2356	}
				2357	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2358	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2359	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2360
				2361	done:
				2362	return (PyObject *)v;
				2363
				2364	onError:
				2365	Py_XDECREF(v);
				2366	return NULL;
				2367	}
				2368
				2369	PyObject PyUnicode_Translate(PyObject str,
				2370	PyObject *mapping,
				2371	const char *errors)
				2372	{
				2373	PyObject *result;
				2374
				2375	str = PyUnicode_FromObject(str);
				2376	if (str == NULL)
				2377	goto onError;
				2378	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2379	PyUnicode_GET_SIZE(str),
				2380	mapping,
				2381	errors);
				2382	Py_DECREF(str);
				2383	return result;
				2384
				2385	onError:
				2386	Py_XDECREF(str);
				2387	return NULL;
				2388	}
				2389
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2390	/* --- Decimal Encoder ---------------------------------------------------- */
				2391
				2392	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2393	int length,
				2394	char *output,
				2395	const char *errors)
				2396	{
				2397	Py_UNICODE p, end;
				2398
				2399	if (output == NULL) {
				2400	PyErr_BadArgument();
				2401	return -1;
				2402	}
				2403
				2404	p = s;
				2405	end = s + length;
				2406	while (p < end) {
				2407	register Py_UNICODE ch = *p++;
				2408	int decimal;
				2409
				2410	if (Py_UNICODE_ISSPACE(ch)) {
				2411	*output++ = ' ';
				2412	continue;
				2413	}
				2414	decimal = Py_UNICODE_TODECIMAL(ch);
				2415	if (decimal >= 0) {
				2416	*output++ = '0' + decimal;
				2417	continue;
				2418	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2419	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2420	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2421	continue;
				2422	}
				2423	/* All other characters are considered invalid */
				2424	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2425	PyErr_SetString(PyExc_ValueError,
				2426	"invalid decimal Unicode string");
				2427	goto onError;
				2428	}
				2429	else if (strcmp(errors, "ignore") == 0)
				2430	continue;
				2431	else if (strcmp(errors, "replace") == 0) {
				2432	*output++ = '?';
				2433	continue;
				2434	}
				2435	}
				2436	/* 0-terminate the output string */
				2437	*output++ = '\0';
				2438	return 0;
				2439
				2440	onError:
				2441	return -1;
				2442	}
				2443
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2444	/* --- Helpers ------------------------------------------------------------ */
				2445
				2446	static
				2447	int count(PyUnicodeObject *self,
				2448	int start,
				2449	int end,
				2450	PyUnicodeObject *substring)
				2451	{
				2452	int count = 0;
				2453
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2454	if (start < 0)
				2455	start += self->length;
				2456	if (start < 0)
				2457	start = 0;
				2458	if (end > self->length)
				2459	end = self->length;
				2460	if (end < 0)
				2461	end += self->length;
				2462	if (end < 0)
				2463	end = 0;
				2464
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2465	if (substring->length == 0)
				2466	return (end - start + 1);
				2467
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2468	end -= substring->length;
				2469
				2470	while (start <= end)
				2471	if (Py_UNICODE_MATCH(self, start, substring)) {
				2472	count++;
				2473	start += substring->length;
				2474	} else
				2475	start++;
				2476
				2477	return count;
				2478	}
				2479
				2480	int PyUnicode_Count(PyObject *str,
				2481	PyObject *substr,
				2482	int start,
				2483	int end)
				2484	{
				2485	int result;
				2486
				2487	str = PyUnicode_FromObject(str);
				2488	if (str == NULL)
				2489	return -1;
				2490	substr = PyUnicode_FromObject(substr);
				2491	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2492	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2493	return -1;
				2494	}
				2495
				2496	result = count((PyUnicodeObject *)str,
				2497	start, end,
				2498	(PyUnicodeObject *)substr);
				2499
				2500	Py_DECREF(str);
				2501	Py_DECREF(substr);
				2502	return result;
				2503	}
				2504
				2505	static
				2506	int findstring(PyUnicodeObject *self,
				2507	PyUnicodeObject *substring,
				2508	int start,
				2509	int end,
				2510	int direction)
				2511	{
				2512	if (start < 0)
				2513	start += self->length;
				2514	if (start < 0)
				2515	start = 0;
				2516
				2517	if (substring->length == 0)
				2518	return start;
				2519
				2520	if (end > self->length)
				2521	end = self->length;
				2522	if (end < 0)
				2523	end += self->length;
				2524	if (end < 0)
				2525	end = 0;
				2526
				2527	end -= substring->length;
				2528
				2529	if (direction < 0) {
				2530	for (; end >= start; end--)
				2531	if (Py_UNICODE_MATCH(self, end, substring))
				2532	return end;
				2533	} else {
				2534	for (; start <= end; start++)
				2535	if (Py_UNICODE_MATCH(self, start, substring))
				2536	return start;
				2537	}
				2538
				2539	return -1;
				2540	}
				2541
				2542	int PyUnicode_Find(PyObject *str,
				2543	PyObject *substr,
				2544	int start,
				2545	int end,
				2546	int direction)
				2547	{
				2548	int result;
				2549
				2550	str = PyUnicode_FromObject(str);
				2551	if (str == NULL)
				2552	return -1;
				2553	substr = PyUnicode_FromObject(substr);
				2554	if (substr == NULL) {
				2555	Py_DECREF(substr);
				2556	return -1;
				2557	}
				2558
				2559	result = findstring((PyUnicodeObject *)str,
				2560	(PyUnicodeObject *)substr,
				2561	start, end, direction);
				2562	Py_DECREF(str);
				2563	Py_DECREF(substr);
				2564	return result;
				2565	}
				2566
				2567	static
				2568	int tailmatch(PyUnicodeObject *self,
				2569	PyUnicodeObject *substring,
				2570	int start,
				2571	int end,
				2572	int direction)
				2573	{
				2574	if (start < 0)
				2575	start += self->length;
				2576	if (start < 0)
				2577	start = 0;
				2578
				2579	if (substring->length == 0)
				2580	return 1;
				2581
				2582	if (end > self->length)
				2583	end = self->length;
				2584	if (end < 0)
				2585	end += self->length;
				2586	if (end < 0)
				2587	end = 0;
				2588
				2589	end -= substring->length;
				2590	if (end < start)
				2591	return 0;
				2592
				2593	if (direction > 0) {
				2594	if (Py_UNICODE_MATCH(self, end, substring))
				2595	return 1;
				2596	} else {
				2597	if (Py_UNICODE_MATCH(self, start, substring))
				2598	return 1;
				2599	}
				2600
				2601	return 0;
				2602	}
				2603
				2604	int PyUnicode_Tailmatch(PyObject *str,
				2605	PyObject *substr,
				2606	int start,
				2607	int end,
				2608	int direction)
				2609	{
				2610	int result;
				2611
				2612	str = PyUnicode_FromObject(str);
				2613	if (str == NULL)
				2614	return -1;
				2615	substr = PyUnicode_FromObject(substr);
				2616	if (substr == NULL) {
				2617	Py_DECREF(substr);
				2618	return -1;
				2619	}
				2620
				2621	result = tailmatch((PyUnicodeObject *)str,
				2622	(PyUnicodeObject *)substr,
				2623	start, end, direction);
				2624	Py_DECREF(str);
				2625	Py_DECREF(substr);
				2626	return result;
				2627	}
				2628
				2629	static
				2630	const Py_UNICODE findchar(const Py_UNICODE s,
				2631	int size,
				2632	Py_UNICODE ch)
				2633	{
				2634	/* like wcschr, but doesn't stop at NULL characters */
				2635
				2636	while (size-- > 0) {
				2637	if (*s == ch)
				2638	return s;
				2639	s++;
				2640	}
				2641
				2642	return NULL;
				2643	}
				2644
				2645	/* Apply fixfct filter to the Unicode object self and return a
				2646	reference to the modified object */
				2647
				2648	static
				2649	PyObject fixup(PyUnicodeObject self,
				2650	int (fixfct)(PyUnicodeObject s))
				2651	{
				2652
				2653	PyUnicodeObject *u;
				2654
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2655	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2656	if (u == NULL)
				2657	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2658
				2659	Py_UNICODE_COPY(u->str, self->str, self->length);
				2660
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2661	if (!fixfct(u)) {
				2662	/* fixfct should return TRUE if it modified the buffer. If
				2663	FALSE, return a reference to the original buffer instead
				2664	(to save space, not time) */
				2665	Py_INCREF(self);
				2666	Py_DECREF(u);
				2667	return (PyObject*) self;
				2668	}
				2669	return (PyObject*) u;
				2670	}
				2671
				2672	static
				2673	int fixupper(PyUnicodeObject *self)
				2674	{
				2675	int len = self->length;
				2676	Py_UNICODE *s = self->str;
				2677	int status = 0;
				2678
				2679	while (len-- > 0) {
				2680	register Py_UNICODE ch;
				2681
				2682	ch = Py_UNICODE_TOUPPER(*s);
				2683	if (ch != *s) {
				2684	status = 1;
				2685	*s = ch;
				2686	}
				2687	s++;
				2688	}
				2689
				2690	return status;
				2691	}
				2692
				2693	static
				2694	int fixlower(PyUnicodeObject *self)
				2695	{
				2696	int len = self->length;
				2697	Py_UNICODE *s = self->str;
				2698	int status = 0;
				2699
				2700	while (len-- > 0) {
				2701	register Py_UNICODE ch;
				2702
				2703	ch = Py_UNICODE_TOLOWER(*s);
				2704	if (ch != *s) {
				2705	status = 1;
				2706	*s = ch;
				2707	}
				2708	s++;
				2709	}
				2710
				2711	return status;
				2712	}
				2713
				2714	static
				2715	int fixswapcase(PyUnicodeObject *self)
				2716	{
				2717	int len = self->length;
				2718	Py_UNICODE *s = self->str;
				2719	int status = 0;
				2720
				2721	while (len-- > 0) {
				2722	if (Py_UNICODE_ISUPPER(*s)) {
				2723	s = Py_UNICODE_TOLOWER(s);
				2724	status = 1;
				2725	} else if (Py_UNICODE_ISLOWER(*s)) {
				2726	s = Py_UNICODE_TOUPPER(s);
				2727	status = 1;
				2728	}
				2729	s++;
				2730	}
				2731
				2732	return status;
				2733	}
				2734
				2735	static
				2736	int fixcapitalize(PyUnicodeObject *self)
				2737	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2738	int len = self->length;
				2739	Py_UNICODE *s = self->str;
				2740	int status = 0;
				2741
				2742	if (len == 0)
				2743	return 0;
				2744	if (Py_UNICODE_ISLOWER(*s)) {
				2745	s = Py_UNICODE_TOUPPER(s);
				2746	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2747	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2748	s++;
				2749	while (--len > 0) {
				2750	if (Py_UNICODE_ISUPPER(*s)) {
				2751	s = Py_UNICODE_TOLOWER(s);
				2752	status = 1;
				2753	}
				2754	s++;
				2755	}
				2756	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2757	}
				2758
				2759	static
				2760	int fixtitle(PyUnicodeObject *self)
				2761	{
				2762	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2763	register Py_UNICODE *e;
				2764	int previous_is_cased;
				2765
				2766	/* Shortcut for single character strings */
				2767	if (PyUnicode_GET_SIZE(self) == 1) {
				2768	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2769	if (*p != ch) {
				2770	*p = ch;
				2771	return 1;
				2772	}
				2773	else
				2774	return 0;
				2775	}
				2776
				2777	e = p + PyUnicode_GET_SIZE(self);
				2778	previous_is_cased = 0;
				2779	for (; p < e; p++) {
				2780	register const Py_UNICODE ch = *p;
				2781
				2782	if (previous_is_cased)
				2783	*p = Py_UNICODE_TOLOWER(ch);
				2784	else
				2785	*p = Py_UNICODE_TOTITLE(ch);
				2786
				2787	if (Py_UNICODE_ISLOWER(ch) \|\|
				2788	Py_UNICODE_ISUPPER(ch) \|\|
				2789	Py_UNICODE_ISTITLE(ch))
				2790	previous_is_cased = 1;
				2791	else
				2792	previous_is_cased = 0;
				2793	}
				2794	return 1;
				2795	}
				2796
				2797	PyObject PyUnicode_Join(PyObject separator,
				2798	PyObject *seq)
				2799	{
				2800	Py_UNICODE *sep;
				2801	int seplen;
				2802	PyUnicodeObject *res = NULL;
				2803	int reslen = 0;
				2804	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2805	int sz = 100;
				2806	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2807	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2808
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2809	it = PyObject_GetIter(seq);
				2810	if (it == NULL)
				2811	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2812
				2813	if (separator == NULL) {
				2814	Py_UNICODE blank = ' ';
				2815	sep = &blank;
				2816	seplen = 1;
				2817	}
				2818	else {
				2819	separator = PyUnicode_FromObject(separator);
				2820	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2821	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2822	sep = PyUnicode_AS_UNICODE(separator);
				2823	seplen = PyUnicode_GET_SIZE(separator);
				2824	}
				2825
				2826	res = _PyUnicode_New(sz);
				2827	if (res == NULL)
				2828	goto onError;
				2829	p = PyUnicode_AS_UNICODE(res);
				2830	reslen = 0;
				2831
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2832	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2833	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2834	PyObject *item = PyIter_Next(it);
				2835	if (item == NULL) {
				2836	if (PyErr_Occurred())
				2837	goto onError;
				2838	break;
				2839	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2840	if (!PyUnicode_Check(item)) {
				2841	PyObject *v;
				2842	v = PyUnicode_FromObject(item);
				2843	Py_DECREF(item);
				2844	item = v;
				2845	if (item == NULL)
				2846	goto onError;
				2847	}
				2848	itemlen = PyUnicode_GET_SIZE(item);
				2849	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2850	if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2851	goto onError;
				2852	sz *= 2;
				2853	p = PyUnicode_AS_UNICODE(res) + reslen;
				2854	}
				2855	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2856	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2857	p += seplen;
				2858	reslen += seplen;
				2859	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2860	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2861	p += itemlen;
				2862	reslen += itemlen;
				2863	Py_DECREF(item);
				2864	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2865	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2866	goto onError;
				2867
				2868	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2869	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2870	return (PyObject *)res;
				2871
				2872	onError:
				2873	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2874	Py_XDECREF(res);
				2875	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2876	return NULL;
				2877	}
				2878
				2879	static
				2880	PyUnicodeObject pad(PyUnicodeObject self,
				2881	int left,
				2882	int right,
				2883	Py_UNICODE fill)
				2884	{
				2885	PyUnicodeObject *u;
				2886
				2887	if (left < 0)
				2888	left = 0;
				2889	if (right < 0)
				2890	right = 0;
				2891
				2892	if (left == 0 && right == 0) {
				2893	Py_INCREF(self);
				2894	return self;
				2895	}
				2896
				2897	u = _PyUnicode_New(left + self->length + right);
				2898	if (u) {
				2899	if (left)
				2900	Py_UNICODE_FILL(u->str, fill, left);
				2901	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2902	if (right)
				2903	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2904	}
				2905
				2906	return u;
				2907	}
				2908
				2909	#define SPLIT_APPEND(data, left, right) \
				2910	str = PyUnicode_FromUnicode(data + left, right - left); \
				2911	if (!str) \
				2912	goto onError; \
				2913	if (PyList_Append(list, str)) { \
				2914	Py_DECREF(str); \
				2915	goto onError; \
				2916	} \
				2917	else \
				2918	Py_DECREF(str);
				2919
				2920	static
				2921	PyObject split_whitespace(PyUnicodeObject self,
				2922	PyObject *list,
				2923	int maxcount)
				2924	{
				2925	register int i;
				2926	register int j;
				2927	int len = self->length;
				2928	PyObject *str;
				2929
				2930	for (i = j = 0; i < len; ) {
				2931	/* find a token */
				2932	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2933	i++;
				2934	j = i;
				2935	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2936	i++;
				2937	if (j < i) {
				2938	if (maxcount-- <= 0)
				2939	break;
				2940	SPLIT_APPEND(self->str, j, i);
				2941	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2942	i++;
				2943	j = i;
				2944	}
				2945	}
				2946	if (j < len) {
				2947	SPLIT_APPEND(self->str, j, len);
				2948	}
				2949	return list;
				2950
				2951	onError:
				2952	Py_DECREF(list);
				2953	return NULL;
				2954	}
				2955
				2956	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2957	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2958	{
				2959	register int i;
				2960	register int j;
				2961	int len;
				2962	PyObject *list;
				2963	PyObject *str;
				2964	Py_UNICODE *data;
				2965
				2966	string = PyUnicode_FromObject(string);
				2967	if (string == NULL)
				2968	return NULL;
				2969	data = PyUnicode_AS_UNICODE(string);
				2970	len = PyUnicode_GET_SIZE(string);
				2971
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2972	list = PyList_New(0);
				2973	if (!list)
				2974	goto onError;
				2975
				2976	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2977	int eol;
				2978
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2979	/* Find a line and append it */
				2980	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2981	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2982
				2983	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2984	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2985	if (i < len) {
				2986	if (data[i] == '\r' && i + 1 < len &&
				2987	data[i+1] == '\n')
				2988	i += 2;
				2989	else
				2990	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2991	if (keepends)
				2992	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2993	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2994	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2995	j = i;
				2996	}
				2997	if (j < len) {
				2998	SPLIT_APPEND(data, j, len);
				2999	}
				3000
				3001	Py_DECREF(string);
				3002	return list;
				3003
				3004	onError:
				3005	Py_DECREF(list);
				3006	Py_DECREF(string);
				3007	return NULL;
				3008	}
				3009
				3010	static
				3011	PyObject split_char(PyUnicodeObject self,
				3012	PyObject *list,
				3013	Py_UNICODE ch,
				3014	int maxcount)
				3015	{
				3016	register int i;
				3017	register int j;
				3018	int len = self->length;
				3019	PyObject *str;
				3020
				3021	for (i = j = 0; i < len; ) {
				3022	if (self->str[i] == ch) {
				3023	if (maxcount-- <= 0)
				3024	break;
				3025	SPLIT_APPEND(self->str, j, i);
				3026	i = j = i + 1;
				3027	} else
				3028	i++;
				3029	}
				3030	if (j <= len) {
				3031	SPLIT_APPEND(self->str, j, len);
				3032	}
				3033	return list;
				3034
				3035	onError:
				3036	Py_DECREF(list);
				3037	return NULL;
				3038	}
				3039
				3040	static
				3041	PyObject split_substring(PyUnicodeObject self,
				3042	PyObject *list,
				3043	PyUnicodeObject *substring,
				3044	int maxcount)
				3045	{
				3046	register int i;
				3047	register int j;
				3048	int len = self->length;
				3049	int sublen = substring->length;
				3050	PyObject *str;
				3051
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3052	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3053	if (Py_UNICODE_MATCH(self, i, substring)) {
				3054	if (maxcount-- <= 0)
				3055	break;
				3056	SPLIT_APPEND(self->str, j, i);
				3057	i = j = i + sublen;
				3058	} else
				3059	i++;
				3060	}
				3061	if (j <= len) {
				3062	SPLIT_APPEND(self->str, j, len);
				3063	}
				3064	return list;
				3065
				3066	onError:
				3067	Py_DECREF(list);
				3068	return NULL;
				3069	}
				3070
				3071	#undef SPLIT_APPEND
				3072
				3073	static
				3074	PyObject split(PyUnicodeObject self,
				3075	PyUnicodeObject *substring,
				3076	int maxcount)
				3077	{
				3078	PyObject *list;
				3079
				3080	if (maxcount < 0)
				3081	maxcount = INT_MAX;
				3082
				3083	list = PyList_New(0);
				3084	if (!list)
				3085	return NULL;
				3086
				3087	if (substring == NULL)
				3088	return split_whitespace(self,list,maxcount);
				3089
				3090	else if (substring->length == 1)
				3091	return split_char(self,list,substring->str[0],maxcount);
				3092
				3093	else if (substring->length == 0) {
				3094	Py_DECREF(list);
				3095	PyErr_SetString(PyExc_ValueError, "empty separator");
				3096	return NULL;
				3097	}
				3098	else
				3099	return split_substring(self,list,substring,maxcount);
				3100	}
				3101
				3102	static
				3103	PyObject strip(PyUnicodeObject self,
				3104	int left,
				3105	int right)
				3106	{
				3107	Py_UNICODE *p = self->str;
				3108	int start = 0;
				3109	int end = self->length;
				3110
				3111	if (left)
				3112	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3113	start++;
				3114
				3115	if (right)
				3116	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3117	end--;
				3118
				3119	if (start == 0 && end == self->length) {
				3120	/* couldn't strip anything off, return original string */
				3121	Py_INCREF(self);
				3122	return (PyObject*) self;
				3123	}
				3124
				3125	return (PyObject*) PyUnicode_FromUnicode(
				3126	self->str + start,
				3127	end - start
				3128	);
				3129	}
				3130
				3131	static
				3132	PyObject replace(PyUnicodeObject self,
				3133	PyUnicodeObject *str1,
				3134	PyUnicodeObject *str2,
				3135	int maxcount)
				3136	{
				3137	PyUnicodeObject *u;
				3138
				3139	if (maxcount < 0)
				3140	maxcount = INT_MAX;
				3141
				3142	if (str1->length == 1 && str2->length == 1) {
				3143	int i;
				3144
				3145	/* replace characters */
				3146	if (!findchar(self->str, self->length, str1->str[0])) {
				3147	/* nothing to replace, return original string */
				3148	Py_INCREF(self);
				3149	u = self;
				3150	} else {
				3151	Py_UNICODE u1 = str1->str[0];
				3152	Py_UNICODE u2 = str2->str[0];
				3153
				3154	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3155	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3156	self->length
				3157	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3158	if (u != NULL) {
				3159	Py_UNICODE_COPY(u->str, self->str,
				3160	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3161	for (i = 0; i < u->length; i++)
				3162	if (u->str[i] == u1) {
				3163	if (--maxcount < 0)
				3164	break;
				3165	u->str[i] = u2;
				3166	}
				3167	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3168	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3169
				3170	} else {
				3171	int n, i;
				3172	Py_UNICODE *p;
				3173
				3174	/* replace strings */
				3175	n = count(self, 0, self->length, str1);
				3176	if (n > maxcount)
				3177	n = maxcount;
				3178	if (n == 0) {
				3179	/* nothing to replace, return original string */
				3180	Py_INCREF(self);
				3181	u = self;
				3182	} else {
				3183	u = _PyUnicode_New(
				3184	self->length + n * (str2->length - str1->length));
				3185	if (u) {
				3186	i = 0;
				3187	p = u->str;
				3188	while (i <= self->length - str1->length)
				3189	if (Py_UNICODE_MATCH(self, i, str1)) {
				3190	/* replace string segment */
				3191	Py_UNICODE_COPY(p, str2->str, str2->length);
				3192	p += str2->length;
				3193	i += str1->length;
				3194	if (--n <= 0) {
				3195	/* copy remaining part */
				3196	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3197	break;
				3198	}
				3199	} else
				3200	*p++ = self->str[i++];
				3201	}
				3202	}
				3203	}
				3204
				3205	return (PyObject *) u;
				3206	}
				3207
				3208	/* --- Unicode Object Methods --------------------------------------------- */
				3209
				3210	static char title__doc__[] =
				3211	"S.title() -> unicode\n\
				3212	\n\
				3213	Return a titlecased version of S, i.e. words start with title case\n\
				3214	characters, all remaining cased characters have lower case.";
				3215
				3216	static PyObject*
				3217	unicode_title(PyUnicodeObject self, PyObject args)
				3218	{
				3219	if (!PyArg_NoArgs(args))
				3220	return NULL;
				3221	return fixup(self, fixtitle);
				3222	}
				3223
				3224	static char capitalize__doc__[] =
				3225	"S.capitalize() -> unicode\n\
				3226	\n\
				3227	Return a capitalized version of S, i.e. make the first character\n\
				3228	have upper case.";
				3229
				3230	static PyObject*
				3231	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3232	{
				3233	if (!PyArg_NoArgs(args))
				3234	return NULL;
				3235	return fixup(self, fixcapitalize);
				3236	}
				3237
				3238	#if 0
				3239	static char capwords__doc__[] =
				3240	"S.capwords() -> unicode\n\
				3241	\n\
				3242	Apply .capitalize() to all words in S and return the result with\n\
				3243	normalized whitespace (all whitespace strings are replaced by ' ').";
				3244
				3245	static PyObject*
				3246	unicode_capwords(PyUnicodeObject self, PyObject args)
				3247	{
				3248	PyObject *list;
				3249	PyObject *item;
				3250	int i;
				3251
				3252	if (!PyArg_NoArgs(args))
				3253	return NULL;
				3254
				3255	/* Split into words */
				3256	list = split(self, NULL, -1);
				3257	if (!list)
				3258	return NULL;
				3259
				3260	/* Capitalize each word */
				3261	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3262	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3263	fixcapitalize);
				3264	if (item == NULL)
				3265	goto onError;
				3266	Py_DECREF(PyList_GET_ITEM(list, i));
				3267	PyList_SET_ITEM(list, i, item);
				3268	}
				3269
				3270	/* Join the words to form a new string */
				3271	item = PyUnicode_Join(NULL, list);
				3272
				3273	onError:
				3274	Py_DECREF(list);
				3275	return (PyObject *)item;
				3276	}
				3277	#endif
				3278
				3279	static char center__doc__[] =
				3280	"S.center(width) -> unicode\n\
				3281	\n\
				3282	Return S centered in a Unicode string of length width. Padding is done\n\
				3283	using spaces.";
				3284
				3285	static PyObject *
				3286	unicode_center(PyUnicodeObject self, PyObject args)
				3287	{
				3288	int marg, left;
				3289	int width;
				3290
				3291	if (!PyArg_ParseTuple(args, "i:center", &width))
				3292	return NULL;
				3293
				3294	if (self->length >= width) {
				3295	Py_INCREF(self);
				3296	return (PyObject*) self;
				3297	}
				3298
				3299	marg = width - self->length;
				3300	left = marg / 2 + (marg & width & 1);
				3301
				3302	return (PyObject*) pad(self, left, marg - left, ' ');
				3303	}
				3304
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3305	#if 0
				3306
				3307	/* This code should go into some future Unicode collation support
				3308	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3309	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3310
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3311	/* speedy UTF-16 code point order comparison */
				3312	/* gleaned from: */
				3313	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3314
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3315	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3316	{
				3317	0, 0, 0, 0, 0, 0, 0, 0,
				3318	0, 0, 0, 0, 0, 0, 0, 0,
				3319	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3320	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3321	};
				3322
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3323	static int
				3324	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3325	{
				3326	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3327
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3328	Py_UNICODE *s1 = str1->str;
				3329	Py_UNICODE *s2 = str2->str;
				3330
				3331	len1 = str1->length;
				3332	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3333
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3334	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3335	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3336
				3337	c1 = *s1++;
				3338	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3339
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3340	if (c1 > (1<<11) * 26)
				3341	c1 += utf16Fixup[c1>>11];
				3342	if (c2 > (1<<11) * 26)
				3343	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3344	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3345
				3346	if (c1 != c2)
				3347	return (c1 < c2) ? -1 : 1;
				3348
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3349	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3350	}
				3351
				3352	return (len1 < len2) ? -1 : (len1 != len2);
				3353	}
				3354
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3355	#else
				3356
				3357	static int
				3358	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3359	{
				3360	register int len1, len2;
				3361
				3362	Py_UNICODE *s1 = str1->str;
				3363	Py_UNICODE *s2 = str2->str;
				3364
				3365	len1 = str1->length;
				3366	len2 = str2->length;
				3367
				3368	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3369	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3370
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3371	c1 = *s1++;
				3372	c2 = *s2++;
				3373
				3374	if (c1 != c2)
				3375	return (c1 < c2) ? -1 : 1;
				3376
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3377	len1--; len2--;
				3378	}
				3379
				3380	return (len1 < len2) ? -1 : (len1 != len2);
				3381	}
				3382
				3383	#endif
				3384
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3385	int PyUnicode_Compare(PyObject *left,
				3386	PyObject *right)
				3387	{
				3388	PyUnicodeObject u = NULL, v = NULL;
				3389	int result;
				3390
				3391	/* Coerce the two arguments */
				3392	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3393	if (u == NULL)
				3394	goto onError;
				3395	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3396	if (v == NULL)
				3397	goto onError;
				3398
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3399	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3400	if (v == u) {
				3401	Py_DECREF(u);
				3402	Py_DECREF(v);
				3403	return 0;
				3404	}
				3405
				3406	result = unicode_compare(u, v);
				3407
				3408	Py_DECREF(u);
				3409	Py_DECREF(v);
				3410	return result;
				3411
				3412	onError:
				3413	Py_XDECREF(u);
				3414	Py_XDECREF(v);
				3415	return -1;
				3416	}
				3417
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3418	int PyUnicode_Contains(PyObject *container,
				3419	PyObject *element)
				3420	{
				3421	PyUnicodeObject u = NULL, v = NULL;
				3422	int result;
				3423	register const Py_UNICODE p, e;
				3424	register Py_UNICODE ch;
				3425
				3426	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3427	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3428	if (v == NULL) {
				3429	PyErr_SetString(PyExc_TypeError,
				3430	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3431	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3432	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3433	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3434	if (u == NULL) {
				3435	Py_DECREF(v);
				3436	goto onError;
				3437	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3438
				3439	/* Check v in u */
				3440	if (PyUnicode_GET_SIZE(v) != 1) {
				3441	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3442	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3443	goto onError;
				3444	}
				3445	ch = *PyUnicode_AS_UNICODE(v);
				3446	p = PyUnicode_AS_UNICODE(u);
				3447	e = p + PyUnicode_GET_SIZE(u);
				3448	result = 0;
				3449	while (p < e) {
				3450	if (*p++ == ch) {
				3451	result = 1;
				3452	break;
				3453	}
				3454	}
				3455
				3456	Py_DECREF(u);
				3457	Py_DECREF(v);
				3458	return result;
				3459
				3460	onError:
				3461	Py_XDECREF(u);
				3462	Py_XDECREF(v);
				3463	return -1;
				3464	}
				3465
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3466	/* Concat to string or Unicode object giving a new Unicode object. */
				3467
				3468	PyObject PyUnicode_Concat(PyObject left,
				3469	PyObject *right)
				3470	{
				3471	PyUnicodeObject u = NULL, v = NULL, *w;
				3472
				3473	/* Coerce the two arguments */
				3474	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3475	if (u == NULL)
				3476	goto onError;
				3477	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3478	if (v == NULL)
				3479	goto onError;
				3480
				3481	/* Shortcuts */
				3482	if (v == unicode_empty) {
				3483	Py_DECREF(v);
				3484	return (PyObject *)u;
				3485	}
				3486	if (u == unicode_empty) {
				3487	Py_DECREF(u);
				3488	return (PyObject *)v;
				3489	}
				3490
				3491	/* Concat the two Unicode strings */
				3492	w = _PyUnicode_New(u->length + v->length);
				3493	if (w == NULL)
				3494	goto onError;
				3495	Py_UNICODE_COPY(w->str, u->str, u->length);
				3496	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3497
				3498	Py_DECREF(u);
				3499	Py_DECREF(v);
				3500	return (PyObject *)w;
				3501
				3502	onError:
				3503	Py_XDECREF(u);
				3504	Py_XDECREF(v);
				3505	return NULL;
				3506	}
				3507
				3508	static char count__doc__[] =
				3509	"S.count(sub[, start[, end]]) -> int\n\
				3510	\n\
				3511	Return the number of occurrences of substring sub in Unicode string\n\
				3512	S[start:end]. Optional arguments start and end are\n\
				3513	interpreted as in slice notation.";
				3514
				3515	static PyObject *
				3516	unicode_count(PyUnicodeObject self, PyObject args)
				3517	{
				3518	PyUnicodeObject *substring;
				3519	int start = 0;
				3520	int end = INT_MAX;
				3521	PyObject *result;
				3522
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3523	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3524	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3525	return NULL;
				3526
				3527	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3528	(PyObject *)substring);
				3529	if (substring == NULL)
				3530	return NULL;
				3531
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3532	if (start < 0)
				3533	start += self->length;
				3534	if (start < 0)
				3535	start = 0;
				3536	if (end > self->length)
				3537	end = self->length;
				3538	if (end < 0)
				3539	end += self->length;
				3540	if (end < 0)
				3541	end = 0;
				3542
				3543	result = PyInt_FromLong((long) count(self, start, end, substring));
				3544
				3545	Py_DECREF(substring);
				3546	return result;
				3547	}
				3548
				3549	static char encode__doc__[] =
				3550	"S.encode([encoding[,errors]]) -> string\n\
				3551	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3552	Return an encoded string version of S. Default encoding is the current\n\
				3553	default string encoding. errors may be given to set a different error\n\
				3554	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3555	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3556
				3557	static PyObject *
				3558	unicode_encode(PyUnicodeObject self, PyObject args)
				3559	{
				3560	char *encoding = NULL;
				3561	char *errors = NULL;
				3562	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3563	return NULL;
				3564	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3565	}
				3566
				3567	static char expandtabs__doc__[] =
				3568	"S.expandtabs([tabsize]) -> unicode\n\
				3569	\n\
				3570	Return a copy of S where all tab characters are expanded using spaces.\n\
				3571	If tabsize is not given, a tab size of 8 characters is assumed.";
				3572
				3573	static PyObject*
				3574	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3575	{
				3576	Py_UNICODE *e;
				3577	Py_UNICODE *p;
				3578	Py_UNICODE *q;
				3579	int i, j;
				3580	PyUnicodeObject *u;
				3581	int tabsize = 8;
				3582
				3583	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3584	return NULL;
				3585
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3586	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3587	i = j = 0;
				3588	e = self->str + self->length;
				3589	for (p = self->str; p < e; p++)
				3590	if (*p == '\t') {
				3591	if (tabsize > 0)
				3592	j += tabsize - (j % tabsize);
				3593	}
				3594	else {
				3595	j++;
				3596	if (p == '\n' \|\| p == '\r') {
				3597	i += j;
				3598	j = 0;
				3599	}
				3600	}
				3601
				3602	/* Second pass: create output string and fill it */
				3603	u = _PyUnicode_New(i + j);
				3604	if (!u)
				3605	return NULL;
				3606
				3607	j = 0;
				3608	q = u->str;
				3609
				3610	for (p = self->str; p < e; p++)
				3611	if (*p == '\t') {
				3612	if (tabsize > 0) {
				3613	i = tabsize - (j % tabsize);
				3614	j += i;
				3615	while (i--)
				3616	*q++ = ' ';
				3617	}
				3618	}
				3619	else {
				3620	j++;
				3621	q++ = p;
				3622	if (p == '\n' \|\| p == '\r')
				3623	j = 0;
				3624	}
				3625
				3626	return (PyObject*) u;
				3627	}
				3628
				3629	static char find__doc__[] =
				3630	"S.find(sub [,start [,end]]) -> int\n\
				3631	\n\
				3632	Return the lowest index in S where substring sub is found,\n\
				3633	such that sub is contained within s[start,end]. Optional\n\
				3634	arguments start and end are interpreted as in slice notation.\n\
				3635	\n\
				3636	Return -1 on failure.";
				3637
				3638	static PyObject *
				3639	unicode_find(PyUnicodeObject self, PyObject args)
				3640	{
				3641	PyUnicodeObject *substring;
				3642	int start = 0;
				3643	int end = INT_MAX;
				3644	PyObject *result;
				3645
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3646	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3647	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3648	return NULL;
				3649	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3650	(PyObject *)substring);
				3651	if (substring == NULL)
				3652	return NULL;
				3653
				3654	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3655
				3656	Py_DECREF(substring);
				3657	return result;
				3658	}
				3659
				3660	static PyObject *
				3661	unicode_getitem(PyUnicodeObject *self, int index)
				3662	{
				3663	if (index < 0 \|\| index >= self->length) {
				3664	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3665	return NULL;
				3666	}
				3667
				3668	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3669	}
				3670
				3671	static long
				3672	unicode_hash(PyUnicodeObject *self)
				3673	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3674	/* Since Unicode objects compare equal to their ASCII string
				3675	counterparts, they should use the individual character values
				3676	as basis for their hash value. This is needed to assure that
				3677	strings and Unicode objects behave in the same way as
				3678	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3679
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3680	register int len;
				3681	register Py_UNICODE *p;
				3682	register long x;
				3683
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3684	if (self->hash != -1)
				3685	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3686	len = PyUnicode_GET_SIZE(self);
				3687	p = PyUnicode_AS_UNICODE(self);
				3688	x = *p << 7;
				3689	while (--len >= 0)
				3690	x = (1000003x) ^ p++;
				3691	x ^= PyUnicode_GET_SIZE(self);
				3692	if (x == -1)
				3693	x = -2;
				3694	self->hash = x;
				3695	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3696	}
				3697
				3698	static char index__doc__[] =
				3699	"S.index(sub [,start [,end]]) -> int\n\
				3700	\n\
				3701	Like S.find() but raise ValueError when the substring is not found.";
				3702
				3703	static PyObject *
				3704	unicode_index(PyUnicodeObject self, PyObject args)
				3705	{
				3706	int result;
				3707	PyUnicodeObject *substring;
				3708	int start = 0;
				3709	int end = INT_MAX;
				3710
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3711	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3712	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3713	return NULL;
				3714
				3715	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3716	(PyObject *)substring);
				3717	if (substring == NULL)
				3718	return NULL;
				3719
				3720	result = findstring(self, substring, start, end, 1);
				3721
				3722	Py_DECREF(substring);
				3723	if (result < 0) {
				3724	PyErr_SetString(PyExc_ValueError, "substring not found");
				3725	return NULL;
				3726	}
				3727	return PyInt_FromLong(result);
				3728	}
				3729
				3730	static char islower__doc__[] =
				3731	"S.islower() -> int\n\
				3732	\n\
				3733	Return 1 if all cased characters in S are lowercase and there is\n\
				3734	at least one cased character in S, 0 otherwise.";
				3735
				3736	static PyObject*
				3737	unicode_islower(PyUnicodeObject self, PyObject args)
				3738	{
				3739	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3740	register const Py_UNICODE *e;
				3741	int cased;
				3742
				3743	if (!PyArg_NoArgs(args))
				3744	return NULL;
				3745
				3746	/* Shortcut for single character strings */
				3747	if (PyUnicode_GET_SIZE(self) == 1)
				3748	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3749
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3750	/* Special case for empty strings */
				3751	if (PyString_GET_SIZE(self) == 0)
				3752	return PyInt_FromLong(0);
				3753
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3754	e = p + PyUnicode_GET_SIZE(self);
				3755	cased = 0;
				3756	for (; p < e; p++) {
				3757	register const Py_UNICODE ch = *p;
				3758
				3759	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3760	return PyInt_FromLong(0);
				3761	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3762	cased = 1;
				3763	}
				3764	return PyInt_FromLong(cased);
				3765	}
				3766
				3767	static char isupper__doc__[] =
				3768	"S.isupper() -> int\n\
				3769	\n\
				3770	Return 1 if all cased characters in S are uppercase and there is\n\
				3771	at least one cased character in S, 0 otherwise.";
				3772
				3773	static PyObject*
				3774	unicode_isupper(PyUnicodeObject self, PyObject args)
				3775	{
				3776	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3777	register const Py_UNICODE *e;
				3778	int cased;
				3779
				3780	if (!PyArg_NoArgs(args))
				3781	return NULL;
				3782
				3783	/* Shortcut for single character strings */
				3784	if (PyUnicode_GET_SIZE(self) == 1)
				3785	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3786
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3787	/* Special case for empty strings */
				3788	if (PyString_GET_SIZE(self) == 0)
				3789	return PyInt_FromLong(0);
				3790
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3791	e = p + PyUnicode_GET_SIZE(self);
				3792	cased = 0;
				3793	for (; p < e; p++) {
				3794	register const Py_UNICODE ch = *p;
				3795
				3796	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3797	return PyInt_FromLong(0);
				3798	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3799	cased = 1;
				3800	}
				3801	return PyInt_FromLong(cased);
				3802	}
				3803
				3804	static char istitle__doc__[] =
				3805	"S.istitle() -> int\n\
				3806	\n\
				3807	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3808	may only follow uncased characters and lowercase characters only cased\n\
				3809	ones. Return 0 otherwise.";
				3810
				3811	static PyObject*
				3812	unicode_istitle(PyUnicodeObject self, PyObject args)
				3813	{
				3814	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3815	register const Py_UNICODE *e;
				3816	int cased, previous_is_cased;
				3817
				3818	if (!PyArg_NoArgs(args))
				3819	return NULL;
				3820
				3821	/* Shortcut for single character strings */
				3822	if (PyUnicode_GET_SIZE(self) == 1)
				3823	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3824	(Py_UNICODE_ISUPPER(*p) != 0));
				3825
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3826	/* Special case for empty strings */
				3827	if (PyString_GET_SIZE(self) == 0)
				3828	return PyInt_FromLong(0);
				3829
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3830	e = p + PyUnicode_GET_SIZE(self);
				3831	cased = 0;
				3832	previous_is_cased = 0;
				3833	for (; p < e; p++) {
				3834	register const Py_UNICODE ch = *p;
				3835
				3836	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3837	if (previous_is_cased)
				3838	return PyInt_FromLong(0);
				3839	previous_is_cased = 1;
				3840	cased = 1;
				3841	}
				3842	else if (Py_UNICODE_ISLOWER(ch)) {
				3843	if (!previous_is_cased)
				3844	return PyInt_FromLong(0);
				3845	previous_is_cased = 1;
				3846	cased = 1;
				3847	}
				3848	else
				3849	previous_is_cased = 0;
				3850	}
				3851	return PyInt_FromLong(cased);
				3852	}
				3853
				3854	static char isspace__doc__[] =
				3855	"S.isspace() -> int\n\
				3856	\n\
				3857	Return 1 if there are only whitespace characters in S,\n\
				3858	0 otherwise.";
				3859
				3860	static PyObject*
				3861	unicode_isspace(PyUnicodeObject self, PyObject args)
				3862	{
				3863	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3864	register const Py_UNICODE *e;
				3865
				3866	if (!PyArg_NoArgs(args))
				3867	return NULL;
				3868
				3869	/* Shortcut for single character strings */
				3870	if (PyUnicode_GET_SIZE(self) == 1 &&
				3871	Py_UNICODE_ISSPACE(*p))
				3872	return PyInt_FromLong(1);
				3873
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3874	/* Special case for empty strings */
				3875	if (PyString_GET_SIZE(self) == 0)
				3876	return PyInt_FromLong(0);
				3877
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3878	e = p + PyUnicode_GET_SIZE(self);
				3879	for (; p < e; p++) {
				3880	if (!Py_UNICODE_ISSPACE(*p))
				3881	return PyInt_FromLong(0);
				3882	}
				3883	return PyInt_FromLong(1);
				3884	}
				3885
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3886	static char isalpha__doc__[] =
				3887	"S.isalpha() -> int\n\
				3888	\n\
				3889	Return 1 if all characters in S are alphabetic\n\
				3890	and there is at least one character in S, 0 otherwise.";
				3891
				3892	static PyObject*
				3893	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3894	{
				3895	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3896	register const Py_UNICODE *e;
				3897
				3898	if (!PyArg_NoArgs(args))
				3899	return NULL;
				3900
				3901	/* Shortcut for single character strings */
				3902	if (PyUnicode_GET_SIZE(self) == 1 &&
				3903	Py_UNICODE_ISALPHA(*p))
				3904	return PyInt_FromLong(1);
				3905
				3906	/* Special case for empty strings */
				3907	if (PyString_GET_SIZE(self) == 0)
				3908	return PyInt_FromLong(0);
				3909
				3910	e = p + PyUnicode_GET_SIZE(self);
				3911	for (; p < e; p++) {
				3912	if (!Py_UNICODE_ISALPHA(*p))
				3913	return PyInt_FromLong(0);
				3914	}
				3915	return PyInt_FromLong(1);
				3916	}
				3917
				3918	static char isalnum__doc__[] =
				3919	"S.isalnum() -> int\n\
				3920	\n\
				3921	Return 1 if all characters in S are alphanumeric\n\
				3922	and there is at least one character in S, 0 otherwise.";
				3923
				3924	static PyObject*
				3925	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3926	{
				3927	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3928	register const Py_UNICODE *e;
				3929
				3930	if (!PyArg_NoArgs(args))
				3931	return NULL;
				3932
				3933	/* Shortcut for single character strings */
				3934	if (PyUnicode_GET_SIZE(self) == 1 &&
				3935	Py_UNICODE_ISALNUM(*p))
				3936	return PyInt_FromLong(1);
				3937
				3938	/* Special case for empty strings */
				3939	if (PyString_GET_SIZE(self) == 0)
				3940	return PyInt_FromLong(0);
				3941
				3942	e = p + PyUnicode_GET_SIZE(self);
				3943	for (; p < e; p++) {
				3944	if (!Py_UNICODE_ISALNUM(*p))
				3945	return PyInt_FromLong(0);
				3946	}
				3947	return PyInt_FromLong(1);
				3948	}
				3949
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3950	static char isdecimal__doc__[] =
				3951	"S.isdecimal() -> int\n\
				3952	\n\
				3953	Return 1 if there are only decimal characters in S,\n\
				3954	0 otherwise.";
				3955
				3956	static PyObject*
				3957	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3958	{
				3959	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3960	register const Py_UNICODE *e;
				3961
				3962	if (!PyArg_NoArgs(args))
				3963	return NULL;
				3964
				3965	/* Shortcut for single character strings */
				3966	if (PyUnicode_GET_SIZE(self) == 1 &&
				3967	Py_UNICODE_ISDECIMAL(*p))
				3968	return PyInt_FromLong(1);
				3969
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3970	/* Special case for empty strings */
				3971	if (PyString_GET_SIZE(self) == 0)
				3972	return PyInt_FromLong(0);
				3973
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3974	e = p + PyUnicode_GET_SIZE(self);
				3975	for (; p < e; p++) {
				3976	if (!Py_UNICODE_ISDECIMAL(*p))
				3977	return PyInt_FromLong(0);
				3978	}
				3979	return PyInt_FromLong(1);
				3980	}
				3981
				3982	static char isdigit__doc__[] =
				3983	"S.isdigit() -> int\n\
				3984	\n\
				3985	Return 1 if there are only digit characters in S,\n\
				3986	0 otherwise.";
				3987
				3988	static PyObject*
				3989	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3990	{
				3991	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3992	register const Py_UNICODE *e;
				3993
				3994	if (!PyArg_NoArgs(args))
				3995	return NULL;
				3996
				3997	/* Shortcut for single character strings */
				3998	if (PyUnicode_GET_SIZE(self) == 1 &&
				3999	Py_UNICODE_ISDIGIT(*p))
				4000	return PyInt_FromLong(1);
				4001
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4002	/* Special case for empty strings */
				4003	if (PyString_GET_SIZE(self) == 0)
				4004	return PyInt_FromLong(0);
				4005
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4006	e = p + PyUnicode_GET_SIZE(self);
				4007	for (; p < e; p++) {
				4008	if (!Py_UNICODE_ISDIGIT(*p))
				4009	return PyInt_FromLong(0);
				4010	}
				4011	return PyInt_FromLong(1);
				4012	}
				4013
				4014	static char isnumeric__doc__[] =
				4015	"S.isnumeric() -> int\n\
				4016	\n\
				4017	Return 1 if there are only numeric characters in S,\n\
				4018	0 otherwise.";
				4019
				4020	static PyObject*
				4021	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				4022	{
				4023	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4024	register const Py_UNICODE *e;
				4025
				4026	if (!PyArg_NoArgs(args))
				4027	return NULL;
				4028
				4029	/* Shortcut for single character strings */
				4030	if (PyUnicode_GET_SIZE(self) == 1 &&
				4031	Py_UNICODE_ISNUMERIC(*p))
				4032	return PyInt_FromLong(1);
				4033
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4034	/* Special case for empty strings */
				4035	if (PyString_GET_SIZE(self) == 0)
				4036	return PyInt_FromLong(0);
				4037
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4038	e = p + PyUnicode_GET_SIZE(self);
				4039	for (; p < e; p++) {
				4040	if (!Py_UNICODE_ISNUMERIC(*p))
				4041	return PyInt_FromLong(0);
				4042	}
				4043	return PyInt_FromLong(1);
				4044	}
				4045
				4046	static char join__doc__[] =
				4047	"S.join(sequence) -> unicode\n\
				4048	\n\
				4049	Return a string which is the concatenation of the strings in the\n\
				4050	sequence. The separator between elements is S.";
				4051
				4052	static PyObject*
				4053	unicode_join(PyUnicodeObject self, PyObject args)
				4054	{
				4055	PyObject *data;
				4056	if (!PyArg_ParseTuple(args, "O:join", &data))
				4057	return NULL;
				4058
				4059	return PyUnicode_Join((PyObject *)self, data);
				4060	}
				4061
				4062	static int
				4063	unicode_length(PyUnicodeObject *self)
				4064	{
				4065	return self->length;
				4066	}
				4067
				4068	static char ljust__doc__[] =
				4069	"S.ljust(width) -> unicode\n\
				4070	\n\
				4071	Return S left justified in a Unicode string of length width. Padding is\n\
				4072	done using spaces.";
				4073
				4074	static PyObject *
				4075	unicode_ljust(PyUnicodeObject self, PyObject args)
				4076	{
				4077	int width;
				4078	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4079	return NULL;
				4080
				4081	if (self->length >= width) {
				4082	Py_INCREF(self);
				4083	return (PyObject*) self;
				4084	}
				4085
				4086	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4087	}
				4088
				4089	static char lower__doc__[] =
				4090	"S.lower() -> unicode\n\
				4091	\n\
				4092	Return a copy of the string S converted to lowercase.";
				4093
				4094	static PyObject*
				4095	unicode_lower(PyUnicodeObject self, PyObject args)
				4096	{
				4097	if (!PyArg_NoArgs(args))
				4098	return NULL;
				4099	return fixup(self, fixlower);
				4100	}
				4101
				4102	static char lstrip__doc__[] =
				4103	"S.lstrip() -> unicode\n\
				4104	\n\
				4105	Return a copy of the string S with leading whitespace removed.";
				4106
				4107	static PyObject *
				4108	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4109	{
				4110	if (!PyArg_NoArgs(args))
				4111	return NULL;
				4112	return strip(self, 1, 0);
				4113	}
				4114
				4115	static PyObject*
				4116	unicode_repeat(PyUnicodeObject *str, int len)
				4117	{
				4118	PyUnicodeObject *u;
				4119	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4120	int nchars;
				4121	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4122
				4123	if (len < 0)
				4124	len = 0;
				4125
				4126	if (len == 1) {
				4127	/* no repeat, return original string */
				4128	Py_INCREF(str);
				4129	return (PyObject*) str;
				4130	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4131
				4132	/* ensure # of chars needed doesn't overflow int and # of bytes
				4133	* needed doesn't overflow size_t
				4134	*/
				4135	nchars = len * str->length;
				4136	if (len && nchars / len != str->length) {
				4137	PyErr_SetString(PyExc_OverflowError,
				4138	"repeated string is too long");
				4139	return NULL;
				4140	}
				4141	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4142	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4143	PyErr_SetString(PyExc_OverflowError,
				4144	"repeated string is too long");
				4145	return NULL;
				4146	}
				4147	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4148	if (!u)
				4149	return NULL;
				4150
				4151	p = u->str;
				4152
				4153	while (len-- > 0) {
				4154	Py_UNICODE_COPY(p, str->str, str->length);
				4155	p += str->length;
				4156	}
				4157
				4158	return (PyObject*) u;
				4159	}
				4160
				4161	PyObject PyUnicode_Replace(PyObject obj,
				4162	PyObject *subobj,
				4163	PyObject *replobj,
				4164	int maxcount)
				4165	{
				4166	PyObject *self;
				4167	PyObject *str1;
				4168	PyObject *str2;
				4169	PyObject *result;
				4170
				4171	self = PyUnicode_FromObject(obj);
				4172	if (self == NULL)
				4173	return NULL;
				4174	str1 = PyUnicode_FromObject(subobj);
				4175	if (str1 == NULL) {
				4176	Py_DECREF(self);
				4177	return NULL;
				4178	}
				4179	str2 = PyUnicode_FromObject(replobj);
				4180	if (str2 == NULL) {
				4181	Py_DECREF(self);
				4182	Py_DECREF(str1);
				4183	return NULL;
				4184	}
				4185	result = replace((PyUnicodeObject *)self,
				4186	(PyUnicodeObject *)str1,
				4187	(PyUnicodeObject *)str2,
				4188	maxcount);
				4189	Py_DECREF(self);
				4190	Py_DECREF(str1);
				4191	Py_DECREF(str2);
				4192	return result;
				4193	}
				4194
				4195	static char replace__doc__[] =
				4196	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4197	\n\
				4198	Return a copy of S with all occurrences of substring\n\
				4199	old replaced by new. If the optional argument maxsplit is\n\
				4200	given, only the first maxsplit occurrences are replaced.";
				4201
				4202	static PyObject*
				4203	unicode_replace(PyUnicodeObject self, PyObject args)
				4204	{
				4205	PyUnicodeObject *str1;
				4206	PyUnicodeObject *str2;
				4207	int maxcount = -1;
				4208	PyObject *result;
				4209
				4210	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4211	return NULL;
				4212	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4213	if (str1 == NULL)
				4214	return NULL;
				4215	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4216	if (str2 == NULL)
				4217	return NULL;
				4218
				4219	result = replace(self, str1, str2, maxcount);
				4220
				4221	Py_DECREF(str1);
				4222	Py_DECREF(str2);
				4223	return result;
				4224	}
				4225
				4226	static
				4227	PyObject unicode_repr(PyObject unicode)
				4228	{
				4229	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4230	PyUnicode_GET_SIZE(unicode),
				4231	1);
				4232	}
				4233
				4234	static char rfind__doc__[] =
				4235	"S.rfind(sub [,start [,end]]) -> int\n\
				4236	\n\
				4237	Return the highest index in S where substring sub is found,\n\
				4238	such that sub is contained within s[start,end]. Optional\n\
				4239	arguments start and end are interpreted as in slice notation.\n\
				4240	\n\
				4241	Return -1 on failure.";
				4242
				4243	static PyObject *
				4244	unicode_rfind(PyUnicodeObject self, PyObject args)
				4245	{
				4246	PyUnicodeObject *substring;
				4247	int start = 0;
				4248	int end = INT_MAX;
				4249	PyObject *result;
				4250
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4251	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4252	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4253	return NULL;
				4254	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4255	(PyObject *)substring);
				4256	if (substring == NULL)
				4257	return NULL;
				4258
				4259	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4260
				4261	Py_DECREF(substring);
				4262	return result;
				4263	}
				4264
				4265	static char rindex__doc__[] =
				4266	"S.rindex(sub [,start [,end]]) -> int\n\
				4267	\n\
				4268	Like S.rfind() but raise ValueError when the substring is not found.";
				4269
				4270	static PyObject *
				4271	unicode_rindex(PyUnicodeObject self, PyObject args)
				4272	{
				4273	int result;
				4274	PyUnicodeObject *substring;
				4275	int start = 0;
				4276	int end = INT_MAX;
				4277
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4278	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4279	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4280	return NULL;
				4281	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4282	(PyObject *)substring);
				4283	if (substring == NULL)
				4284	return NULL;
				4285
				4286	result = findstring(self, substring, start, end, -1);
				4287
				4288	Py_DECREF(substring);
				4289	if (result < 0) {
				4290	PyErr_SetString(PyExc_ValueError, "substring not found");
				4291	return NULL;
				4292	}
				4293	return PyInt_FromLong(result);
				4294	}
				4295
				4296	static char rjust__doc__[] =
				4297	"S.rjust(width) -> unicode\n\
				4298	\n\
				4299	Return S right justified in a Unicode string of length width. Padding is\n\
				4300	done using spaces.";
				4301
				4302	static PyObject *
				4303	unicode_rjust(PyUnicodeObject self, PyObject args)
				4304	{
				4305	int width;
				4306	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4307	return NULL;
				4308
				4309	if (self->length >= width) {
				4310	Py_INCREF(self);
				4311	return (PyObject*) self;
				4312	}
				4313
				4314	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4315	}
				4316
				4317	static char rstrip__doc__[] =
				4318	"S.rstrip() -> unicode\n\
				4319	\n\
				4320	Return a copy of the string S with trailing whitespace removed.";
				4321
				4322	static PyObject *
				4323	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4324	{
				4325	if (!PyArg_NoArgs(args))
				4326	return NULL;
				4327	return strip(self, 0, 1);
				4328	}
				4329
				4330	static PyObject*
				4331	unicode_slice(PyUnicodeObject *self, int start, int end)
				4332	{
				4333	/* standard clamping */
				4334	if (start < 0)
				4335	start = 0;
				4336	if (end < 0)
				4337	end = 0;
				4338	if (end > self->length)
				4339	end = self->length;
				4340	if (start == 0 && end == self->length) {
				4341	/* full slice, return original string */
				4342	Py_INCREF(self);
				4343	return (PyObject*) self;
				4344	}
				4345	if (start > end)
				4346	start = end;
				4347	/* copy slice */
				4348	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4349	end - start);
				4350	}
				4351
				4352	PyObject PyUnicode_Split(PyObject s,
				4353	PyObject *sep,
				4354	int maxsplit)
				4355	{
				4356	PyObject *result;
				4357
				4358	s = PyUnicode_FromObject(s);
				4359	if (s == NULL)
				4360	return NULL;
				4361	if (sep != NULL) {
				4362	sep = PyUnicode_FromObject(sep);
				4363	if (sep == NULL) {
				4364	Py_DECREF(s);
				4365	return NULL;
				4366	}
				4367	}
				4368
				4369	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4370
				4371	Py_DECREF(s);
				4372	Py_XDECREF(sep);
				4373	return result;
				4374	}
				4375
				4376	static char split__doc__[] =
				4377	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4378	\n\
				4379	Return a list of the words in S, using sep as the\n\
				4380	delimiter string. If maxsplit is given, at most maxsplit\n\
				4381	splits are done. If sep is not specified, any whitespace string\n\
				4382	is a separator.";
				4383
				4384	static PyObject*
				4385	unicode_split(PyUnicodeObject self, PyObject args)
				4386	{
				4387	PyObject *substring = Py_None;
				4388	int maxcount = -1;
				4389
				4390	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4391	return NULL;
				4392
				4393	if (substring == Py_None)
				4394	return split(self, NULL, maxcount);
				4395	else if (PyUnicode_Check(substring))
				4396	return split(self, (PyUnicodeObject *)substring, maxcount);
				4397	else
				4398	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4399	}
				4400
				4401	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4402	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4403	\n\
				4404	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4405	Line breaks are not included in the resulting list unless keepends\n\
				4406	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4407
				4408	static PyObject*
				4409	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4410	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4411	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4412
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4413	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4414	return NULL;
				4415
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4416	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4417	}
				4418
				4419	static
				4420	PyObject unicode_str(PyUnicodeObject self)
				4421	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4422	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4423	}
				4424
				4425	static char strip__doc__[] =
				4426	"S.strip() -> unicode\n\
				4427	\n\
				4428	Return a copy of S with leading and trailing whitespace removed.";
				4429
				4430	static PyObject *
				4431	unicode_strip(PyUnicodeObject self, PyObject args)
				4432	{
				4433	if (!PyArg_NoArgs(args))
				4434	return NULL;
				4435	return strip(self, 1, 1);
				4436	}
				4437
				4438	static char swapcase__doc__[] =
				4439	"S.swapcase() -> unicode\n\
				4440	\n\
				4441	Return a copy of S with uppercase characters converted to lowercase\n\
				4442	and vice versa.";
				4443
				4444	static PyObject*
				4445	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4446	{
				4447	if (!PyArg_NoArgs(args))
				4448	return NULL;
				4449	return fixup(self, fixswapcase);
				4450	}
				4451
				4452	static char translate__doc__[] =
				4453	"S.translate(table) -> unicode\n\
				4454	\n\
				4455	Return a copy of the string S, where all characters have been mapped\n\
				4456	through the given translation table, which must be a mapping of\n\
				4457	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4458	are left untouched. Characters mapped to None are deleted.";
				4459
				4460	static PyObject*
				4461	unicode_translate(PyUnicodeObject self, PyObject args)
				4462	{
				4463	PyObject *table;
				4464
				4465	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4466	return NULL;
				4467	return PyUnicode_TranslateCharmap(self->str,
				4468	self->length,
				4469	table,
				4470	"ignore");
				4471	}
				4472
				4473	static char upper__doc__[] =
				4474	"S.upper() -> unicode\n\
				4475	\n\
				4476	Return a copy of S converted to uppercase.";
				4477
				4478	static PyObject*
				4479	unicode_upper(PyUnicodeObject self, PyObject args)
				4480	{
				4481	if (!PyArg_NoArgs(args))
				4482	return NULL;
				4483	return fixup(self, fixupper);
				4484	}
				4485
				4486	#if 0
				4487	static char zfill__doc__[] =
				4488	"S.zfill(width) -> unicode\n\
				4489	\n\
				4490	Pad a numeric string x with zeros on the left, to fill a field\n\
				4491	of the specified width. The string x is never truncated.";
				4492
				4493	static PyObject *
				4494	unicode_zfill(PyUnicodeObject self, PyObject args)
				4495	{
				4496	int fill;
				4497	PyUnicodeObject *u;
				4498
				4499	int width;
				4500	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4501	return NULL;
				4502
				4503	if (self->length >= width) {
				4504	Py_INCREF(self);
				4505	return (PyObject*) self;
				4506	}
				4507
				4508	fill = width - self->length;
				4509
				4510	u = pad(self, fill, 0, '0');
				4511
				4512	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4513	/* move sign to beginning of string */
				4514	u->str[0] = u->str[fill];
				4515	u->str[fill] = '0';
				4516	}
				4517
				4518	return (PyObject*) u;
				4519	}
				4520	#endif
				4521
				4522	#if 0
				4523	static PyObject*
				4524	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4525	{
				4526	if (!PyArg_NoArgs(args))
				4527	return NULL;
				4528	return PyInt_FromLong(unicode_freelist_size);
				4529	}
				4530	#endif
				4531
				4532	static char startswith__doc__[] =
				4533	"S.startswith(prefix[, start[, end]]) -> int\n\
				4534	\n\
				4535	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4536	optional start, test S beginning at that position. With optional end, stop\n\
				4537	comparing S at that position.";
				4538
				4539	static PyObject *
				4540	unicode_startswith(PyUnicodeObject *self,
				4541	PyObject *args)
				4542	{
				4543	PyUnicodeObject *substring;
				4544	int start = 0;
				4545	int end = INT_MAX;
				4546	PyObject *result;
				4547
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4548	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4549	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4550	return NULL;
				4551	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4552	(PyObject *)substring);
				4553	if (substring == NULL)
				4554	return NULL;
				4555
				4556	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4557
				4558	Py_DECREF(substring);
				4559	return result;
				4560	}
				4561
				4562
				4563	static char endswith__doc__[] =
				4564	"S.endswith(suffix[, start[, end]]) -> int\n\
				4565	\n\
				4566	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4567	optional start, test S beginning at that position. With optional end, stop\n\
				4568	comparing S at that position.";
				4569
				4570	static PyObject *
				4571	unicode_endswith(PyUnicodeObject *self,
				4572	PyObject *args)
				4573	{
				4574	PyUnicodeObject *substring;
				4575	int start = 0;
				4576	int end = INT_MAX;
				4577	PyObject *result;
				4578
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4579	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4580	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4581	return NULL;
				4582	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4583	(PyObject *)substring);
				4584	if (substring == NULL)
				4585	return NULL;
				4586
				4587	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4588
				4589	Py_DECREF(substring);
				4590	return result;
				4591	}
				4592
				4593
				4594	static PyMethodDef unicode_methods[] = {
				4595
				4596	/* Order is according to common usage: often used methods should
				4597	appear first, since lookup is done sequentially. */
				4598
				4599	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4600	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4601	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4602	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4603	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4604	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4605	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4606	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4607	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4608	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4609	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4610	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4611	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4612	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4613	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4614	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4615	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4616	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4617	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4618	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4619	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4620	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4621	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4622	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4623	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4624	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4625	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4626	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4627	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4628	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4629	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4630	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4631	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4632	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4633	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4634	#if 0
				4635	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4636	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4637	#endif
				4638
				4639	#if 0
				4640	/* This one is just used for debugging the implementation. */
				4641	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4642	#endif
				4643
				4644	{NULL, NULL}
				4645	};
				4646
				4647	static PyObject *
				4648	unicode_getattr(PyUnicodeObject self, char name)
				4649	{
				4650	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4651	}
				4652
				4653	static PySequenceMethods unicode_as_sequence = {
				4654	(inquiry) unicode_length, /* sq_length */
				4655	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4656	(intargfunc) unicode_repeat, /* sq_repeat */
				4657	(intargfunc) unicode_getitem, /* sq_item */
				4658	(intintargfunc) unicode_slice, /* sq_slice */
				4659	0, /* sq_ass_item */
				4660	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4661	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4662	};
				4663
				4664	static int
				4665	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4666	int index,
				4667	const void **ptr)
				4668	{
				4669	if (index != 0) {
				4670	PyErr_SetString(PyExc_SystemError,
				4671	"accessing non-existent unicode segment");
				4672	return -1;
				4673	}
				4674	ptr = (void ) self->str;
				4675	return PyUnicode_GET_DATA_SIZE(self);
				4676	}
				4677
				4678	static int
				4679	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4680	const void **ptr)
				4681	{
				4682	PyErr_SetString(PyExc_TypeError,
				4683	"cannot use unicode as modifyable buffer");
				4684	return -1;
				4685	}
				4686
				4687	static int
				4688	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4689	int *lenp)
				4690	{
				4691	if (lenp)
				4692	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4693	return 1;
				4694	}
				4695
				4696	static int
				4697	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4698	int index,
				4699	const void **ptr)
				4700	{
				4701	PyObject *str;
				4702
				4703	if (index != 0) {
				4704	PyErr_SetString(PyExc_SystemError,
				4705	"accessing non-existent unicode segment");
				4706	return -1;
				4707	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4708	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4709	if (str == NULL)
				4710	return -1;
				4711	ptr = (void ) PyString_AS_STRING(str);
				4712	return PyString_GET_SIZE(str);
				4713	}
				4714
				4715	/* Helpers for PyUnicode_Format() */
				4716
				4717	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4718	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4719	{
				4720	int argidx = *p_argidx;
				4721	if (argidx < arglen) {
				4722	(*p_argidx)++;
				4723	if (arglen < 0)
				4724	return args;
				4725	else
				4726	return PyTuple_GetItem(args, argidx);
				4727	}
				4728	PyErr_SetString(PyExc_TypeError,
				4729	"not enough arguments for format string");
				4730	return NULL;
				4731	}
				4732
				4733	#define F_LJUST (1<<0)
				4734	#define F_SIGN (1<<1)
				4735	#define F_BLANK (1<<2)
				4736	#define F_ALT (1<<3)
				4737	#define F_ZERO (1<<4)
				4738
				4739	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4740	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4741	{
				4742	register int i;
				4743	int len;
				4744	va_list va;
				4745	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4746	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4747
				4748	/* First, format the string as char array, then expand to Py_UNICODE
				4749	array. */
				4750	charbuffer = (char *)buffer;
				4751	len = vsprintf(charbuffer, format, va);
				4752	for (i = len - 1; i >= 0; i--)
				4753	buffer[i] = (Py_UNICODE) charbuffer[i];
				4754
				4755	va_end(va);
				4756	return len;
				4757	}
				4758
				4759	static int
				4760	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4761	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4762	int flags,
				4763	int prec,
				4764	int type,
				4765	PyObject *v)
				4766	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4767	/* fmt = '%#.' + `prec` + `type`
				4768	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4769	char fmt[20];
				4770	double x;
				4771
				4772	x = PyFloat_AsDouble(v);
				4773	if (x == -1.0 && PyErr_Occurred())
				4774	return -1;
				4775	if (prec < 0)
				4776	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4777	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4778	type = 'g';
				4779	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4780	/* worst case length calc to ensure no buffer overrun:
				4781	fmt = %#.<prec>g
				4782	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4783	for any double rep.)
				4784	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4785	If prec=0 the effective precision is 1 (the leading digit is
				4786	always given), therefore increase by one to 10+prec. */
				4787	if (buflen <= (size_t)10 + (size_t)prec) {
				4788	PyErr_SetString(PyExc_OverflowError,
				4789	"formatted float is too long (precision too long?)");
				4790	return -1;
				4791	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4792	return usprintf(buf, fmt, x);
				4793	}
				4794
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4795	static PyObject*
				4796	formatlong(PyObject *val, int flags, int prec, int type)
				4797	{
				4798	char *buf;
				4799	int i, len;
				4800	PyObject str; / temporary string object. */
				4801	PyUnicodeObject *result;
				4802
				4803	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4804	if (!str)
				4805	return NULL;
				4806	result = _PyUnicode_New(len);
				4807	for (i = 0; i < len; i++)
				4808	result->str[i] = buf[i];
				4809	result->str[len] = 0;
				4810	Py_DECREF(str);
				4811	return (PyObject*)result;
				4812	}
				4813
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4814	static int
				4815	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4816	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4817	int flags,
				4818	int prec,
				4819	int type,
				4820	PyObject *v)
				4821	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4822	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4823	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4824	+ 1 + 1 = 24*/
				4825	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4826	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4827	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4828
				4829	x = PyInt_AsLong(v);
				4830	if (x == -1 && PyErr_Occurred())
				4831	return -1;
				4832	if (prec < 0)
				4833	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4834	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4835	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4836	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4837	PyErr_SetString(PyExc_OverflowError,
				4838	"formatted integer is too long (precision too long?)");
				4839	return -1;
				4840	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4841	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				4842	* but we want it (for consistency with other %#x conversions, and
				4843	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4844	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				4845	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				4846	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4847	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4848	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				4849	/* Only way to know what the platform does is to try it. */
				4850	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				4851	if (fmt[1] != (char)type) {
				4852	/* Supply our own leading 0x/0X -- needed under std C */
				4853	use_native_c_format = 0;
				4854	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				4855	}
				4856	}
				4857	if (use_native_c_format)
				4858	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4859	return usprintf(buf, fmt, x);
				4860	}
				4861
				4862	static int
				4863	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4864	size_t buflen,
				4865	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4866	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4867	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4868	if (PyUnicode_Check(v)) {
				4869	if (PyUnicode_GET_SIZE(v) != 1)
				4870	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4871	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4872	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4873
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4874	else if (PyString_Check(v)) {
				4875	if (PyString_GET_SIZE(v) != 1)
				4876	goto onError;
				4877	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4878	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4879
				4880	else {
				4881	/* Integer input truncated to a character */
				4882	long x;
				4883	x = PyInt_AsLong(v);
				4884	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4885	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4886	buf[0] = (char) x;
				4887	}
				4888	buf[1] = '\0';
				4889	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4890
				4891	onError:
				4892	PyErr_SetString(PyExc_TypeError,
				4893	"%c requires int or char");
				4894	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4895	}
				4896
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size of the scratch buffer into which floats,
   ints and chars are formatted.

   XXX The value is a magic number.  Every formatting routine checks
   bounds so that the buffer is not overrun, but allocating a buffer of
   the right size for each conversion might be the better design.  The
   fixed size is considered good enough for now.
*/
#define FORMATBUFLEN (size_t)120
				4906
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4907	PyObject PyUnicode_Format(PyObject format,
				4908	PyObject *args)
				4909	{
				4910	Py_UNICODE fmt, res;
				4911	int fmtcnt, rescnt, reslen, arglen, argidx;
				4912	int args_owned = 0;
				4913	PyUnicodeObject *result = NULL;
				4914	PyObject *dict = NULL;
				4915	PyObject *uformat;
				4916
				4917	if (format == NULL \|\| args == NULL) {
				4918	PyErr_BadInternalCall();
				4919	return NULL;
				4920	}
				4921	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4922	if (uformat == NULL)
				4923	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4924	fmt = PyUnicode_AS_UNICODE(uformat);
				4925	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4926
				4927	reslen = rescnt = fmtcnt + 100;
				4928	result = _PyUnicode_New(reslen);
				4929	if (result == NULL)
				4930	goto onError;
				4931	res = PyUnicode_AS_UNICODE(result);
				4932
				4933	if (PyTuple_Check(args)) {
				4934	arglen = PyTuple_Size(args);
				4935	argidx = 0;
				4936	}
				4937	else {
				4938	arglen = -1;
				4939	argidx = -2;
				4940	}
				4941	if (args->ob_type->tp_as_mapping)
				4942	dict = args;
				4943
				4944	while (--fmtcnt >= 0) {
				4945	if (*fmt != '%') {
				4946	if (--rescnt < 0) {
				4947	rescnt = fmtcnt + 100;
				4948	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	4949	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4950	return NULL;
				4951	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4952	--rescnt;
				4953	}
				4954	res++ = fmt++;
				4955	}
				4956	else {
				4957	/* Got a format specifier */
				4958	int flags = 0;
				4959	int width = -1;
				4960	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4961	Py_UNICODE c = '\0';
				4962	Py_UNICODE fill;
				4963	PyObject *v = NULL;
				4964	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4965	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4966	Py_UNICODE sign;
				4967	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4968	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4969
				4970	fmt++;
				4971	if (*fmt == '(') {
				4972	Py_UNICODE *keystart;
				4973	int keylen;
				4974	PyObject *key;
				4975	int pcount = 1;
				4976
				4977	if (dict == NULL) {
				4978	PyErr_SetString(PyExc_TypeError,
				4979	"format requires a mapping");
				4980	goto onError;
				4981	}
				4982	++fmt;
				4983	--fmtcnt;
				4984	keystart = fmt;
				4985	/* Skip over balanced parentheses */
				4986	while (pcount > 0 && --fmtcnt >= 0) {
				4987	if (*fmt == ')')
				4988	--pcount;
				4989	else if (*fmt == '(')
				4990	++pcount;
				4991	fmt++;
				4992	}
				4993	keylen = fmt - keystart - 1;
				4994	if (fmtcnt < 0 \|\| pcount > 0) {
				4995	PyErr_SetString(PyExc_ValueError,
				4996	"incomplete format key");
				4997	goto onError;
				4998	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4999	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5000	then looked up since Python uses strings to hold
				5001	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5002	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5003	key = PyUnicode_EncodeUTF8(keystart,
				5004	keylen,
				5005	NULL);
				5006	if (key == NULL)
				5007	goto onError;
				5008	if (args_owned) {
				5009	Py_DECREF(args);
				5010	args_owned = 0;
				5011	}
				5012	args = PyObject_GetItem(dict, key);
				5013	Py_DECREF(key);
				5014	if (args == NULL) {
				5015	goto onError;
				5016	}
				5017	args_owned = 1;
				5018	arglen = -1;
				5019	argidx = -2;
				5020	}
				5021	while (--fmtcnt >= 0) {
				5022	switch (c = *fmt++) {
				5023	case '-': flags \|= F_LJUST; continue;
				5024	case '+': flags \|= F_SIGN; continue;
				5025	case ' ': flags \|= F_BLANK; continue;
				5026	case '#': flags \|= F_ALT; continue;
				5027	case '0': flags \|= F_ZERO; continue;
				5028	}
				5029	break;
				5030	}
				5031	if (c == '*') {
				5032	v = getnextarg(args, arglen, &argidx);
				5033	if (v == NULL)
				5034	goto onError;
				5035	if (!PyInt_Check(v)) {
				5036	PyErr_SetString(PyExc_TypeError,
				5037	"* wants int");
				5038	goto onError;
				5039	}
				5040	width = PyInt_AsLong(v);
				5041	if (width < 0) {
				5042	flags \|= F_LJUST;
				5043	width = -width;
				5044	}
				5045	if (--fmtcnt >= 0)
				5046	c = *fmt++;
				5047	}
				5048	else if (c >= '0' && c <= '9') {
				5049	width = c - '0';
				5050	while (--fmtcnt >= 0) {
				5051	c = *fmt++;
				5052	if (c < '0' \|\| c > '9')
				5053	break;
				5054	if ((width*10) / 10 != width) {
				5055	PyErr_SetString(PyExc_ValueError,
				5056	"width too big");
				5057	goto onError;
				5058	}
				5059	width = width*10 + (c - '0');
				5060	}
				5061	}
				5062	if (c == '.') {
				5063	prec = 0;
				5064	if (--fmtcnt >= 0)
				5065	c = *fmt++;
				5066	if (c == '*') {
				5067	v = getnextarg(args, arglen, &argidx);
				5068	if (v == NULL)
				5069	goto onError;
				5070	if (!PyInt_Check(v)) {
				5071	PyErr_SetString(PyExc_TypeError,
				5072	"* wants int");
				5073	goto onError;
				5074	}
				5075	prec = PyInt_AsLong(v);
				5076	if (prec < 0)
				5077	prec = 0;
				5078	if (--fmtcnt >= 0)
				5079	c = *fmt++;
				5080	}
				5081	else if (c >= '0' && c <= '9') {
				5082	prec = c - '0';
				5083	while (--fmtcnt >= 0) {
				5084	c = Py_CHARMASK(*fmt++);
				5085	if (c < '0' \|\| c > '9')
				5086	break;
				5087	if ((prec*10) / 10 != prec) {
				5088	PyErr_SetString(PyExc_ValueError,
				5089	"prec too big");
				5090	goto onError;
				5091	}
				5092	prec = prec*10 + (c - '0');
				5093	}
				5094	}
				5095	} /* prec */
				5096	if (fmtcnt >= 0) {
				5097	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5098	if (--fmtcnt >= 0)
				5099	c = *fmt++;
				5100	}
				5101	}
				5102	if (fmtcnt < 0) {
				5103	PyErr_SetString(PyExc_ValueError,
				5104	"incomplete format");
				5105	goto onError;
				5106	}
				5107	if (c != '%') {
				5108	v = getnextarg(args, arglen, &argidx);
				5109	if (v == NULL)
				5110	goto onError;
				5111	}
				5112	sign = 0;
				5113	fill = ' ';
				5114	switch (c) {
				5115
				5116	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5117	pbuf = formatbuf;
				5118	/* presume that buffer length is at least 1 */
				5119	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5120	len = 1;
				5121	break;
				5122
				5123	case 's':
				5124	case 'r':
				5125	if (PyUnicode_Check(v) && c == 's') {
				5126	temp = v;
				5127	Py_INCREF(temp);
				5128	}
				5129	else {
				5130	PyObject *unicode;
				5131	if (c == 's')
				5132	temp = PyObject_Str(v);
				5133	else
				5134	temp = PyObject_Repr(v);
				5135	if (temp == NULL)
				5136	goto onError;
				5137	if (!PyString_Check(temp)) {
				5138	/* XXX Note: this should never happen, since
				5139	PyObject_Repr() and PyObject_Str() assure
				5140	this */
				5141	Py_DECREF(temp);
				5142	PyErr_SetString(PyExc_TypeError,
				5143	"%s argument has non-string str()");
				5144	goto onError;
				5145	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5146	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5147	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5148	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5149	"strict");
				5150	Py_DECREF(temp);
				5151	temp = unicode;
				5152	if (temp == NULL)
				5153	goto onError;
				5154	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5155	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5156	len = PyUnicode_GET_SIZE(temp);
				5157	if (prec >= 0 && len > prec)
				5158	len = prec;
				5159	break;
				5160
				5161	case 'i':
				5162	case 'd':
				5163	case 'u':
				5164	case 'o':
				5165	case 'x':
				5166	case 'X':
				5167	if (c == 'i')
				5168	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5169	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5170	temp = formatlong(v, flags, prec, c);
				5171	if (!temp)
				5172	goto onError;
				5173	pbuf = PyUnicode_AS_UNICODE(temp);
				5174	len = PyUnicode_GET_SIZE(temp);
				5175	/* unbounded ints can always produce
				5176	a sign character! */
				5177	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5178	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5179	else {
				5180	pbuf = formatbuf;
				5181	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5182	flags, prec, c, v);
				5183	if (len < 0)
				5184	goto onError;
				5185	/* only d conversion is signed */
				5186	sign = c == 'd';
				5187	}
				5188	if (flags & F_ZERO)
				5189	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5190	break;
				5191
				5192	case 'e':
				5193	case 'E':
				5194	case 'f':
				5195	case 'g':
				5196	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5197	pbuf = formatbuf;
				5198	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5199	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5200	if (len < 0)
				5201	goto onError;
				5202	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5203	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5204	fill = '0';
				5205	break;
				5206
				5207	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5208	pbuf = formatbuf;
				5209	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5210	if (len < 0)
				5211	goto onError;
				5212	break;
				5213
				5214	default:
				5215	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5216	"unsupported format character '%c' (0x%x) "
				5217	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5218	(31<=c && c<=126) ? c : '?',
				5219	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5220	goto onError;
				5221	}
				5222	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5223	if (pbuf == '-' \|\| pbuf == '+') {
				5224	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5225	len--;
				5226	}
				5227	else if (flags & F_SIGN)
				5228	sign = '+';
				5229	else if (flags & F_BLANK)
				5230	sign = ' ';
				5231	else
				5232	sign = 0;
				5233	}
				5234	if (width < len)
				5235	width = len;
				5236	if (rescnt < width + (sign != 0)) {
				5237	reslen -= rescnt;
				5238	rescnt = width + fmtcnt + 100;
				5239	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5240	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5241	return NULL;
				5242	res = PyUnicode_AS_UNICODE(result)
				5243	+ reslen - rescnt;
				5244	}
				5245	if (sign) {
				5246	if (fill != ' ')
				5247	*res++ = sign;
				5248	rescnt--;
				5249	if (width > len)
				5250	width--;
				5251	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5252	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5253	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5254	assert(pbuf[1] == c);
				5255	if (fill != ' ') {
				5256	res++ = pbuf++;
				5257	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5258	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5259	rescnt -= 2;
				5260	width -= 2;
				5261	if (width < 0)
				5262	width = 0;
				5263	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5264	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5265	if (width > len && !(flags & F_LJUST)) {
				5266	do {
				5267	--rescnt;
				5268	*res++ = fill;
				5269	} while (--width > len);
				5270	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5271	if (fill == ' ') {
				5272	if (sign)
				5273	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5274	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5275	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5276	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5277	res++ = pbuf++;
				5278	res++ = pbuf++;
				5279	}
				5280	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5281	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5282	res += len;
				5283	rescnt -= len;
				5284	while (--width >= len) {
				5285	--rescnt;
				5286	*res++ = ' ';
				5287	}
				5288	if (dict && (argidx < arglen) && c != '%') {
				5289	PyErr_SetString(PyExc_TypeError,
				5290	"not all arguments converted");
				5291	goto onError;
				5292	}
				5293	Py_XDECREF(temp);
				5294	} /* '%' */
				5295	} /* until end */
				5296	if (argidx < arglen && !dict) {
				5297	PyErr_SetString(PyExc_TypeError,
				5298	"not all arguments converted");
				5299	goto onError;
				5300	}
				5301
				5302	if (args_owned) {
				5303	Py_DECREF(args);
				5304	}
				5305	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5306	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5307	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5308	return (PyObject *)result;
				5309
				5310	onError:
				5311	Py_XDECREF(result);
				5312	Py_DECREF(uformat);
				5313	if (args_owned) {
				5314	Py_DECREF(args);
				5315	}
				5316	return NULL;
				5317	}
				5318
				5319	static PyBufferProcs unicode_as_buffer = {
				5320	(getreadbufferproc) unicode_buffer_getreadbuf,
				5321	(getwritebufferproc) unicode_buffer_getwritebuf,
				5322	(getsegcountproc) unicode_buffer_getsegcount,
				5323	(getcharbufferproc) unicode_buffer_getcharbuf,
				5324	};
				5325
				5326	PyTypeObject PyUnicode_Type = {
				5327	PyObject_HEAD_INIT(&PyType_Type)
				5328	0, /* ob_size */
				5329	"unicode", /* tp_name */
				5330	sizeof(PyUnicodeObject), /* tp_size */
				5331	0, /* tp_itemsize */
				5332	/* Slots */
				5333	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5334	0, /* tp_print */
				5335	(getattrfunc)unicode_getattr, /* tp_getattr */
				5336	0, /* tp_setattr */
				5337	(cmpfunc) unicode_compare, /* tp_compare */
				5338	(reprfunc) unicode_repr, /* tp_repr */
				5339	0, /* tp_as_number */
				5340	&unicode_as_sequence, /* tp_as_sequence */
				5341	0, /* tp_as_mapping */
				5342	(hashfunc) unicode_hash, /* tp_hash*/
				5343	0, /* tp_call*/
				5344	(reprfunc) unicode_str, /* tp_str */
				5345	(getattrofunc) NULL, /* tp_getattro */
				5346	(setattrofunc) NULL, /* tp_setattro */
				5347	&unicode_as_buffer, /* tp_as_buffer */
				5348	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5349	};
				5350
				5351	/* Initialize the Unicode implementation */
				5352
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5353	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5354	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5355	int i;
				5356
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5357	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5358	unicode_freelist = NULL;
				5359	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5360	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5361	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5362	for (i = 0; i < 256; i++)
				5363	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5364	}
				5365
				5366	/* Finalize the Unicode implementation */
				5367
				5368	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5369	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5370	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5371	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5372	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5373
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5374	Py_XDECREF(unicode_empty);
				5375	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5376
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5377	for (i = 0; i < 256; i++) {
				5378	if (unicode_latin1[i]) {
				5379	Py_DECREF(unicode_latin1[i]);
				5380	unicode_latin1[i] = NULL;
				5381	}
				5382	}
				5383
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5384	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5385	PyUnicodeObject *v = u;
				5386	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5387	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5388	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5389	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5390	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5391	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5392	unicode_freelist = NULL;
				5393	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5394	}