Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 2f66c3cf93ea6d1fcb50e0870d5bed6b89bfb6f8 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects sitting on the free list.

   A freed object whose length is below this limit keeps its character
   buffer while it waits on the free list; longer buffers are released
   right away. This saves malloc() calls for small Unicode objects.

   Worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory around.

   A limit of 0 effectively disables the feature.

   Note: this is an experimental feature! If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
				107	PyUnicode_GetMax()
				108	{
				109	#ifdef USE_UCS4_STORAGE
				110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	401	const char *s;
				402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	405
				406	if (obj == NULL) {
				407	PyErr_BadInternalCall();
				408	return NULL;
				409	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410
				411	/* Coerce object */
				412	if (PyInstance_Check(obj)) {
				413	PyObject *func;
				414	func = PyObject_GetAttrString(obj, "__str__");
				415	if (func == NULL) {
				416	PyErr_SetString(PyExc_TypeError,
				417	"coercing to Unicode: instance doesn't define __str__");
				418	return NULL;
				419	}
				420	obj = PyEval_CallObject(func, NULL);
				421	Py_DECREF(func);
				422	if (obj == NULL)
				423	return NULL;
				424	owned = 1;
				425	}
				426	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	428	v = obj;
				429	if (encoding) {
				430	PyErr_SetString(PyExc_TypeError,
				431	"decoding Unicode is not supported");
				432	return NULL;
				433	}
				434	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	435	}
				436	else if (PyString_Check(obj)) {
				437	s = PyString_AS_STRING(obj);
				438	len = PyString_GET_SIZE(obj);
				439	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	440	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				441	/* Overwrite the error message with something more useful in
				442	case of a TypeError. */
				443	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	444	PyErr_Format(PyExc_TypeError,
				445	"coercing to Unicode: need string or buffer, "
				446	"%.80s found",
				447	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	448	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	449	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	450
				451	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452	if (len == 0) {
				453	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	454	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	455	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	456	else
				457	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	458
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	459	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	460	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	461	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	462	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	463	return v;
				464
				465	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	466	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	467	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	468	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
				471
				472	PyObject PyUnicode_Decode(const char s,
				473	int size,
				474	const char *encoding,
				475	const char *errors)
				476	{
				477	PyObject buffer = NULL, unicode;
				478
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	479	if (encoding == NULL)
				480	encoding = PyUnicode_GetDefaultEncoding();
				481
				482	/* Shortcuts for common default encodings */
				483	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	485	else if (strcmp(encoding, "latin-1") == 0)
				486	return PyUnicode_DecodeLatin1(s, size, errors);
				487	else if (strcmp(encoding, "ascii") == 0)
				488	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	489
				490	/* Decode via the codec registry */
				491	buffer = PyBuffer_FromMemory((void *)s, size);
				492	if (buffer == NULL)
				493	goto onError;
				494	unicode = PyCodec_Decode(buffer, encoding, errors);
				495	if (unicode == NULL)
				496	goto onError;
				497	if (!PyUnicode_Check(unicode)) {
				498	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	499	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	500	unicode->ob_type->tp_name);
				501	Py_DECREF(unicode);
				502	goto onError;
				503	}
				504	Py_DECREF(buffer);
				505	return unicode;
				506
				507	onError:
				508	Py_XDECREF(buffer);
				509	return NULL;
				510	}
				511
				512	PyObject PyUnicode_Encode(const Py_UNICODE s,
				513	int size,
				514	const char *encoding,
				515	const char *errors)
				516	{
				517	PyObject v, unicode;
				518
				519	unicode = PyUnicode_FromUnicode(s, size);
				520	if (unicode == NULL)
				521	return NULL;
				522	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				523	Py_DECREF(unicode);
				524	return v;
				525	}
				526
				527	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject *v;
				532
				533	if (!PyUnicode_Check(unicode)) {
				534	PyErr_BadArgument();
				535	goto onError;
				536	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	537
				538	if (encoding == NULL)
				539	encoding = PyUnicode_GetDefaultEncoding();
				540
				541	/* Shortcuts for common default encodings */
				542	if (errors == NULL) {
				543	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	544	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	545	else if (strcmp(encoding, "latin-1") == 0)
				546	return PyUnicode_AsLatin1String(unicode);
				547	else if (strcmp(encoding, "ascii") == 0)
				548	return PyUnicode_AsASCIIString(unicode);
				549	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	550
				551	/* Encode via the codec registry */
				552	v = PyCodec_Encode(unicode, encoding, errors);
				553	if (v == NULL)
				554	goto onError;
				555	/* XXX Should we really enforce this ? */
				556	if (!PyString_Check(v)) {
				557	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	558	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	v->ob_type->tp_name);
				560	Py_DECREF(v);
				561	goto onError;
				562	}
				563	return v;
				564
				565	onError:
				566	return NULL;
				567	}
				568
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	569	/* Return a Python string holding the default encoded value of the
				570	Unicode object.
				571
				572	The resulting string is cached in the Unicode object for subsequent
				573	usage by this function. The cached version is needed to implement
				574	the character buffer interface and will live (at least) as long as
				575	the Unicode object itself.
				576
				577	The refcount of the string is not incremented.
				578
				579	* Exported for internal use by the interpreter only !!! *
				580
				581	*/
				582
				583	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				584	const char *errors)
				585	{
				586	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				587
				588	if (v)
				589	return v;
				590	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				591	if (v && errors == NULL)
				592	((PyUnicodeObject *)unicode)->defenc = v;
				593	return v;
				594	}
				595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	596	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_AS_UNICODE(unicode);
				603
				604	onError:
				605	return NULL;
				606	}
				607
				608	int PyUnicode_GetSize(PyObject *unicode)
				609	{
				610	if (!PyUnicode_Check(unicode)) {
				611	PyErr_BadArgument();
				612	goto onError;
				613	}
				614	return PyUnicode_GET_SIZE(unicode);
				615
				616	onError:
				617	return -1;
				618	}
				619
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	620	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	621	{
				622	return unicode_default_encoding;
				623	}
				624
				625	int PyUnicode_SetDefaultEncoding(const char *encoding)
				626	{
				627	PyObject *v;
				628
				629	/* Make sure the encoding is valid. As side effect, this also
				630	loads the encoding into the codec registry cache. */
				631	v = _PyCodec_Lookup(encoding);
				632	if (v == NULL)
				633	goto onError;
				634	Py_DECREF(v);
				635	strncpy(unicode_default_encoding,
				636	encoding,
				637	sizeof(unicode_default_encoding));
				638	return 0;
				639
				640	onError:
				641	return -1;
				642	}
				643
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	644	/* --- UTF-8 Codec -------------------------------------------------------- */
				645
				646	static
				647	char utf8_code_length[256] = {
				648	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				649	illegal prefix. see RFC 2279 for details */
				650	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				651	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				652	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				653	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				654	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				655	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				656	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				657	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				658	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				659	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				660	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				661	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				662	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				663	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				664	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				665	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				666	};
				667
				668	static
				669	int utf8_decoding_error(const char **source,
				670	Py_UNICODE **dest,
				671	const char *errors,
				672	const char *details)
				673	{
				674	if ((errors == NULL) \|\|
				675	(strcmp(errors,"strict") == 0)) {
				676	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	677	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	678	details);
				679	return -1;
				680	}
				681	else if (strcmp(errors,"ignore") == 0) {
				682	(*source)++;
				683	return 0;
				684	}
				685	else if (strcmp(errors,"replace") == 0) {
				686	(*source)++;
				687	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				688	(*dest)++;
				689	return 0;
				690	}
				691	else {
				692	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	693	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	694	errors);
				695	return -1;
				696	}
				697	}
				698
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	PyObject PyUnicode_DecodeUTF8(const char s,
				700	int size,
				701	const char *errors)
				702	{
				703	int n;
				704	const char *e;
				705	PyUnicodeObject *unicode;
				706	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	708
				709	/* Note: size will always be longer than the resulting Unicode
				710	character count */
				711	unicode = _PyUnicode_New(size);
				712	if (!unicode)
				713	return NULL;
				714	if (size == 0)
				715	return (PyObject *)unicode;
				716
				717	/* Unpack UTF-8 encoded data */
				718	p = unicode->str;
				719	e = s + size;
				720
				721	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	722	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	723
				724	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	725	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	s++;
				727	continue;
				728	}
				729
				730	n = utf8_code_length[ch];
				731
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	732	if (s + n > e) {
				733	errmsg = "unexpected end of data";
				734	goto utf8Error;
				735	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	736
				737	switch (n) {
				738
				739	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	740	errmsg = "unexpected code byte";
				741	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	742
				743	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	errmsg = "internal error";
				745	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	746
				747	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	748	if ((s[1] & 0xc0) != 0x80) {
				749	errmsg = "invalid data";
				750	goto utf8Error;
				751	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	752	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	753	if (ch < 0x80) {
				754	errmsg = "illegal encoding";
				755	goto utf8Error;
				756	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	757	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	758	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	759	break;
				760
				761	case 3:
				762	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	763	(s[2] & 0xc0) != 0x80) {
				764	errmsg = "invalid data";
				765	goto utf8Error;
				766	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	767	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	768	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				769	errmsg = "illegal encoding";
				770	goto utf8Error;
				771	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	772	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	773	*p++ = (Py_UNICODE)ch;
				774	break;
				775
				776	case 4:
				777	if ((s[1] & 0xc0) != 0x80 \|\|
				778	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	779	(s[3] & 0xc0) != 0x80) {
				780	errmsg = "invalid data";
				781	goto utf8Error;
				782	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	783	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				784	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				785	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	786	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	787	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	788	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	789	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	790	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	791	errmsg = "illegal encoding";
				792	goto utf8Error;
				793	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	794	#if Py_UNICODE_SIZE == 4
				795	*p++ = (Py_UNICODE)ch;
				796	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	797	/* compute and append the two surrogates: */
				798
				799	/* translate from 10000..10FFFF to 0..FFFF */
				800	ch -= 0x10000;
				801
				802	/* high surrogate = top 10 bits added to D800 */
				803	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				804
				805	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	806	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	807	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	808	break;
				809
				810	default:
				811	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	812	errmsg = "unsupported Unicode code range";
				813	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	814	}
				815	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	816	continue;
				817
				818	utf8Error:
				819	if (utf8_decoding_error(&s, &p, errors, errmsg))
				820	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	821	}
				822
				823	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	824	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	825	goto onError;
				826
				827	return (PyObject *)unicode;
				828
				829	onError:
				830	Py_DECREF(unicode);
				831	return NULL;
				832	}
				833
/* Disabled: the UTF-8 encoder handles UTF-16 surrogates itself and no
   longer calls this helper. */
#if 0
/* Apply the error handling named by errors to a UTF-8 encoding
   problem described by details.

   Returns -1 with an exception set for strict mode (errors is NULL or
   "strict") and for unknown handler names; returns 0 for "ignore" and
   for "replace", the latter after writing '?' through *dest and
   advancing it.  source is not used. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors, "ignore") == 0)
        return 0;
    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	867
				868	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				869	int size,
				870	const char *errors)
				871	{
				872	PyObject *v;
				873	char *p;
				874	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	875	Py_UCS4 ch2;
				876	unsigned int cbAllocated = 3 * size;
				877	unsigned int cbWritten = 0;
				878	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	879
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	880	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	881	if (v == NULL)
				882	return NULL;
				883	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	884	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	885
				886	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	887	while (i < size) {
				888	Py_UCS4 ch = s[i++];
				889	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	891	cbWritten++;
				892	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	893	else if (ch < 0x0800) {
				894	*p++ = 0xc0 \| (ch >> 6);
				895	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	896	cbWritten += 2;
				897	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	898	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	899	/* Check for high surrogate */
				900	if (0xD800 <= ch && ch <= 0xDBFF) {
				901	if (i != size) {
				902	ch2 = s[i];
				903	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				904
				905	if (cbWritten >= (cbAllocated - 4)) {
				906	/* Provide enough room for some more
				907	surrogates */
				908	cbAllocated += 4*10;
				909	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	910	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	911	}
				912
				913	/* combine the two values */
				914	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				915
				916	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	917	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	918	i++;
				919	cbWritten += 4;
				920	}
				921	}
				922	}
				923	else {
				924	*p++ = (char)(0xe0 \| (ch >> 12));
				925	cbWritten += 3;
				926	}
				927	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				928	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	929	} else {
				930	*p++ = 0xf0 \| (ch>>18);
				931	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				932	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				933	*p++ = 0x80 \| (ch & 0x3f);
				934	cbWritten += 4;
				935	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	}
				937	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	938	if (_PyString_Resize(&v, p - q))
				939	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	940	return v;
				941
				942	onError:
				943	Py_DECREF(v);
				944	return NULL;
				945	}
				946
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	947	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				948	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	949	if (!PyUnicode_Check(unicode)) {
				950	PyErr_BadArgument();
				951	return NULL;
				952	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	953	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				954	PyUnicode_GET_SIZE(unicode),
				955	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	956	}
				957
				958	/* --- UTF-16 Codec ------------------------------------------------------- */
				959
				960	static
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	961	int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	962	Py_UNICODE **dest,
				963	const char *errors,
				964	const char *details)
				965	{
				966	if ((errors == NULL) \|\|
				967	(strcmp(errors,"strict") == 0)) {
				968	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	969	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	970	details);
				971	return -1;
				972	}
				973	else if (strcmp(errors,"ignore") == 0) {
				974	return 0;
				975	}
				976	else if (strcmp(errors,"replace") == 0) {
				977	if (dest) {
				978	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				979	(*dest)++;
				980	}
				981	return 0;
				982	}
				983	else {
				984	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	985	"UTF-16 decoding error; "
				986	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	987	errors);
				988	return -1;
				989	}
				990	}
				991
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	PyObject PyUnicode_DecodeUTF16(const char s,
				993	int size,
				994	const char *errors,
				995	int *byteorder)
				996	{
				997	PyUnicodeObject *unicode;
				998	Py_UNICODE *p;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	999	const Py_UCS2 q, e;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1000	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1001	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1002
				1003	/* size should be an even number */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1004	if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1005	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				1006	return NULL;
				1007	/* The remaining input chars are ignored if we fall through
				1008	here... */
				1009	}
				1010
				1011	/* Note: size will always be longer than the resulting Unicode
				1012	character count */
				1013	unicode = _PyUnicode_New(size);
				1014	if (!unicode)
				1015	return NULL;
				1016	if (size == 0)
				1017	return (PyObject *)unicode;
				1018
				1019	/* Unpack UTF-16 encoded data */
				1020	p = unicode->str;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1021	q = (Py_UCS2 *)s;
				1022	e = q + (size / sizeof(Py_UCS2));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023
				1024	if (byteorder)
				1025	bo = *byteorder;
				1026
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1027	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1028	byte order setting accordingly. In native mode, the leading BOM
				1029	mark is skipped, in all other modes, it is copied to the output
				1030	stream as-is (giving a ZWNBSP character). */
				1031	if (bo == 0) {
				1032	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1033	if (*q == 0xFEFF) {
				1034	q++;
				1035	bo = -1;
				1036	} else if (*q == 0xFFFE) {
				1037	q++;
				1038	bo = 1;
				1039	}
				1040	#else
				1041	if (*q == 0xFEFF) {
				1042	q++;
				1043	bo = 1;
				1044	} else if (*q == 0xFFFE) {
				1045	q++;
				1046	bo = -1;
				1047	}
				1048	#endif
				1049	}
				1050
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1051	while (q < e) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1052	register Py_UCS2 ch = *q++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1054	/* Swap input bytes if needed. (This assumes
				1055	sizeof(Py_UNICODE) == 2 !) */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1056	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	if (bo == 1)
				1058	ch = (ch >> 8) \| (ch << 8);
				1059	#else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	if (bo == -1)
				1061	ch = (ch >> 8) \| (ch << 8);
				1062	#endif
				1063	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1064	*p++ = ch;
				1065	continue;
				1066	}
				1067
				1068	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1069	if (q >= e) {
				1070	errmsg = "unexpected end of data";
				1071	goto utf16Error;
				1072	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1073	if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1074	Py_UCS2 ch2 = *q++;
				1075	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1076	if (bo == 1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1077	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1078	#else
				1079	if (bo == -1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1080	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1081	#endif
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1082	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1083	#if Py_UNICODE_SIZE == 2
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1084	/* This is valid data (a UTF-16 surrogate pair), but
				1085	we are not able to store this information since our
				1086	Py_UNICODE type only has 16 bits... this might
				1087	change someday, even though it's unlikely. */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1088	errmsg = "code pairs are not supported";
				1089	goto utf16Error;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1090	#else
				1091	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1092	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1093	#endif
				1094
				1095	}
				1096	else {
				1097	errmsg = "illegal UTF-16 surrogate";
				1098	goto utf16Error;
				1099	}
				1100
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1101	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1102	errmsg = "illegal encoding";
				1103	/* Fall through to report the error */
				1104
				1105	utf16Error:
				1106	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1107	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1108	}
				1109
				1110	if (byteorder)
				1111	*byteorder = bo;
				1112
				1113	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1114	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	goto onError;
				1116
				1117	return (PyObject *)unicode;
				1118
				1119	onError:
				1120	Py_DECREF(unicode);
				1121	return NULL;
				1122	}
				1123
				1124	#undef UTF16_ERROR
				1125
				1126	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1127	int size,
				1128	const char *errors,
				1129	int byteorder)
				1130	{
				1131	PyObject *v;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1132	Py_UCS2 *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1133	char *q;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1134	int i, pairs, doswap = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1135
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1136	for (i = pairs = 0; i < size; i++)
				1137	if (s[i] >= 0x10000)
				1138	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1139	v = PyString_FromStringAndSize(NULL,
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1140	sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1141	if (v == NULL)
				1142	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1143
				1144	q = PyString_AS_STRING(v);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1145	p = (Py_UCS2 *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1146	if (byteorder == 0)
				1147	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1148	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1149	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1150	if (byteorder == 0 \|\|
				1151	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1152	byteorder == -1
				1153	#else
				1154	byteorder == 1
				1155	#endif
				1156	)
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1157	doswap = 0;
				1158	while (size-- > 0) {
				1159	Py_UNICODE ch = *s++;
				1160	Py_UNICODE ch2 = 0;
				1161	if (ch >= 0x10000) {
				1162	ch2 = 0xDC00\|((ch-0x10000) & 0x3FF);
				1163	ch = 0xD800\|((ch-0x10000)>>10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1164	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1165	if (doswap){
				1166	*p++ = (ch >> 8) \| (ch << 8);
				1167	if (ch2)
				1168	*p++ = (ch2 >> 8) \| (ch2 << 8);
				1169	}else{
				1170	*p++ = ch;
				1171	if(ch2)
				1172	*p++ = ch2;
				1173	}
				1174	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1175	return v;
				1176	}
				1177
				1178	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1179	{
				1180	if (!PyUnicode_Check(unicode)) {
				1181	PyErr_BadArgument();
				1182	return NULL;
				1183	}
				1184	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1185	PyUnicode_GET_SIZE(unicode),
				1186	NULL,
				1187	0);
				1188	}
				1189
				1190	/* --- Unicode Escape Codec ----------------------------------------------- */
				1191
				1192	static
				1193	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1194	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1195	const char *errors,
				1196	const char *details)
				1197	{
				1198	if ((errors == NULL) \|\|
				1199	(strcmp(errors,"strict") == 0)) {
				1200	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1201	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1202	details);
				1203	return -1;
				1204	}
				1205	else if (strcmp(errors,"ignore") == 0) {
				1206	return 0;
				1207	}
				1208	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1209	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1210	return 0;
				1211	}
				1212	else {
				1213	PyErr_Format(PyExc_ValueError,
				1214	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1215	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1216	errors);
				1217	return -1;
				1218	}
				1219	}
				1220
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1221	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1222
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1223	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1224	int size,
				1225	const char *errors)
				1226	{
				1227	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1228	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1229	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1230	char* message;
				1231	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1232
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1233	/* Escaped strings will always be longer than the resulting
				1234	Unicode string, so we start with size here and then reduce the
				1235	length after conversion to the true value. */
				1236	v = _PyUnicode_New(size);
				1237	if (v == NULL)
				1238	goto onError;
				1239	if (size == 0)
				1240	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1241
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1242	p = buf = PyUnicode_AS_UNICODE(v);
				1243	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1244
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1245	while (s < end) {
				1246	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1247	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1248	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1249
				1250	/* Non-escape characters are interpreted as Unicode ordinals */
				1251	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1252	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1253	continue;
				1254	}
				1255
				1256	/* \ - Escapes */
				1257	s++;
				1258	switch (*s++) {
				1259
				1260	/* \x escapes */
				1261	case '\n': break;
				1262	case '\\': *p++ = '\\'; break;
				1263	case '\'': *p++ = '\''; break;
				1264	case '\"': *p++ = '\"'; break;
				1265	case 'b': *p++ = '\b'; break;
				1266	case 'f': p++ = '\014'; break; / FF */
				1267	case 't': *p++ = '\t'; break;
				1268	case 'n': *p++ = '\n'; break;
				1269	case 'r': *p++ = '\r'; break;
				1270	case 'v': p++ = '\013'; break; / VT */
				1271	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1272
				1273	/* \OOO (octal) escapes */
				1274	case '0': case '1': case '2': case '3':
				1275	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1276	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1277	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1278	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1279	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1280	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1281	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1282	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1283	break;
				1284
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1285	/* hex escapes */
				1286	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1287	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1288	digits = 2;
				1289	message = "truncated \\xXX escape";
				1290	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1291
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1292	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1293	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1294	digits = 4;
				1295	message = "truncated \\uXXXX escape";
				1296	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1297
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1298	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1299	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1300	digits = 8;
				1301	message = "truncated \\UXXXXXXXX escape";
				1302	hexescape:
				1303	chr = 0;
				1304	for (i = 0; i < digits; i++) {
				1305	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1306	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1307	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1308	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1309	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1310	i++;
				1311	break;
				1312	}
				1313	chr = (chr<<4) & ~0xF;
				1314	if (c >= '0' && c <= '9')
				1315	chr += c - '0';
				1316	else if (c >= 'a' && c <= 'f')
				1317	chr += 10 + c - 'a';
				1318	else
				1319	chr += 10 + c - 'A';
				1320	}
				1321	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1322	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1323	/* when we get here, chr is a 32-bit unicode character */
				1324	if (chr <= 0xffff)
				1325	/* UCS-2 character */
				1326	*p++ = (Py_UNICODE) chr;
				1327	else if (chr <= 0x10ffff) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1328	/* UCS-4 character. Either store directly, or as surrogate pair. */
				1329	#if Py_UNICODE_SIZE == 4
				1330	*p++ = chr;
				1331	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1332	chr -= 0x10000L;
				1333	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1334	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1335	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1336	} else {
				1337	if (unicodeescape_decoding_error(
				1338	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1339	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1340	)
				1341	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1342	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1343	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1344	break;
				1345
				1346	/* \N{name} */
				1347	case 'N':
				1348	message = "malformed \\N character escape";
				1349	if (ucnhash_CAPI == NULL) {
				1350	/* load the unicode data module */
				1351	PyObject m, v;
				1352	m = PyImport_ImportModule("unicodedata");
				1353	if (m == NULL)
				1354	goto ucnhashError;
				1355	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1356	Py_DECREF(m);
				1357	if (v == NULL)
				1358	goto ucnhashError;
				1359	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1360	Py_DECREF(v);
				1361	if (ucnhash_CAPI == NULL)
				1362	goto ucnhashError;
				1363	}
				1364	if (*s == '{') {
				1365	const char *start = s+1;
				1366	/* look for the closing brace */
				1367	while (*s != '}' && s < end)
				1368	s++;
				1369	if (s > start && s < end && *s == '}') {
				1370	/* found a name. look it up in the unicode database */
				1371	message = "unknown Unicode character name";
				1372	s++;
				1373	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1374	goto store;
				1375	}
				1376	}
				1377	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1378	goto onError;
				1379	*p++ = x;
				1380	break;
				1381
				1382	default:
				1383	*p++ = '\\';
				1384	*p++ = (unsigned char)s[-1];
				1385	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1386	}
				1387	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1388	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1389	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1390	return (PyObject *)v;
				1391
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1392	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1393	PyErr_SetString(
				1394	PyExc_UnicodeError,
				1395	"\\N escapes not supported (can't load unicodedata module)"
				1396	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1397	return NULL;
				1398
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1399	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1400	Py_XDECREF(v);
				1401	return NULL;
				1402	}
				1403
				1404	/* Return a Unicode-Escape string version of the Unicode object.
				1405
				1406	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1407	appropriate.
				1408
				1409	*/
				1410
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1411	static const Py_UNICODE findchar(const Py_UNICODE s,
				1412	int size,
				1413	Py_UNICODE ch);
				1414
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1415	static
				1416	PyObject unicodeescape_string(const Py_UNICODE s,
				1417	int size,
				1418	int quotes)
				1419	{
				1420	PyObject *repr;
				1421	char *p;
				1422	char *q;
				1423
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1424	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1425
				1426	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1427	if (repr == NULL)
				1428	return NULL;
				1429
				1430	p = q = PyString_AS_STRING(repr);
				1431
				1432	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1433	*p++ = 'u';
				1434	*p++ = (findchar(s, size, '\'') &&
				1435	!findchar(s, size, '"')) ? '"' : '\'';
				1436	}
				1437	while (size-- > 0) {
				1438	Py_UNICODE ch = *s++;
				1439	/* Escape quotes */
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1440	if (quotes && (ch == (Py_UNICODE) q[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1441	*p++ = '\\';
				1442	*p++ = (char) ch;
				1443	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1444	/* Map 21-bit characters to '\U00xxxxxx' */
				1445	else if (ch >= 0x10000) {
				1446	*p++ = '\\';
				1447	*p++ = 'U';
				1448	*p++ = hexdigit[(ch >> 28) & 0xf];
				1449	*p++ = hexdigit[(ch >> 24) & 0xf];
				1450	*p++ = hexdigit[(ch >> 20) & 0xf];
				1451	*p++ = hexdigit[(ch >> 16) & 0xf];
				1452	*p++ = hexdigit[(ch >> 12) & 0xf];
				1453	*p++ = hexdigit[(ch >> 8) & 0xf];
				1454	*p++ = hexdigit[(ch >> 4) & 0xf];
				1455	*p++ = hexdigit[ch & 15];
				1456	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1457	/* Map 16-bit characters to '\uxxxx' */
				1458	else if (ch >= 256) {
				1459	*p++ = '\\';
				1460	*p++ = 'u';
				1461	*p++ = hexdigit[(ch >> 12) & 0xf];
				1462	*p++ = hexdigit[(ch >> 8) & 0xf];
				1463	*p++ = hexdigit[(ch >> 4) & 0xf];
				1464	*p++ = hexdigit[ch & 15];
				1465	}
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1466	/* Map special whitespace to '\t', \n', '\r' */
				1467	else if (ch == '\t') {
				1468	*p++ = '\\';
				1469	*p++ = 't';
				1470	}
				1471	else if (ch == '\n') {
				1472	*p++ = '\\';
				1473	*p++ = 'n';
				1474	}
				1475	else if (ch == '\r') {
				1476	*p++ = '\\';
				1477	*p++ = 'r';
				1478	}
				1479	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1480	else if (ch < ' ' \|\| ch >= 128) {
				1481	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1482	*p++ = 'x';
				1483	*p++ = hexdigit[(ch >> 4) & 0xf];
				1484	*p++ = hexdigit[ch & 15];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1485	}
				1486	/* Copy everything else as-is */
				1487	else
				1488	*p++ = (char) ch;
				1489	}
				1490	if (quotes)
				1491	*p++ = q[1];
				1492
				1493	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1494	if (_PyString_Resize(&repr, p - q))
				1495	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1496
				1497	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1498
				1499	onError:
				1500	Py_DECREF(repr);
				1501	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1502	}
				1503
				1504	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1505	int size)
				1506	{
				1507	return unicodeescape_string(s, size, 0);
				1508	}
				1509
				1510	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1511	{
				1512	if (!PyUnicode_Check(unicode)) {
				1513	PyErr_BadArgument();
				1514	return NULL;
				1515	}
				1516	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1517	PyUnicode_GET_SIZE(unicode));
				1518	}
				1519
				1520	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1521
				1522	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1523	int size,
				1524	const char *errors)
				1525	{
				1526	PyUnicodeObject *v;
				1527	Py_UNICODE p, buf;
				1528	const char *end;
				1529	const char *bs;
				1530
				1531	/* Escaped strings will always be longer than the resulting
				1532	Unicode string, so we start with size here and then reduce the
				1533	length after conversion to the true value. */
				1534	v = _PyUnicode_New(size);
				1535	if (v == NULL)
				1536	goto onError;
				1537	if (size == 0)
				1538	return (PyObject *)v;
				1539	p = buf = PyUnicode_AS_UNICODE(v);
				1540	end = s + size;
				1541	while (s < end) {
				1542	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1543	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1544	int i;
				1545
				1546	/* Non-escape characters are interpreted as Unicode ordinals */
				1547	if (*s != '\\') {
				1548	p++ = (unsigned char)s++;
				1549	continue;
				1550	}
				1551
				1552	/* \u-escapes are only interpreted iff the number of leading
				1553	backslashes if odd */
				1554	bs = s;
				1555	for (;s < end;) {
				1556	if (*s != '\\')
				1557	break;
				1558	p++ = (unsigned char)s++;
				1559	}
				1560	if (((s - bs) & 1) == 0 \|\|
				1561	s >= end \|\|
				1562	*s != 'u') {
				1563	continue;
				1564	}
				1565	p--;
				1566	s++;
				1567
				1568	/* \uXXXX with 4 hex digits */
				1569	for (x = 0, i = 0; i < 4; i++) {
				1570	c = (unsigned char)s[i];
				1571	if (!isxdigit(c)) {
				1572	if (unicodeescape_decoding_error(&s, &x, errors,
				1573	"truncated \\uXXXX"))
				1574	goto onError;
				1575	i++;
				1576	break;
				1577	}
				1578	x = (x<<4) & ~0xF;
				1579	if (c >= '0' && c <= '9')
				1580	x += c - '0';
				1581	else if (c >= 'a' && c <= 'f')
				1582	x += 10 + c - 'a';
				1583	else
				1584	x += 10 + c - 'A';
				1585	}
				1586	s += i;
				1587	*p++ = x;
				1588	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1589	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1590	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1591	return (PyObject *)v;
				1592
				1593	onError:
				1594	Py_XDECREF(v);
				1595	return NULL;
				1596	}
				1597
				1598	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1599	int size)
				1600	{
				1601	PyObject *repr;
				1602	char *p;
				1603	char *q;
				1604
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1605	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1606
				1607	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1608	if (repr == NULL)
				1609	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1610	if (size == 0)
				1611	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1612
				1613	p = q = PyString_AS_STRING(repr);
				1614	while (size-- > 0) {
				1615	Py_UNICODE ch = *s++;
				1616	/* Map 16-bit characters to '\uxxxx' */
				1617	if (ch >= 256) {
				1618	*p++ = '\\';
				1619	*p++ = 'u';
				1620	*p++ = hexdigit[(ch >> 12) & 0xf];
				1621	*p++ = hexdigit[(ch >> 8) & 0xf];
				1622	*p++ = hexdigit[(ch >> 4) & 0xf];
				1623	*p++ = hexdigit[ch & 15];
				1624	}
				1625	/* Copy everything else as-is */
				1626	else
				1627	*p++ = (char) ch;
				1628	}
				1629	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1630	if (_PyString_Resize(&repr, p - q))
				1631	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1632
				1633	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1634
				1635	onError:
				1636	Py_DECREF(repr);
				1637	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1638	}
				1639
				1640	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1641	{
				1642	if (!PyUnicode_Check(unicode)) {
				1643	PyErr_BadArgument();
				1644	return NULL;
				1645	}
				1646	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1647	PyUnicode_GET_SIZE(unicode));
				1648	}
				1649
				1650	/* --- Latin-1 Codec ------------------------------------------------------ */
				1651
				1652	PyObject PyUnicode_DecodeLatin1(const char s,
				1653	int size,
				1654	const char *errors)
				1655	{
				1656	PyUnicodeObject *v;
				1657	Py_UNICODE *p;
				1658
				1659	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1660	if (size == 1 && (unsigned char)s < 256) {
				1661	Py_UNICODE r = (unsigned char)s;
				1662	return PyUnicode_FromUnicode(&r, 1);
				1663	}
				1664
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1665	v = _PyUnicode_New(size);
				1666	if (v == NULL)
				1667	goto onError;
				1668	if (size == 0)
				1669	return (PyObject *)v;
				1670	p = PyUnicode_AS_UNICODE(v);
				1671	while (size-- > 0)
				1672	p++ = (unsigned char)s++;
				1673	return (PyObject *)v;
				1674
				1675	onError:
				1676	Py_XDECREF(v);
				1677	return NULL;
				1678	}
				1679
				1680	static
				1681	int latin1_encoding_error(const Py_UNICODE **source,
				1682	char **dest,
				1683	const char *errors,
				1684	const char *details)
				1685	{
				1686	if ((errors == NULL) \|\|
				1687	(strcmp(errors,"strict") == 0)) {
				1688	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1689	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1690	details);
				1691	return -1;
				1692	}
				1693	else if (strcmp(errors,"ignore") == 0) {
				1694	return 0;
				1695	}
				1696	else if (strcmp(errors,"replace") == 0) {
				1697	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1698	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1699	return 0;
				1700	}
				1701	else {
				1702	PyErr_Format(PyExc_ValueError,
				1703	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1704	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1705	errors);
				1706	return -1;
				1707	}
				1708	}
				1709
				1710	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1711	int size,
				1712	const char *errors)
				1713	{
				1714	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1715	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1716
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1717	repr = PyString_FromStringAndSize(NULL, size);
				1718	if (repr == NULL)
				1719	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1720	if (size == 0)
				1721	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1722
				1723	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1724	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1725	while (size-- > 0) {
				1726	Py_UNICODE ch = *p++;
				1727	if (ch >= 256) {
				1728	if (latin1_encoding_error(&p, &s, errors,
				1729	"ordinal not in range(256)"))
				1730	goto onError;
				1731	}
				1732	else
				1733	*s++ = (char)ch;
				1734	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1735	/* Resize if error handling skipped some characters */
				1736	if (s - start < PyString_GET_SIZE(repr))
				1737	if (_PyString_Resize(&repr, s - start))
				1738	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1739	return repr;
				1740
				1741	onError:
				1742	Py_DECREF(repr);
				1743	return NULL;
				1744	}
				1745
				1746	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1747	{
				1748	if (!PyUnicode_Check(unicode)) {
				1749	PyErr_BadArgument();
				1750	return NULL;
				1751	}
				1752	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1753	PyUnicode_GET_SIZE(unicode),
				1754	NULL);
				1755	}
				1756
				1757	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1758
				1759	static
				1760	int ascii_decoding_error(const char **source,
				1761	Py_UNICODE **dest,
				1762	const char *errors,
				1763	const char *details)
				1764	{
				1765	if ((errors == NULL) \|\|
				1766	(strcmp(errors,"strict") == 0)) {
				1767	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1768	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1769	details);
				1770	return -1;
				1771	}
				1772	else if (strcmp(errors,"ignore") == 0) {
				1773	return 0;
				1774	}
				1775	else if (strcmp(errors,"replace") == 0) {
				1776	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1777	(*dest)++;
				1778	return 0;
				1779	}
				1780	else {
				1781	PyErr_Format(PyExc_ValueError,
				1782	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1783	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1784	errors);
				1785	return -1;
				1786	}
				1787	}
				1788
				1789	PyObject PyUnicode_DecodeASCII(const char s,
				1790	int size,
				1791	const char *errors)
				1792	{
				1793	PyUnicodeObject *v;
				1794	Py_UNICODE *p;
				1795
				1796	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1797	if (size == 1 && (unsigned char)s < 128) {
				1798	Py_UNICODE r = (unsigned char)s;
				1799	return PyUnicode_FromUnicode(&r, 1);
				1800	}
				1801
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1802	v = _PyUnicode_New(size);
				1803	if (v == NULL)
				1804	goto onError;
				1805	if (size == 0)
				1806	return (PyObject *)v;
				1807	p = PyUnicode_AS_UNICODE(v);
				1808	while (size-- > 0) {
				1809	register unsigned char c;
				1810
				1811	c = (unsigned char)*s++;
				1812	if (c < 128)
				1813	*p++ = c;
				1814	else if (ascii_decoding_error(&s, &p, errors,
				1815	"ordinal not in range(128)"))
				1816	goto onError;
				1817	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1818	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1819	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1820	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1821	return (PyObject *)v;
				1822
				1823	onError:
				1824	Py_XDECREF(v);
				1825	return NULL;
				1826	}
				1827
				1828	static
				1829	int ascii_encoding_error(const Py_UNICODE **source,
				1830	char **dest,
				1831	const char *errors,
				1832	const char *details)
				1833	{
				1834	if ((errors == NULL) \|\|
				1835	(strcmp(errors,"strict") == 0)) {
				1836	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1837	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1838	details);
				1839	return -1;
				1840	}
				1841	else if (strcmp(errors,"ignore") == 0) {
				1842	return 0;
				1843	}
				1844	else if (strcmp(errors,"replace") == 0) {
				1845	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1846	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1847	return 0;
				1848	}
				1849	else {
				1850	PyErr_Format(PyExc_ValueError,
				1851	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1852	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1853	errors);
				1854	return -1;
				1855	}
				1856	}
				1857
				1858	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1859	int size,
				1860	const char *errors)
				1861	{
				1862	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1863	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1864
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1865	repr = PyString_FromStringAndSize(NULL, size);
				1866	if (repr == NULL)
				1867	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1868	if (size == 0)
				1869	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1870
				1871	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1872	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1873	while (size-- > 0) {
				1874	Py_UNICODE ch = *p++;
				1875	if (ch >= 128) {
				1876	if (ascii_encoding_error(&p, &s, errors,
				1877	"ordinal not in range(128)"))
				1878	goto onError;
				1879	}
				1880	else
				1881	*s++ = (char)ch;
				1882	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1883	/* Resize if error handling skipped some characters */
				1884	if (s - start < PyString_GET_SIZE(repr))
				1885	if (_PyString_Resize(&repr, s - start))
				1886	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1887	return repr;
				1888
				1889	onError:
				1890	Py_DECREF(repr);
				1891	return NULL;
				1892	}
				1893
				1894	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1895	{
				1896	if (!PyUnicode_Check(unicode)) {
				1897	PyErr_BadArgument();
				1898	return NULL;
				1899	}
				1900	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1901	PyUnicode_GET_SIZE(unicode),
				1902	NULL);
				1903	}
				1904
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1905	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1906
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1907	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1908
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1909	PyObject PyUnicode_DecodeMBCS(const char s,
				1910	int size,
				1911	const char *errors)
				1912	{
				1913	PyUnicodeObject *v;
				1914	Py_UNICODE *p;
				1915
				1916	/* First get the size of the result */
				1917	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1918	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1919	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1920
				1921	v = _PyUnicode_New(usize);
				1922	if (v == NULL)
				1923	return NULL;
				1924	if (usize == 0)
				1925	return (PyObject *)v;
				1926	p = PyUnicode_AS_UNICODE(v);
				1927	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1928	Py_DECREF(v);
				1929	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1930	}
				1931
				1932	return (PyObject *)v;
				1933	}
				1934
				1935	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1936	int size,
				1937	const char *errors)
				1938	{
				1939	PyObject *repr;
				1940	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1941	DWORD mbcssize;
				1942
				1943	/* If there are no characters, bail now! */
				1944	if (size==0)
				1945	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1946
				1947	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1948	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1949	if (mbcssize==0)
				1950	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1951
				1952	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1953	if (repr == NULL)
				1954	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1955	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1956	return repr;
				1957
				1958	/* Do the conversion */
				1959	s = PyString_AS_STRING(repr);
				1960	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				1961	Py_DECREF(repr);
				1962	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1963	}
				1964	return repr;
				1965	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1966
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1967	#endif /* MS_WIN32 */
				1968
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1969	/* --- Character Mapping Codec -------------------------------------------- */
				1970
				1971	static
				1972	int charmap_decoding_error(const char **source,
				1973	Py_UNICODE **dest,
				1974	const char *errors,
				1975	const char *details)
				1976	{
				1977	if ((errors == NULL) \|\|
				1978	(strcmp(errors,"strict") == 0)) {
				1979	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1980	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1981	details);
				1982	return -1;
				1983	}
				1984	else if (strcmp(errors,"ignore") == 0) {
				1985	return 0;
				1986	}
				1987	else if (strcmp(errors,"replace") == 0) {
				1988	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1989	(*dest)++;
				1990	return 0;
				1991	}
				1992	else {
				1993	PyErr_Format(PyExc_ValueError,
				1994	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1995	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1996	errors);
				1997	return -1;
				1998	}
				1999	}
				2000
				2001	PyObject PyUnicode_DecodeCharmap(const char s,
				2002	int size,
				2003	PyObject *mapping,
				2004	const char *errors)
				2005	{
				2006	PyUnicodeObject *v;
				2007	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2008	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2009
				2010	/* Default to Latin-1 */
				2011	if (mapping == NULL)
				2012	return PyUnicode_DecodeLatin1(s, size, errors);
				2013
				2014	v = _PyUnicode_New(size);
				2015	if (v == NULL)
				2016	goto onError;
				2017	if (size == 0)
				2018	return (PyObject *)v;
				2019	p = PyUnicode_AS_UNICODE(v);
				2020	while (size-- > 0) {
				2021	unsigned char ch = *s++;
				2022	PyObject w, x;
				2023
				2024	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2025	w = PyInt_FromLong((long)ch);
				2026	if (w == NULL)
				2027	goto onError;
				2028	x = PyObject_GetItem(mapping, w);
				2029	Py_DECREF(w);
				2030	if (x == NULL) {
				2031	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2032	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2033	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2034	x = Py_None;
				2035	Py_INCREF(x);
				2036	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2037	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2038	}
				2039
				2040	/* Apply mapping */
				2041	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2042	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2043	if (value < 0 \|\| value > 65535) {
				2044	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2045	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2046	Py_DECREF(x);
				2047	goto onError;
				2048	}
				2049	*p++ = (Py_UNICODE)value;
				2050	}
				2051	else if (x == Py_None) {
				2052	/* undefined mapping */
				2053	if (charmap_decoding_error(&s, &p, errors,
				2054	"character maps to <undefined>")) {
				2055	Py_DECREF(x);
				2056	goto onError;
				2057	}
				2058	}
				2059	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2060	int targetsize = PyUnicode_GET_SIZE(x);
				2061
				2062	if (targetsize == 1)
				2063	/* 1-1 mapping */
				2064	p++ = PyUnicode_AS_UNICODE(x);
				2065
				2066	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2067	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2068	if (targetsize > extrachars) {
				2069	/* resize first */
				2070	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2071	int needed = (targetsize - extrachars) + \
				2072	(targetsize << 2);
				2073	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2074	if (_PyUnicode_Resize(&v,
				2075	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2076	Py_DECREF(x);
				2077	goto onError;
				2078	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2079	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2080	}
				2081	Py_UNICODE_COPY(p,
				2082	PyUnicode_AS_UNICODE(x),
				2083	targetsize);
				2084	p += targetsize;
				2085	extrachars -= targetsize;
				2086	}
				2087	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2088	}
				2089	else {
				2090	/* wrong return value */
				2091	PyErr_SetString(PyExc_TypeError,
				2092	"character mapping must return integer, None or unicode");
				2093	Py_DECREF(x);
				2094	goto onError;
				2095	}
				2096	Py_DECREF(x);
				2097	}
				2098	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2099	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2100	goto onError;
				2101	return (PyObject *)v;
				2102
				2103	onError:
				2104	Py_XDECREF(v);
				2105	return NULL;
				2106	}
				2107
				2108	static
				2109	int charmap_encoding_error(const Py_UNICODE **source,
				2110	char **dest,
				2111	const char *errors,
				2112	const char *details)
				2113	{
				2114	if ((errors == NULL) \|\|
				2115	(strcmp(errors,"strict") == 0)) {
				2116	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2117	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2118	details);
				2119	return -1;
				2120	}
				2121	else if (strcmp(errors,"ignore") == 0) {
				2122	return 0;
				2123	}
				2124	else if (strcmp(errors,"replace") == 0) {
				2125	**dest = '?';
				2126	(*dest)++;
				2127	return 0;
				2128	}
				2129	else {
				2130	PyErr_Format(PyExc_ValueError,
				2131	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2132	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2133	errors);
				2134	return -1;
				2135	}
				2136	}
				2137
				2138	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2139	int size,
				2140	PyObject *mapping,
				2141	const char *errors)
				2142	{
				2143	PyObject *v;
				2144	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2145	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2146
				2147	/* Default to Latin-1 */
				2148	if (mapping == NULL)
				2149	return PyUnicode_EncodeLatin1(p, size, errors);
				2150
				2151	v = PyString_FromStringAndSize(NULL, size);
				2152	if (v == NULL)
				2153	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2154	if (size == 0)
				2155	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2156	s = PyString_AS_STRING(v);
				2157	while (size-- > 0) {
				2158	Py_UNICODE ch = *p++;
				2159	PyObject w, x;
				2160
				2161	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2162	w = PyInt_FromLong((long)ch);
				2163	if (w == NULL)
				2164	goto onError;
				2165	x = PyObject_GetItem(mapping, w);
				2166	Py_DECREF(w);
				2167	if (x == NULL) {
				2168	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2169	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2170	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2171	x = Py_None;
				2172	Py_INCREF(x);
				2173	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2174	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2175	}
				2176
				2177	/* Apply mapping */
				2178	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2179	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2180	if (value < 0 \|\| value > 255) {
				2181	PyErr_SetString(PyExc_TypeError,
				2182	"character mapping must be in range(256)");
				2183	Py_DECREF(x);
				2184	goto onError;
				2185	}
				2186	*s++ = (char)value;
				2187	}
				2188	else if (x == Py_None) {
				2189	/* undefined mapping */
				2190	if (charmap_encoding_error(&p, &s, errors,
				2191	"character maps to <undefined>")) {
				2192	Py_DECREF(x);
				2193	goto onError;
				2194	}
				2195	}
				2196	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2197	int targetsize = PyString_GET_SIZE(x);
				2198
				2199	if (targetsize == 1)
				2200	/* 1-1 mapping */
				2201	s++ = PyString_AS_STRING(x);
				2202
				2203	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2204	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2205	if (targetsize > extrachars) {
				2206	/* resize first */
				2207	int oldpos = (int)(s - PyString_AS_STRING(v));
				2208	int needed = (targetsize - extrachars) + \
				2209	(targetsize << 2);
				2210	extrachars += needed;
				2211	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2212	Py_DECREF(x);
				2213	goto onError;
				2214	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2215	s = PyString_AS_STRING(v) + oldpos;
				2216	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2217	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2218	s += targetsize;
				2219	extrachars -= targetsize;
				2220	}
				2221	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2222	}
				2223	else {
				2224	/* wrong return value */
				2225	PyErr_SetString(PyExc_TypeError,
				2226	"character mapping must return integer, None or unicode");
				2227	Py_DECREF(x);
				2228	goto onError;
				2229	}
				2230	Py_DECREF(x);
				2231	}
				2232	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2233	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2234	goto onError;
				2235	return v;
				2236
				2237	onError:
				2238	Py_DECREF(v);
				2239	return NULL;
				2240	}
				2241
				2242	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2243	PyObject *mapping)
				2244	{
				2245	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2246	PyErr_BadArgument();
				2247	return NULL;
				2248	}
				2249	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2250	PyUnicode_GET_SIZE(unicode),
				2251	mapping,
				2252	NULL);
				2253	}
				2254
				2255	static
				2256	int translate_error(const Py_UNICODE **source,
				2257	Py_UNICODE **dest,
				2258	const char *errors,
				2259	const char *details)
				2260	{
				2261	if ((errors == NULL) \|\|
				2262	(strcmp(errors,"strict") == 0)) {
				2263	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2264	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2265	details);
				2266	return -1;
				2267	}
				2268	else if (strcmp(errors,"ignore") == 0) {
				2269	return 0;
				2270	}
				2271	else if (strcmp(errors,"replace") == 0) {
				2272	**dest = '?';
				2273	(*dest)++;
				2274	return 0;
				2275	}
				2276	else {
				2277	PyErr_Format(PyExc_ValueError,
				2278	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2279	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2280	errors);
				2281	return -1;
				2282	}
				2283	}
				2284
				2285	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2286	int size,
				2287	PyObject *mapping,
				2288	const char *errors)
				2289	{
				2290	PyUnicodeObject *v;
				2291	Py_UNICODE *p;
				2292
				2293	if (mapping == NULL) {
				2294	PyErr_BadArgument();
				2295	return NULL;
				2296	}
				2297
				2298	/* Output will never be longer than input */
				2299	v = _PyUnicode_New(size);
				2300	if (v == NULL)
				2301	goto onError;
				2302	if (size == 0)
				2303	goto done;
				2304	p = PyUnicode_AS_UNICODE(v);
				2305	while (size-- > 0) {
				2306	Py_UNICODE ch = *s++;
				2307	PyObject w, x;
				2308
				2309	/* Get mapping */
				2310	w = PyInt_FromLong(ch);
				2311	if (w == NULL)
				2312	goto onError;
				2313	x = PyObject_GetItem(mapping, w);
				2314	Py_DECREF(w);
				2315	if (x == NULL) {
				2316	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2317	/* No mapping found: default to 1-1 mapping */
				2318	PyErr_Clear();
				2319	*p++ = ch;
				2320	continue;
				2321	}
				2322	goto onError;
				2323	}
				2324
				2325	/* Apply mapping */
				2326	if (PyInt_Check(x))
				2327	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2328	else if (x == Py_None) {
				2329	/* undefined mapping */
				2330	if (translate_error(&s, &p, errors,
				2331	"character maps to <undefined>")) {
				2332	Py_DECREF(x);
				2333	goto onError;
				2334	}
				2335	}
				2336	else if (PyUnicode_Check(x)) {
				2337	if (PyUnicode_GET_SIZE(x) != 1) {
				2338	/* 1-n mapping */
				2339	PyErr_SetString(PyExc_NotImplementedError,
				2340	"1-n mappings are currently not implemented");
				2341	Py_DECREF(x);
				2342	goto onError;
				2343	}
				2344	p++ = PyUnicode_AS_UNICODE(x);
				2345	}
				2346	else {
				2347	/* wrong return value */
				2348	PyErr_SetString(PyExc_TypeError,
				2349	"translate mapping must return integer, None or unicode");
				2350	Py_DECREF(x);
				2351	goto onError;
				2352	}
				2353	Py_DECREF(x);
				2354	}
				2355	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2356	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2357	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2358
				2359	done:
				2360	return (PyObject *)v;
				2361
				2362	onError:
				2363	Py_XDECREF(v);
				2364	return NULL;
				2365	}
				2366
				2367	PyObject PyUnicode_Translate(PyObject str,
				2368	PyObject *mapping,
				2369	const char *errors)
				2370	{
				2371	PyObject *result;
				2372
				2373	str = PyUnicode_FromObject(str);
				2374	if (str == NULL)
				2375	goto onError;
				2376	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2377	PyUnicode_GET_SIZE(str),
				2378	mapping,
				2379	errors);
				2380	Py_DECREF(str);
				2381	return result;
				2382
				2383	onError:
				2384	Py_XDECREF(str);
				2385	return NULL;
				2386	}
				2387
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2388	/* --- Decimal Encoder ---------------------------------------------------- */
				2389
				2390	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2391	int length,
				2392	char *output,
				2393	const char *errors)
				2394	{
				2395	Py_UNICODE p, end;
				2396
				2397	if (output == NULL) {
				2398	PyErr_BadArgument();
				2399	return -1;
				2400	}
				2401
				2402	p = s;
				2403	end = s + length;
				2404	while (p < end) {
				2405	register Py_UNICODE ch = *p++;
				2406	int decimal;
				2407
				2408	if (Py_UNICODE_ISSPACE(ch)) {
				2409	*output++ = ' ';
				2410	continue;
				2411	}
				2412	decimal = Py_UNICODE_TODECIMAL(ch);
				2413	if (decimal >= 0) {
				2414	*output++ = '0' + decimal;
				2415	continue;
				2416	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2417	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2418	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2419	continue;
				2420	}
				2421	/* All other characters are considered invalid */
				2422	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2423	PyErr_SetString(PyExc_ValueError,
				2424	"invalid decimal Unicode string");
				2425	goto onError;
				2426	}
				2427	else if (strcmp(errors, "ignore") == 0)
				2428	continue;
				2429	else if (strcmp(errors, "replace") == 0) {
				2430	*output++ = '?';
				2431	continue;
				2432	}
				2433	}
				2434	/* 0-terminate the output string */
				2435	*output++ = '\0';
				2436	return 0;
				2437
				2438	onError:
				2439	return -1;
				2440	}
				2441
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2442	/* --- Helpers ------------------------------------------------------------ */
				2443
				2444	static
				2445	int count(PyUnicodeObject *self,
				2446	int start,
				2447	int end,
				2448	PyUnicodeObject *substring)
				2449	{
				2450	int count = 0;
				2451
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2452	if (start < 0)
				2453	start += self->length;
				2454	if (start < 0)
				2455	start = 0;
				2456	if (end > self->length)
				2457	end = self->length;
				2458	if (end < 0)
				2459	end += self->length;
				2460	if (end < 0)
				2461	end = 0;
				2462
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2463	if (substring->length == 0)
				2464	return (end - start + 1);
				2465
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2466	end -= substring->length;
				2467
				2468	while (start <= end)
				2469	if (Py_UNICODE_MATCH(self, start, substring)) {
				2470	count++;
				2471	start += substring->length;
				2472	} else
				2473	start++;
				2474
				2475	return count;
				2476	}
				2477
				2478	int PyUnicode_Count(PyObject *str,
				2479	PyObject *substr,
				2480	int start,
				2481	int end)
				2482	{
				2483	int result;
				2484
				2485	str = PyUnicode_FromObject(str);
				2486	if (str == NULL)
				2487	return -1;
				2488	substr = PyUnicode_FromObject(substr);
				2489	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2490	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2491	return -1;
				2492	}
				2493
				2494	result = count((PyUnicodeObject *)str,
				2495	start, end,
				2496	(PyUnicodeObject *)substr);
				2497
				2498	Py_DECREF(str);
				2499	Py_DECREF(substr);
				2500	return result;
				2501	}
				2502
				2503	static
				2504	int findstring(PyUnicodeObject *self,
				2505	PyUnicodeObject *substring,
				2506	int start,
				2507	int end,
				2508	int direction)
				2509	{
				2510	if (start < 0)
				2511	start += self->length;
				2512	if (start < 0)
				2513	start = 0;
				2514
				2515	if (substring->length == 0)
				2516	return start;
				2517
				2518	if (end > self->length)
				2519	end = self->length;
				2520	if (end < 0)
				2521	end += self->length;
				2522	if (end < 0)
				2523	end = 0;
				2524
				2525	end -= substring->length;
				2526
				2527	if (direction < 0) {
				2528	for (; end >= start; end--)
				2529	if (Py_UNICODE_MATCH(self, end, substring))
				2530	return end;
				2531	} else {
				2532	for (; start <= end; start++)
				2533	if (Py_UNICODE_MATCH(self, start, substring))
				2534	return start;
				2535	}
				2536
				2537	return -1;
				2538	}
				2539
				2540	int PyUnicode_Find(PyObject *str,
				2541	PyObject *substr,
				2542	int start,
				2543	int end,
				2544	int direction)
				2545	{
				2546	int result;
				2547
				2548	str = PyUnicode_FromObject(str);
				2549	if (str == NULL)
				2550	return -1;
				2551	substr = PyUnicode_FromObject(substr);
				2552	if (substr == NULL) {
				2553	Py_DECREF(substr);
				2554	return -1;
				2555	}
				2556
				2557	result = findstring((PyUnicodeObject *)str,
				2558	(PyUnicodeObject *)substr,
				2559	start, end, direction);
				2560	Py_DECREF(str);
				2561	Py_DECREF(substr);
				2562	return result;
				2563	}
				2564
				2565	static
				2566	int tailmatch(PyUnicodeObject *self,
				2567	PyUnicodeObject *substring,
				2568	int start,
				2569	int end,
				2570	int direction)
				2571	{
				2572	if (start < 0)
				2573	start += self->length;
				2574	if (start < 0)
				2575	start = 0;
				2576
				2577	if (substring->length == 0)
				2578	return 1;
				2579
				2580	if (end > self->length)
				2581	end = self->length;
				2582	if (end < 0)
				2583	end += self->length;
				2584	if (end < 0)
				2585	end = 0;
				2586
				2587	end -= substring->length;
				2588	if (end < start)
				2589	return 0;
				2590
				2591	if (direction > 0) {
				2592	if (Py_UNICODE_MATCH(self, end, substring))
				2593	return 1;
				2594	} else {
				2595	if (Py_UNICODE_MATCH(self, start, substring))
				2596	return 1;
				2597	}
				2598
				2599	return 0;
				2600	}
				2601
				2602	int PyUnicode_Tailmatch(PyObject *str,
				2603	PyObject *substr,
				2604	int start,
				2605	int end,
				2606	int direction)
				2607	{
				2608	int result;
				2609
				2610	str = PyUnicode_FromObject(str);
				2611	if (str == NULL)
				2612	return -1;
				2613	substr = PyUnicode_FromObject(substr);
				2614	if (substr == NULL) {
				2615	Py_DECREF(substr);
				2616	return -1;
				2617	}
				2618
				2619	result = tailmatch((PyUnicodeObject *)str,
				2620	(PyUnicodeObject *)substr,
				2621	start, end, direction);
				2622	Py_DECREF(str);
				2623	Py_DECREF(substr);
				2624	return result;
				2625	}
				2626
				2627	static
				2628	const Py_UNICODE findchar(const Py_UNICODE s,
				2629	int size,
				2630	Py_UNICODE ch)
				2631	{
				2632	/* like wcschr, but doesn't stop at NULL characters */
				2633
				2634	while (size-- > 0) {
				2635	if (*s == ch)
				2636	return s;
				2637	s++;
				2638	}
				2639
				2640	return NULL;
				2641	}
				2642
				2643	/* Apply fixfct filter to the Unicode object self and return a
				2644	reference to the modified object */
				2645
				2646	static
				2647	PyObject fixup(PyUnicodeObject self,
				2648	int (fixfct)(PyUnicodeObject s))
				2649	{
				2650
				2651	PyUnicodeObject *u;
				2652
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2653	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2654	if (u == NULL)
				2655	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2656
				2657	Py_UNICODE_COPY(u->str, self->str, self->length);
				2658
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2659	if (!fixfct(u)) {
				2660	/* fixfct should return TRUE if it modified the buffer. If
				2661	FALSE, return a reference to the original buffer instead
				2662	(to save space, not time) */
				2663	Py_INCREF(self);
				2664	Py_DECREF(u);
				2665	return (PyObject*) self;
				2666	}
				2667	return (PyObject*) u;
				2668	}
				2669
				2670	static
				2671	int fixupper(PyUnicodeObject *self)
				2672	{
				2673	int len = self->length;
				2674	Py_UNICODE *s = self->str;
				2675	int status = 0;
				2676
				2677	while (len-- > 0) {
				2678	register Py_UNICODE ch;
				2679
				2680	ch = Py_UNICODE_TOUPPER(*s);
				2681	if (ch != *s) {
				2682	status = 1;
				2683	*s = ch;
				2684	}
				2685	s++;
				2686	}
				2687
				2688	return status;
				2689	}
				2690
				2691	static
				2692	int fixlower(PyUnicodeObject *self)
				2693	{
				2694	int len = self->length;
				2695	Py_UNICODE *s = self->str;
				2696	int status = 0;
				2697
				2698	while (len-- > 0) {
				2699	register Py_UNICODE ch;
				2700
				2701	ch = Py_UNICODE_TOLOWER(*s);
				2702	if (ch != *s) {
				2703	status = 1;
				2704	*s = ch;
				2705	}
				2706	s++;
				2707	}
				2708
				2709	return status;
				2710	}
				2711
				2712	static
				2713	int fixswapcase(PyUnicodeObject *self)
				2714	{
				2715	int len = self->length;
				2716	Py_UNICODE *s = self->str;
				2717	int status = 0;
				2718
				2719	while (len-- > 0) {
				2720	if (Py_UNICODE_ISUPPER(*s)) {
				2721	s = Py_UNICODE_TOLOWER(s);
				2722	status = 1;
				2723	} else if (Py_UNICODE_ISLOWER(*s)) {
				2724	s = Py_UNICODE_TOUPPER(s);
				2725	status = 1;
				2726	}
				2727	s++;
				2728	}
				2729
				2730	return status;
				2731	}
				2732
				2733	static
				2734	int fixcapitalize(PyUnicodeObject *self)
				2735	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2736	int len = self->length;
				2737	Py_UNICODE *s = self->str;
				2738	int status = 0;
				2739
				2740	if (len == 0)
				2741	return 0;
				2742	if (Py_UNICODE_ISLOWER(*s)) {
				2743	s = Py_UNICODE_TOUPPER(s);
				2744	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2745	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2746	s++;
				2747	while (--len > 0) {
				2748	if (Py_UNICODE_ISUPPER(*s)) {
				2749	s = Py_UNICODE_TOLOWER(s);
				2750	status = 1;
				2751	}
				2752	s++;
				2753	}
				2754	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2755	}
				2756
				2757	static
				2758	int fixtitle(PyUnicodeObject *self)
				2759	{
				2760	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2761	register Py_UNICODE *e;
				2762	int previous_is_cased;
				2763
				2764	/* Shortcut for single character strings */
				2765	if (PyUnicode_GET_SIZE(self) == 1) {
				2766	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2767	if (*p != ch) {
				2768	*p = ch;
				2769	return 1;
				2770	}
				2771	else
				2772	return 0;
				2773	}
				2774
				2775	e = p + PyUnicode_GET_SIZE(self);
				2776	previous_is_cased = 0;
				2777	for (; p < e; p++) {
				2778	register const Py_UNICODE ch = *p;
				2779
				2780	if (previous_is_cased)
				2781	*p = Py_UNICODE_TOLOWER(ch);
				2782	else
				2783	*p = Py_UNICODE_TOTITLE(ch);
				2784
				2785	if (Py_UNICODE_ISLOWER(ch) \|\|
				2786	Py_UNICODE_ISUPPER(ch) \|\|
				2787	Py_UNICODE_ISTITLE(ch))
				2788	previous_is_cased = 1;
				2789	else
				2790	previous_is_cased = 0;
				2791	}
				2792	return 1;
				2793	}
				2794
				2795	PyObject PyUnicode_Join(PyObject separator,
				2796	PyObject *seq)
				2797	{
				2798	Py_UNICODE *sep;
				2799	int seplen;
				2800	PyUnicodeObject *res = NULL;
				2801	int reslen = 0;
				2802	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2803	int sz = 100;
				2804	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2805	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2806
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2807	it = PyObject_GetIter(seq);
				2808	if (it == NULL)
				2809	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2810
				2811	if (separator == NULL) {
				2812	Py_UNICODE blank = ' ';
				2813	sep = &blank;
				2814	seplen = 1;
				2815	}
				2816	else {
				2817	separator = PyUnicode_FromObject(separator);
				2818	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2819	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2820	sep = PyUnicode_AS_UNICODE(separator);
				2821	seplen = PyUnicode_GET_SIZE(separator);
				2822	}
				2823
				2824	res = _PyUnicode_New(sz);
				2825	if (res == NULL)
				2826	goto onError;
				2827	p = PyUnicode_AS_UNICODE(res);
				2828	reslen = 0;
				2829
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2830	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2831	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2832	PyObject *item = PyIter_Next(it);
				2833	if (item == NULL) {
				2834	if (PyErr_Occurred())
				2835	goto onError;
				2836	break;
				2837	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2838	if (!PyUnicode_Check(item)) {
				2839	PyObject *v;
				2840	v = PyUnicode_FromObject(item);
				2841	Py_DECREF(item);
				2842	item = v;
				2843	if (item == NULL)
				2844	goto onError;
				2845	}
				2846	itemlen = PyUnicode_GET_SIZE(item);
				2847	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2848	if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2849	goto onError;
				2850	sz *= 2;
				2851	p = PyUnicode_AS_UNICODE(res) + reslen;
				2852	}
				2853	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2854	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2855	p += seplen;
				2856	reslen += seplen;
				2857	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2858	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2859	p += itemlen;
				2860	reslen += itemlen;
				2861	Py_DECREF(item);
				2862	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2863	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2864	goto onError;
				2865
				2866	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2867	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2868	return (PyObject *)res;
				2869
				2870	onError:
				2871	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2872	Py_XDECREF(res);
				2873	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2874	return NULL;
				2875	}
				2876
				2877	static
				2878	PyUnicodeObject pad(PyUnicodeObject self,
				2879	int left,
				2880	int right,
				2881	Py_UNICODE fill)
				2882	{
				2883	PyUnicodeObject *u;
				2884
				2885	if (left < 0)
				2886	left = 0;
				2887	if (right < 0)
				2888	right = 0;
				2889
				2890	if (left == 0 && right == 0) {
				2891	Py_INCREF(self);
				2892	return self;
				2893	}
				2894
				2895	u = _PyUnicode_New(left + self->length + right);
				2896	if (u) {
				2897	if (left)
				2898	Py_UNICODE_FILL(u->str, fill, left);
				2899	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2900	if (right)
				2901	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2902	}
				2903
				2904	return u;
				2905	}
				2906
				2907	#define SPLIT_APPEND(data, left, right) \
				2908	str = PyUnicode_FromUnicode(data + left, right - left); \
				2909	if (!str) \
				2910	goto onError; \
				2911	if (PyList_Append(list, str)) { \
				2912	Py_DECREF(str); \
				2913	goto onError; \
				2914	} \
				2915	else \
				2916	Py_DECREF(str);
				2917
				2918	static
				2919	PyObject split_whitespace(PyUnicodeObject self,
				2920	PyObject *list,
				2921	int maxcount)
				2922	{
				2923	register int i;
				2924	register int j;
				2925	int len = self->length;
				2926	PyObject *str;
				2927
				2928	for (i = j = 0; i < len; ) {
				2929	/* find a token */
				2930	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2931	i++;
				2932	j = i;
				2933	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2934	i++;
				2935	if (j < i) {
				2936	if (maxcount-- <= 0)
				2937	break;
				2938	SPLIT_APPEND(self->str, j, i);
				2939	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2940	i++;
				2941	j = i;
				2942	}
				2943	}
				2944	if (j < len) {
				2945	SPLIT_APPEND(self->str, j, len);
				2946	}
				2947	return list;
				2948
				2949	onError:
				2950	Py_DECREF(list);
				2951	return NULL;
				2952	}
				2953
				2954	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2955	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2956	{
				2957	register int i;
				2958	register int j;
				2959	int len;
				2960	PyObject *list;
				2961	PyObject *str;
				2962	Py_UNICODE *data;
				2963
				2964	string = PyUnicode_FromObject(string);
				2965	if (string == NULL)
				2966	return NULL;
				2967	data = PyUnicode_AS_UNICODE(string);
				2968	len = PyUnicode_GET_SIZE(string);
				2969
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2970	list = PyList_New(0);
				2971	if (!list)
				2972	goto onError;
				2973
				2974	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2975	int eol;
				2976
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2977	/* Find a line and append it */
				2978	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				2979	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2980
				2981	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2982	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2983	if (i < len) {
				2984	if (data[i] == '\r' && i + 1 < len &&
				2985	data[i+1] == '\n')
				2986	i += 2;
				2987	else
				2988	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2989	if (keepends)
				2990	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2991	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2992	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2993	j = i;
				2994	}
				2995	if (j < len) {
				2996	SPLIT_APPEND(data, j, len);
				2997	}
				2998
				2999	Py_DECREF(string);
				3000	return list;
				3001
				3002	onError:
				3003	Py_DECREF(list);
				3004	Py_DECREF(string);
				3005	return NULL;
				3006	}
				3007
				3008	static
				3009	PyObject split_char(PyUnicodeObject self,
				3010	PyObject *list,
				3011	Py_UNICODE ch,
				3012	int maxcount)
				3013	{
				3014	register int i;
				3015	register int j;
				3016	int len = self->length;
				3017	PyObject *str;
				3018
				3019	for (i = j = 0; i < len; ) {
				3020	if (self->str[i] == ch) {
				3021	if (maxcount-- <= 0)
				3022	break;
				3023	SPLIT_APPEND(self->str, j, i);
				3024	i = j = i + 1;
				3025	} else
				3026	i++;
				3027	}
				3028	if (j <= len) {
				3029	SPLIT_APPEND(self->str, j, len);
				3030	}
				3031	return list;
				3032
				3033	onError:
				3034	Py_DECREF(list);
				3035	return NULL;
				3036	}
				3037
				3038	static
				3039	PyObject split_substring(PyUnicodeObject self,
				3040	PyObject *list,
				3041	PyUnicodeObject *substring,
				3042	int maxcount)
				3043	{
				3044	register int i;
				3045	register int j;
				3046	int len = self->length;
				3047	int sublen = substring->length;
				3048	PyObject *str;
				3049
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3050	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3051	if (Py_UNICODE_MATCH(self, i, substring)) {
				3052	if (maxcount-- <= 0)
				3053	break;
				3054	SPLIT_APPEND(self->str, j, i);
				3055	i = j = i + sublen;
				3056	} else
				3057	i++;
				3058	}
				3059	if (j <= len) {
				3060	SPLIT_APPEND(self->str, j, len);
				3061	}
				3062	return list;
				3063
				3064	onError:
				3065	Py_DECREF(list);
				3066	return NULL;
				3067	}
				3068
				3069	#undef SPLIT_APPEND
				3070
				3071	static
				3072	PyObject split(PyUnicodeObject self,
				3073	PyUnicodeObject *substring,
				3074	int maxcount)
				3075	{
				3076	PyObject *list;
				3077
				3078	if (maxcount < 0)
				3079	maxcount = INT_MAX;
				3080
				3081	list = PyList_New(0);
				3082	if (!list)
				3083	return NULL;
				3084
				3085	if (substring == NULL)
				3086	return split_whitespace(self,list,maxcount);
				3087
				3088	else if (substring->length == 1)
				3089	return split_char(self,list,substring->str[0],maxcount);
				3090
				3091	else if (substring->length == 0) {
				3092	Py_DECREF(list);
				3093	PyErr_SetString(PyExc_ValueError, "empty separator");
				3094	return NULL;
				3095	}
				3096	else
				3097	return split_substring(self,list,substring,maxcount);
				3098	}
				3099
				3100	static
				3101	PyObject strip(PyUnicodeObject self,
				3102	int left,
				3103	int right)
				3104	{
				3105	Py_UNICODE *p = self->str;
				3106	int start = 0;
				3107	int end = self->length;
				3108
				3109	if (left)
				3110	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3111	start++;
				3112
				3113	if (right)
				3114	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3115	end--;
				3116
				3117	if (start == 0 && end == self->length) {
				3118	/* couldn't strip anything off, return original string */
				3119	Py_INCREF(self);
				3120	return (PyObject*) self;
				3121	}
				3122
				3123	return (PyObject*) PyUnicode_FromUnicode(
				3124	self->str + start,
				3125	end - start
				3126	);
				3127	}
				3128
				3129	static
				3130	PyObject replace(PyUnicodeObject self,
				3131	PyUnicodeObject *str1,
				3132	PyUnicodeObject *str2,
				3133	int maxcount)
				3134	{
				3135	PyUnicodeObject *u;
				3136
				3137	if (maxcount < 0)
				3138	maxcount = INT_MAX;
				3139
				3140	if (str1->length == 1 && str2->length == 1) {
				3141	int i;
				3142
				3143	/* replace characters */
				3144	if (!findchar(self->str, self->length, str1->str[0])) {
				3145	/* nothing to replace, return original string */
				3146	Py_INCREF(self);
				3147	u = self;
				3148	} else {
				3149	Py_UNICODE u1 = str1->str[0];
				3150	Py_UNICODE u2 = str2->str[0];
				3151
				3152	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3153	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3154	self->length
				3155	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3156	if (u != NULL) {
				3157	Py_UNICODE_COPY(u->str, self->str,
				3158	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3159	for (i = 0; i < u->length; i++)
				3160	if (u->str[i] == u1) {
				3161	if (--maxcount < 0)
				3162	break;
				3163	u->str[i] = u2;
				3164	}
				3165	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3166	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3167
				3168	} else {
				3169	int n, i;
				3170	Py_UNICODE *p;
				3171
				3172	/* replace strings */
				3173	n = count(self, 0, self->length, str1);
				3174	if (n > maxcount)
				3175	n = maxcount;
				3176	if (n == 0) {
				3177	/* nothing to replace, return original string */
				3178	Py_INCREF(self);
				3179	u = self;
				3180	} else {
				3181	u = _PyUnicode_New(
				3182	self->length + n * (str2->length - str1->length));
				3183	if (u) {
				3184	i = 0;
				3185	p = u->str;
				3186	while (i <= self->length - str1->length)
				3187	if (Py_UNICODE_MATCH(self, i, str1)) {
				3188	/* replace string segment */
				3189	Py_UNICODE_COPY(p, str2->str, str2->length);
				3190	p += str2->length;
				3191	i += str1->length;
				3192	if (--n <= 0) {
				3193	/* copy remaining part */
				3194	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3195	break;
				3196	}
				3197	} else
				3198	*p++ = self->str[i++];
				3199	}
				3200	}
				3201	}
				3202
				3203	return (PyObject *) u;
				3204	}
				3205
				3206	/* --- Unicode Object Methods --------------------------------------------- */
				3207
				3208	static char title__doc__[] =
				3209	"S.title() -> unicode\n\
				3210	\n\
				3211	Return a titlecased version of S, i.e. words start with title case\n\
				3212	characters, all remaining cased characters have lower case.";
				3213
				3214	static PyObject*
				3215	unicode_title(PyUnicodeObject self, PyObject args)
				3216	{
				3217	if (!PyArg_NoArgs(args))
				3218	return NULL;
				3219	return fixup(self, fixtitle);
				3220	}
				3221
				3222	static char capitalize__doc__[] =
				3223	"S.capitalize() -> unicode\n\
				3224	\n\
				3225	Return a capitalized version of S, i.e. make the first character\n\
				3226	have upper case.";
				3227
				3228	static PyObject*
				3229	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3230	{
				3231	if (!PyArg_NoArgs(args))
				3232	return NULL;
				3233	return fixup(self, fixcapitalize);
				3234	}
				3235
				3236	#if 0
				3237	static char capwords__doc__[] =
				3238	"S.capwords() -> unicode\n\
				3239	\n\
				3240	Apply .capitalize() to all words in S and return the result with\n\
				3241	normalized whitespace (all whitespace strings are replaced by ' ').";
				3242
				3243	static PyObject*
				3244	unicode_capwords(PyUnicodeObject self, PyObject args)
				3245	{
				3246	PyObject *list;
				3247	PyObject *item;
				3248	int i;
				3249
				3250	if (!PyArg_NoArgs(args))
				3251	return NULL;
				3252
				3253	/* Split into words */
				3254	list = split(self, NULL, -1);
				3255	if (!list)
				3256	return NULL;
				3257
				3258	/* Capitalize each word */
				3259	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3260	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3261	fixcapitalize);
				3262	if (item == NULL)
				3263	goto onError;
				3264	Py_DECREF(PyList_GET_ITEM(list, i));
				3265	PyList_SET_ITEM(list, i, item);
				3266	}
				3267
				3268	/* Join the words to form a new string */
				3269	item = PyUnicode_Join(NULL, list);
				3270
				3271	onError:
				3272	Py_DECREF(list);
				3273	return (PyObject *)item;
				3274	}
				3275	#endif
				3276
				3277	static char center__doc__[] =
				3278	"S.center(width) -> unicode\n\
				3279	\n\
				3280	Return S centered in a Unicode string of length width. Padding is done\n\
				3281	using spaces.";
				3282
				3283	static PyObject *
				3284	unicode_center(PyUnicodeObject self, PyObject args)
				3285	{
				3286	int marg, left;
				3287	int width;
				3288
				3289	if (!PyArg_ParseTuple(args, "i:center", &width))
				3290	return NULL;
				3291
				3292	if (self->length >= width) {
				3293	Py_INCREF(self);
				3294	return (PyObject*) self;
				3295	}
				3296
				3297	marg = width - self->length;
				3298	left = marg / 2 + (marg & width & 1);
				3299
				3300	return (PyObject*) pad(self, left, marg - left, ' ');
				3301	}
				3302
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3303	#if 0
				3304
				3305	/* This code should go into some future Unicode collation support
				3306	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3307	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3308
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3309	/* speedy UTF-16 code point order comparison */
				3310	/* gleaned from: */
				3311	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3312
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3313	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3314	{
				3315	0, 0, 0, 0, 0, 0, 0, 0,
				3316	0, 0, 0, 0, 0, 0, 0, 0,
				3317	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3318	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3319	};
				3320
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3321	static int
				3322	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3323	{
				3324	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3325
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3326	Py_UNICODE *s1 = str1->str;
				3327	Py_UNICODE *s2 = str2->str;
				3328
				3329	len1 = str1->length;
				3330	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3331
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3332	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3333	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3334
				3335	c1 = *s1++;
				3336	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3337
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3338	if (c1 > (1<<11) * 26)
				3339	c1 += utf16Fixup[c1>>11];
				3340	if (c2 > (1<<11) * 26)
				3341	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3342	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3343
				3344	if (c1 != c2)
				3345	return (c1 < c2) ? -1 : 1;
				3346
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3347	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3348	}
				3349
				3350	return (len1 < len2) ? -1 : (len1 != len2);
				3351	}
				3352
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3353	#else
				3354
				3355	static int
				3356	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3357	{
				3358	register int len1, len2;
				3359
				3360	Py_UNICODE *s1 = str1->str;
				3361	Py_UNICODE *s2 = str2->str;
				3362
				3363	len1 = str1->length;
				3364	len2 = str2->length;
				3365
				3366	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3367	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3368
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3369	c1 = *s1++;
				3370	c2 = *s2++;
				3371
				3372	if (c1 != c2)
				3373	return (c1 < c2) ? -1 : 1;
				3374
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3375	len1--; len2--;
				3376	}
				3377
				3378	return (len1 < len2) ? -1 : (len1 != len2);
				3379	}
				3380
				3381	#endif
				3382
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3383	int PyUnicode_Compare(PyObject *left,
				3384	PyObject *right)
				3385	{
				3386	PyUnicodeObject u = NULL, v = NULL;
				3387	int result;
				3388
				3389	/* Coerce the two arguments */
				3390	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3391	if (u == NULL)
				3392	goto onError;
				3393	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3394	if (v == NULL)
				3395	goto onError;
				3396
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3397	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3398	if (v == u) {
				3399	Py_DECREF(u);
				3400	Py_DECREF(v);
				3401	return 0;
				3402	}
				3403
				3404	result = unicode_compare(u, v);
				3405
				3406	Py_DECREF(u);
				3407	Py_DECREF(v);
				3408	return result;
				3409
				3410	onError:
				3411	Py_XDECREF(u);
				3412	Py_XDECREF(v);
				3413	return -1;
				3414	}
				3415
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3416	int PyUnicode_Contains(PyObject *container,
				3417	PyObject *element)
				3418	{
				3419	PyUnicodeObject u = NULL, v = NULL;
				3420	int result;
				3421	register const Py_UNICODE p, e;
				3422	register Py_UNICODE ch;
				3423
				3424	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3425	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3426	if (v == NULL) {
				3427	PyErr_SetString(PyExc_TypeError,
				3428	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3429	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3430	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3431	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3432	if (u == NULL) {
				3433	Py_DECREF(v);
				3434	goto onError;
				3435	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3436
				3437	/* Check v in u */
				3438	if (PyUnicode_GET_SIZE(v) != 1) {
				3439	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3440	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3441	goto onError;
				3442	}
				3443	ch = *PyUnicode_AS_UNICODE(v);
				3444	p = PyUnicode_AS_UNICODE(u);
				3445	e = p + PyUnicode_GET_SIZE(u);
				3446	result = 0;
				3447	while (p < e) {
				3448	if (*p++ == ch) {
				3449	result = 1;
				3450	break;
				3451	}
				3452	}
				3453
				3454	Py_DECREF(u);
				3455	Py_DECREF(v);
				3456	return result;
				3457
				3458	onError:
				3459	Py_XDECREF(u);
				3460	Py_XDECREF(v);
				3461	return -1;
				3462	}
				3463
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3464	/* Concat to string or Unicode object giving a new Unicode object. */
				3465
				3466	PyObject PyUnicode_Concat(PyObject left,
				3467	PyObject *right)
				3468	{
				3469	PyUnicodeObject u = NULL, v = NULL, *w;
				3470
				3471	/* Coerce the two arguments */
				3472	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3473	if (u == NULL)
				3474	goto onError;
				3475	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3476	if (v == NULL)
				3477	goto onError;
				3478
				3479	/* Shortcuts */
				3480	if (v == unicode_empty) {
				3481	Py_DECREF(v);
				3482	return (PyObject *)u;
				3483	}
				3484	if (u == unicode_empty) {
				3485	Py_DECREF(u);
				3486	return (PyObject *)v;
				3487	}
				3488
				3489	/* Concat the two Unicode strings */
				3490	w = _PyUnicode_New(u->length + v->length);
				3491	if (w == NULL)
				3492	goto onError;
				3493	Py_UNICODE_COPY(w->str, u->str, u->length);
				3494	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3495
				3496	Py_DECREF(u);
				3497	Py_DECREF(v);
				3498	return (PyObject *)w;
				3499
				3500	onError:
				3501	Py_XDECREF(u);
				3502	Py_XDECREF(v);
				3503	return NULL;
				3504	}
				3505
				3506	static char count__doc__[] =
				3507	"S.count(sub[, start[, end]]) -> int\n\
				3508	\n\
				3509	Return the number of occurrences of substring sub in Unicode string\n\
				3510	S[start:end]. Optional arguments start and end are\n\
				3511	interpreted as in slice notation.";
				3512
				3513	static PyObject *
				3514	unicode_count(PyUnicodeObject self, PyObject args)
				3515	{
				3516	PyUnicodeObject *substring;
				3517	int start = 0;
				3518	int end = INT_MAX;
				3519	PyObject *result;
				3520
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3521	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3522	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3523	return NULL;
				3524
				3525	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3526	(PyObject *)substring);
				3527	if (substring == NULL)
				3528	return NULL;
				3529
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3530	if (start < 0)
				3531	start += self->length;
				3532	if (start < 0)
				3533	start = 0;
				3534	if (end > self->length)
				3535	end = self->length;
				3536	if (end < 0)
				3537	end += self->length;
				3538	if (end < 0)
				3539	end = 0;
				3540
				3541	result = PyInt_FromLong((long) count(self, start, end, substring));
				3542
				3543	Py_DECREF(substring);
				3544	return result;
				3545	}
				3546
				3547	static char encode__doc__[] =
				3548	"S.encode([encoding[,errors]]) -> string\n\
				3549	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3550	Return an encoded string version of S. Default encoding is the current\n\
				3551	default string encoding. errors may be given to set a different error\n\
				3552	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3553	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3554
				3555	static PyObject *
				3556	unicode_encode(PyUnicodeObject self, PyObject args)
				3557	{
				3558	char *encoding = NULL;
				3559	char *errors = NULL;
				3560	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3561	return NULL;
				3562	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3563	}
				3564
				3565	static char expandtabs__doc__[] =
				3566	"S.expandtabs([tabsize]) -> unicode\n\
				3567	\n\
				3568	Return a copy of S where all tab characters are expanded using spaces.\n\
				3569	If tabsize is not given, a tab size of 8 characters is assumed.";
				3570
				3571	static PyObject*
				3572	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3573	{
				3574	Py_UNICODE *e;
				3575	Py_UNICODE *p;
				3576	Py_UNICODE *q;
				3577	int i, j;
				3578	PyUnicodeObject *u;
				3579	int tabsize = 8;
				3580
				3581	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3582	return NULL;
				3583
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3584	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3585	i = j = 0;
				3586	e = self->str + self->length;
				3587	for (p = self->str; p < e; p++)
				3588	if (*p == '\t') {
				3589	if (tabsize > 0)
				3590	j += tabsize - (j % tabsize);
				3591	}
				3592	else {
				3593	j++;
				3594	if (p == '\n' \|\| p == '\r') {
				3595	i += j;
				3596	j = 0;
				3597	}
				3598	}
				3599
				3600	/* Second pass: create output string and fill it */
				3601	u = _PyUnicode_New(i + j);
				3602	if (!u)
				3603	return NULL;
				3604
				3605	j = 0;
				3606	q = u->str;
				3607
				3608	for (p = self->str; p < e; p++)
				3609	if (*p == '\t') {
				3610	if (tabsize > 0) {
				3611	i = tabsize - (j % tabsize);
				3612	j += i;
				3613	while (i--)
				3614	*q++ = ' ';
				3615	}
				3616	}
				3617	else {
				3618	j++;
				3619	q++ = p;
				3620	if (p == '\n' \|\| p == '\r')
				3621	j = 0;
				3622	}
				3623
				3624	return (PyObject*) u;
				3625	}
				3626
				3627	static char find__doc__[] =
				3628	"S.find(sub [,start [,end]]) -> int\n\
				3629	\n\
				3630	Return the lowest index in S where substring sub is found,\n\
				3631	such that sub is contained within s[start,end]. Optional\n\
				3632	arguments start and end are interpreted as in slice notation.\n\
				3633	\n\
				3634	Return -1 on failure.";
				3635
				3636	static PyObject *
				3637	unicode_find(PyUnicodeObject self, PyObject args)
				3638	{
				3639	PyUnicodeObject *substring;
				3640	int start = 0;
				3641	int end = INT_MAX;
				3642	PyObject *result;
				3643
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3644	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3645	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3646	return NULL;
				3647	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3648	(PyObject *)substring);
				3649	if (substring == NULL)
				3650	return NULL;
				3651
				3652	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3653
				3654	Py_DECREF(substring);
				3655	return result;
				3656	}
				3657
				3658	static PyObject *
				3659	unicode_getitem(PyUnicodeObject *self, int index)
				3660	{
				3661	if (index < 0 \|\| index >= self->length) {
				3662	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3663	return NULL;
				3664	}
				3665
				3666	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3667	}
				3668
				3669	static long
				3670	unicode_hash(PyUnicodeObject *self)
				3671	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3672	/* Since Unicode objects compare equal to their ASCII string
				3673	counterparts, they should use the individual character values
				3674	as basis for their hash value. This is needed to assure that
				3675	strings and Unicode objects behave in the same way as
				3676	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3677
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3678	register int len;
				3679	register Py_UNICODE *p;
				3680	register long x;
				3681
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3682	if (self->hash != -1)
				3683	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3684	len = PyUnicode_GET_SIZE(self);
				3685	p = PyUnicode_AS_UNICODE(self);
				3686	x = *p << 7;
				3687	while (--len >= 0)
				3688	x = (1000003x) ^ p++;
				3689	x ^= PyUnicode_GET_SIZE(self);
				3690	if (x == -1)
				3691	x = -2;
				3692	self->hash = x;
				3693	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3694	}
				3695
				3696	static char index__doc__[] =
				3697	"S.index(sub [,start [,end]]) -> int\n\
				3698	\n\
				3699	Like S.find() but raise ValueError when the substring is not found.";
				3700
				3701	static PyObject *
				3702	unicode_index(PyUnicodeObject self, PyObject args)
				3703	{
				3704	int result;
				3705	PyUnicodeObject *substring;
				3706	int start = 0;
				3707	int end = INT_MAX;
				3708
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3709	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3710	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3711	return NULL;
				3712
				3713	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3714	(PyObject *)substring);
				3715	if (substring == NULL)
				3716	return NULL;
				3717
				3718	result = findstring(self, substring, start, end, 1);
				3719
				3720	Py_DECREF(substring);
				3721	if (result < 0) {
				3722	PyErr_SetString(PyExc_ValueError, "substring not found");
				3723	return NULL;
				3724	}
				3725	return PyInt_FromLong(result);
				3726	}
				3727
				3728	static char islower__doc__[] =
				3729	"S.islower() -> int\n\
				3730	\n\
				3731	Return 1 if all cased characters in S are lowercase and there is\n\
				3732	at least one cased character in S, 0 otherwise.";
				3733
				3734	static PyObject*
				3735	unicode_islower(PyUnicodeObject self, PyObject args)
				3736	{
				3737	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3738	register const Py_UNICODE *e;
				3739	int cased;
				3740
				3741	if (!PyArg_NoArgs(args))
				3742	return NULL;
				3743
				3744	/* Shortcut for single character strings */
				3745	if (PyUnicode_GET_SIZE(self) == 1)
				3746	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3747
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3748	/* Special case for empty strings */
				3749	if (PyString_GET_SIZE(self) == 0)
				3750	return PyInt_FromLong(0);
				3751
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3752	e = p + PyUnicode_GET_SIZE(self);
				3753	cased = 0;
				3754	for (; p < e; p++) {
				3755	register const Py_UNICODE ch = *p;
				3756
				3757	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3758	return PyInt_FromLong(0);
				3759	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3760	cased = 1;
				3761	}
				3762	return PyInt_FromLong(cased);
				3763	}
				3764
				3765	static char isupper__doc__[] =
				3766	"S.isupper() -> int\n\
				3767	\n\
				3768	Return 1 if all cased characters in S are uppercase and there is\n\
				3769	at least one cased character in S, 0 otherwise.";
				3770
				3771	static PyObject*
				3772	unicode_isupper(PyUnicodeObject self, PyObject args)
				3773	{
				3774	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3775	register const Py_UNICODE *e;
				3776	int cased;
				3777
				3778	if (!PyArg_NoArgs(args))
				3779	return NULL;
				3780
				3781	/* Shortcut for single character strings */
				3782	if (PyUnicode_GET_SIZE(self) == 1)
				3783	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3784
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3785	/* Special case for empty strings */
				3786	if (PyString_GET_SIZE(self) == 0)
				3787	return PyInt_FromLong(0);
				3788
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3789	e = p + PyUnicode_GET_SIZE(self);
				3790	cased = 0;
				3791	for (; p < e; p++) {
				3792	register const Py_UNICODE ch = *p;
				3793
				3794	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3795	return PyInt_FromLong(0);
				3796	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3797	cased = 1;
				3798	}
				3799	return PyInt_FromLong(cased);
				3800	}
				3801
				3802	static char istitle__doc__[] =
				3803	"S.istitle() -> int\n\
				3804	\n\
				3805	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3806	may only follow uncased characters and lowercase characters only cased\n\
				3807	ones. Return 0 otherwise.";
				3808
				3809	static PyObject*
				3810	unicode_istitle(PyUnicodeObject self, PyObject args)
				3811	{
				3812	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3813	register const Py_UNICODE *e;
				3814	int cased, previous_is_cased;
				3815
				3816	if (!PyArg_NoArgs(args))
				3817	return NULL;
				3818
				3819	/* Shortcut for single character strings */
				3820	if (PyUnicode_GET_SIZE(self) == 1)
				3821	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3822	(Py_UNICODE_ISUPPER(*p) != 0));
				3823
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3824	/* Special case for empty strings */
				3825	if (PyString_GET_SIZE(self) == 0)
				3826	return PyInt_FromLong(0);
				3827
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3828	e = p + PyUnicode_GET_SIZE(self);
				3829	cased = 0;
				3830	previous_is_cased = 0;
				3831	for (; p < e; p++) {
				3832	register const Py_UNICODE ch = *p;
				3833
				3834	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3835	if (previous_is_cased)
				3836	return PyInt_FromLong(0);
				3837	previous_is_cased = 1;
				3838	cased = 1;
				3839	}
				3840	else if (Py_UNICODE_ISLOWER(ch)) {
				3841	if (!previous_is_cased)
				3842	return PyInt_FromLong(0);
				3843	previous_is_cased = 1;
				3844	cased = 1;
				3845	}
				3846	else
				3847	previous_is_cased = 0;
				3848	}
				3849	return PyInt_FromLong(cased);
				3850	}
				3851
				3852	static char isspace__doc__[] =
				3853	"S.isspace() -> int\n\
				3854	\n\
				3855	Return 1 if there are only whitespace characters in S,\n\
				3856	0 otherwise.";
				3857
				3858	static PyObject*
				3859	unicode_isspace(PyUnicodeObject self, PyObject args)
				3860	{
				3861	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3862	register const Py_UNICODE *e;
				3863
				3864	if (!PyArg_NoArgs(args))
				3865	return NULL;
				3866
				3867	/* Shortcut for single character strings */
				3868	if (PyUnicode_GET_SIZE(self) == 1 &&
				3869	Py_UNICODE_ISSPACE(*p))
				3870	return PyInt_FromLong(1);
				3871
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3872	/* Special case for empty strings */
				3873	if (PyString_GET_SIZE(self) == 0)
				3874	return PyInt_FromLong(0);
				3875
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3876	e = p + PyUnicode_GET_SIZE(self);
				3877	for (; p < e; p++) {
				3878	if (!Py_UNICODE_ISSPACE(*p))
				3879	return PyInt_FromLong(0);
				3880	}
				3881	return PyInt_FromLong(1);
				3882	}
				3883
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3884	static char isalpha__doc__[] =
				3885	"S.isalpha() -> int\n\
				3886	\n\
				3887	Return 1 if all characters in S are alphabetic\n\
				3888	and there is at least one character in S, 0 otherwise.";
				3889
				3890	static PyObject*
				3891	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3892	{
				3893	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3894	register const Py_UNICODE *e;
				3895
				3896	if (!PyArg_NoArgs(args))
				3897	return NULL;
				3898
				3899	/* Shortcut for single character strings */
				3900	if (PyUnicode_GET_SIZE(self) == 1 &&
				3901	Py_UNICODE_ISALPHA(*p))
				3902	return PyInt_FromLong(1);
				3903
				3904	/* Special case for empty strings */
				3905	if (PyString_GET_SIZE(self) == 0)
				3906	return PyInt_FromLong(0);
				3907
				3908	e = p + PyUnicode_GET_SIZE(self);
				3909	for (; p < e; p++) {
				3910	if (!Py_UNICODE_ISALPHA(*p))
				3911	return PyInt_FromLong(0);
				3912	}
				3913	return PyInt_FromLong(1);
				3914	}
				3915
				3916	static char isalnum__doc__[] =
				3917	"S.isalnum() -> int\n\
				3918	\n\
				3919	Return 1 if all characters in S are alphanumeric\n\
				3920	and there is at least one character in S, 0 otherwise.";
				3921
				3922	static PyObject*
				3923	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3924	{
				3925	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3926	register const Py_UNICODE *e;
				3927
				3928	if (!PyArg_NoArgs(args))
				3929	return NULL;
				3930
				3931	/* Shortcut for single character strings */
				3932	if (PyUnicode_GET_SIZE(self) == 1 &&
				3933	Py_UNICODE_ISALNUM(*p))
				3934	return PyInt_FromLong(1);
				3935
				3936	/* Special case for empty strings */
				3937	if (PyString_GET_SIZE(self) == 0)
				3938	return PyInt_FromLong(0);
				3939
				3940	e = p + PyUnicode_GET_SIZE(self);
				3941	for (; p < e; p++) {
				3942	if (!Py_UNICODE_ISALNUM(*p))
				3943	return PyInt_FromLong(0);
				3944	}
				3945	return PyInt_FromLong(1);
				3946	}
				3947
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3948	static char isdecimal__doc__[] =
				3949	"S.isdecimal() -> int\n\
				3950	\n\
				3951	Return 1 if there are only decimal characters in S,\n\
				3952	0 otherwise.";
				3953
				3954	static PyObject*
				3955	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3956	{
				3957	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3958	register const Py_UNICODE *e;
				3959
				3960	if (!PyArg_NoArgs(args))
				3961	return NULL;
				3962
				3963	/* Shortcut for single character strings */
				3964	if (PyUnicode_GET_SIZE(self) == 1 &&
				3965	Py_UNICODE_ISDECIMAL(*p))
				3966	return PyInt_FromLong(1);
				3967
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3968	/* Special case for empty strings */
				3969	if (PyString_GET_SIZE(self) == 0)
				3970	return PyInt_FromLong(0);
				3971
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3972	e = p + PyUnicode_GET_SIZE(self);
				3973	for (; p < e; p++) {
				3974	if (!Py_UNICODE_ISDECIMAL(*p))
				3975	return PyInt_FromLong(0);
				3976	}
				3977	return PyInt_FromLong(1);
				3978	}
				3979
				3980	static char isdigit__doc__[] =
				3981	"S.isdigit() -> int\n\
				3982	\n\
				3983	Return 1 if there are only digit characters in S,\n\
				3984	0 otherwise.";
				3985
				3986	static PyObject*
				3987	unicode_isdigit(PyUnicodeObject self, PyObject args)
				3988	{
				3989	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3990	register const Py_UNICODE *e;
				3991
				3992	if (!PyArg_NoArgs(args))
				3993	return NULL;
				3994
				3995	/* Shortcut for single character strings */
				3996	if (PyUnicode_GET_SIZE(self) == 1 &&
				3997	Py_UNICODE_ISDIGIT(*p))
				3998	return PyInt_FromLong(1);
				3999
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4000	/* Special case for empty strings */
				4001	if (PyString_GET_SIZE(self) == 0)
				4002	return PyInt_FromLong(0);
				4003
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4004	e = p + PyUnicode_GET_SIZE(self);
				4005	for (; p < e; p++) {
				4006	if (!Py_UNICODE_ISDIGIT(*p))
				4007	return PyInt_FromLong(0);
				4008	}
				4009	return PyInt_FromLong(1);
				4010	}
				4011
				4012	static char isnumeric__doc__[] =
				4013	"S.isnumeric() -> int\n\
				4014	\n\
				4015	Return 1 if there are only numeric characters in S,\n\
				4016	0 otherwise.";
				4017
				4018	static PyObject*
				4019	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				4020	{
				4021	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4022	register const Py_UNICODE *e;
				4023
				4024	if (!PyArg_NoArgs(args))
				4025	return NULL;
				4026
				4027	/* Shortcut for single character strings */
				4028	if (PyUnicode_GET_SIZE(self) == 1 &&
				4029	Py_UNICODE_ISNUMERIC(*p))
				4030	return PyInt_FromLong(1);
				4031
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4032	/* Special case for empty strings */
				4033	if (PyString_GET_SIZE(self) == 0)
				4034	return PyInt_FromLong(0);
				4035
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4036	e = p + PyUnicode_GET_SIZE(self);
				4037	for (; p < e; p++) {
				4038	if (!Py_UNICODE_ISNUMERIC(*p))
				4039	return PyInt_FromLong(0);
				4040	}
				4041	return PyInt_FromLong(1);
				4042	}
				4043
				4044	static char join__doc__[] =
				4045	"S.join(sequence) -> unicode\n\
				4046	\n\
				4047	Return a string which is the concatenation of the strings in the\n\
				4048	sequence. The separator between elements is S.";
				4049
				4050	static PyObject*
				4051	unicode_join(PyUnicodeObject self, PyObject args)
				4052	{
				4053	PyObject *data;
				4054	if (!PyArg_ParseTuple(args, "O:join", &data))
				4055	return NULL;
				4056
				4057	return PyUnicode_Join((PyObject *)self, data);
				4058	}
				4059
				4060	static int
				4061	unicode_length(PyUnicodeObject *self)
				4062	{
				4063	return self->length;
				4064	}
				4065
				4066	static char ljust__doc__[] =
				4067	"S.ljust(width) -> unicode\n\
				4068	\n\
				4069	Return S left justified in a Unicode string of length width. Padding is\n\
				4070	done using spaces.";
				4071
				4072	static PyObject *
				4073	unicode_ljust(PyUnicodeObject self, PyObject args)
				4074	{
				4075	int width;
				4076	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4077	return NULL;
				4078
				4079	if (self->length >= width) {
				4080	Py_INCREF(self);
				4081	return (PyObject*) self;
				4082	}
				4083
				4084	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4085	}
				4086
				4087	static char lower__doc__[] =
				4088	"S.lower() -> unicode\n\
				4089	\n\
				4090	Return a copy of the string S converted to lowercase.";
				4091
				4092	static PyObject*
				4093	unicode_lower(PyUnicodeObject self, PyObject args)
				4094	{
				4095	if (!PyArg_NoArgs(args))
				4096	return NULL;
				4097	return fixup(self, fixlower);
				4098	}
				4099
				4100	static char lstrip__doc__[] =
				4101	"S.lstrip() -> unicode\n\
				4102	\n\
				4103	Return a copy of the string S with leading whitespace removed.";
				4104
				4105	static PyObject *
				4106	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4107	{
				4108	if (!PyArg_NoArgs(args))
				4109	return NULL;
				4110	return strip(self, 1, 0);
				4111	}
				4112
				4113	static PyObject*
				4114	unicode_repeat(PyUnicodeObject *str, int len)
				4115	{
				4116	PyUnicodeObject *u;
				4117	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4118	int nchars;
				4119	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4120
				4121	if (len < 0)
				4122	len = 0;
				4123
				4124	if (len == 1) {
				4125	/* no repeat, return original string */
				4126	Py_INCREF(str);
				4127	return (PyObject*) str;
				4128	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4129
				4130	/* ensure # of chars needed doesn't overflow int and # of bytes
				4131	* needed doesn't overflow size_t
				4132	*/
				4133	nchars = len * str->length;
				4134	if (len && nchars / len != str->length) {
				4135	PyErr_SetString(PyExc_OverflowError,
				4136	"repeated string is too long");
				4137	return NULL;
				4138	}
				4139	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4140	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4141	PyErr_SetString(PyExc_OverflowError,
				4142	"repeated string is too long");
				4143	return NULL;
				4144	}
				4145	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4146	if (!u)
				4147	return NULL;
				4148
				4149	p = u->str;
				4150
				4151	while (len-- > 0) {
				4152	Py_UNICODE_COPY(p, str->str, str->length);
				4153	p += str->length;
				4154	}
				4155
				4156	return (PyObject*) u;
				4157	}
				4158
				4159	PyObject PyUnicode_Replace(PyObject obj,
				4160	PyObject *subobj,
				4161	PyObject *replobj,
				4162	int maxcount)
				4163	{
				4164	PyObject *self;
				4165	PyObject *str1;
				4166	PyObject *str2;
				4167	PyObject *result;
				4168
				4169	self = PyUnicode_FromObject(obj);
				4170	if (self == NULL)
				4171	return NULL;
				4172	str1 = PyUnicode_FromObject(subobj);
				4173	if (str1 == NULL) {
				4174	Py_DECREF(self);
				4175	return NULL;
				4176	}
				4177	str2 = PyUnicode_FromObject(replobj);
				4178	if (str2 == NULL) {
				4179	Py_DECREF(self);
				4180	Py_DECREF(str1);
				4181	return NULL;
				4182	}
				4183	result = replace((PyUnicodeObject *)self,
				4184	(PyUnicodeObject *)str1,
				4185	(PyUnicodeObject *)str2,
				4186	maxcount);
				4187	Py_DECREF(self);
				4188	Py_DECREF(str1);
				4189	Py_DECREF(str2);
				4190	return result;
				4191	}
				4192
				4193	static char replace__doc__[] =
				4194	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4195	\n\
				4196	Return a copy of S with all occurrences of substring\n\
				4197	old replaced by new. If the optional argument maxsplit is\n\
				4198	given, only the first maxsplit occurrences are replaced.";
				4199
				4200	static PyObject*
				4201	unicode_replace(PyUnicodeObject self, PyObject args)
				4202	{
				4203	PyUnicodeObject *str1;
				4204	PyUnicodeObject *str2;
				4205	int maxcount = -1;
				4206	PyObject *result;
				4207
				4208	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4209	return NULL;
				4210	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4211	if (str1 == NULL)
				4212	return NULL;
				4213	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4214	if (str2 == NULL)
				4215	return NULL;
				4216
				4217	result = replace(self, str1, str2, maxcount);
				4218
				4219	Py_DECREF(str1);
				4220	Py_DECREF(str2);
				4221	return result;
				4222	}
				4223
				4224	static
				4225	PyObject unicode_repr(PyObject unicode)
				4226	{
				4227	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4228	PyUnicode_GET_SIZE(unicode),
				4229	1);
				4230	}
				4231
				4232	static char rfind__doc__[] =
				4233	"S.rfind(sub [,start [,end]]) -> int\n\
				4234	\n\
				4235	Return the highest index in S where substring sub is found,\n\
				4236	such that sub is contained within s[start,end]. Optional\n\
				4237	arguments start and end are interpreted as in slice notation.\n\
				4238	\n\
				4239	Return -1 on failure.";
				4240
				4241	static PyObject *
				4242	unicode_rfind(PyUnicodeObject self, PyObject args)
				4243	{
				4244	PyUnicodeObject *substring;
				4245	int start = 0;
				4246	int end = INT_MAX;
				4247	PyObject *result;
				4248
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4249	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4250	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4251	return NULL;
				4252	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4253	(PyObject *)substring);
				4254	if (substring == NULL)
				4255	return NULL;
				4256
				4257	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4258
				4259	Py_DECREF(substring);
				4260	return result;
				4261	}
				4262
				4263	static char rindex__doc__[] =
				4264	"S.rindex(sub [,start [,end]]) -> int\n\
				4265	\n\
				4266	Like S.rfind() but raise ValueError when the substring is not found.";
				4267
				4268	static PyObject *
				4269	unicode_rindex(PyUnicodeObject self, PyObject args)
				4270	{
				4271	int result;
				4272	PyUnicodeObject *substring;
				4273	int start = 0;
				4274	int end = INT_MAX;
				4275
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4276	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4277	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4278	return NULL;
				4279	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4280	(PyObject *)substring);
				4281	if (substring == NULL)
				4282	return NULL;
				4283
				4284	result = findstring(self, substring, start, end, -1);
				4285
				4286	Py_DECREF(substring);
				4287	if (result < 0) {
				4288	PyErr_SetString(PyExc_ValueError, "substring not found");
				4289	return NULL;
				4290	}
				4291	return PyInt_FromLong(result);
				4292	}
				4293
				4294	static char rjust__doc__[] =
				4295	"S.rjust(width) -> unicode\n\
				4296	\n\
				4297	Return S right justified in a Unicode string of length width. Padding is\n\
				4298	done using spaces.";
				4299
				4300	static PyObject *
				4301	unicode_rjust(PyUnicodeObject self, PyObject args)
				4302	{
				4303	int width;
				4304	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4305	return NULL;
				4306
				4307	if (self->length >= width) {
				4308	Py_INCREF(self);
				4309	return (PyObject*) self;
				4310	}
				4311
				4312	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4313	}
				4314
				4315	static char rstrip__doc__[] =
				4316	"S.rstrip() -> unicode\n\
				4317	\n\
				4318	Return a copy of the string S with trailing whitespace removed.";
				4319
				4320	static PyObject *
				4321	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4322	{
				4323	if (!PyArg_NoArgs(args))
				4324	return NULL;
				4325	return strip(self, 0, 1);
				4326	}
				4327
				4328	static PyObject*
				4329	unicode_slice(PyUnicodeObject *self, int start, int end)
				4330	{
				4331	/* standard clamping */
				4332	if (start < 0)
				4333	start = 0;
				4334	if (end < 0)
				4335	end = 0;
				4336	if (end > self->length)
				4337	end = self->length;
				4338	if (start == 0 && end == self->length) {
				4339	/* full slice, return original string */
				4340	Py_INCREF(self);
				4341	return (PyObject*) self;
				4342	}
				4343	if (start > end)
				4344	start = end;
				4345	/* copy slice */
				4346	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4347	end - start);
				4348	}
				4349
				4350	PyObject PyUnicode_Split(PyObject s,
				4351	PyObject *sep,
				4352	int maxsplit)
				4353	{
				4354	PyObject *result;
				4355
				4356	s = PyUnicode_FromObject(s);
				4357	if (s == NULL)
				4358	return NULL;
				4359	if (sep != NULL) {
				4360	sep = PyUnicode_FromObject(sep);
				4361	if (sep == NULL) {
				4362	Py_DECREF(s);
				4363	return NULL;
				4364	}
				4365	}
				4366
				4367	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4368
				4369	Py_DECREF(s);
				4370	Py_XDECREF(sep);
				4371	return result;
				4372	}
				4373
				4374	static char split__doc__[] =
				4375	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4376	\n\
				4377	Return a list of the words in S, using sep as the\n\
				4378	delimiter string. If maxsplit is given, at most maxsplit\n\
				4379	splits are done. If sep is not specified, any whitespace string\n\
				4380	is a separator.";
				4381
				4382	static PyObject*
				4383	unicode_split(PyUnicodeObject self, PyObject args)
				4384	{
				4385	PyObject *substring = Py_None;
				4386	int maxcount = -1;
				4387
				4388	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4389	return NULL;
				4390
				4391	if (substring == Py_None)
				4392	return split(self, NULL, maxcount);
				4393	else if (PyUnicode_Check(substring))
				4394	return split(self, (PyUnicodeObject *)substring, maxcount);
				4395	else
				4396	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4397	}
				4398
				4399	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4400	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4401	\n\
				4402	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4403	Line breaks are not included in the resulting list unless keepends\n\
				4404	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4405
				4406	static PyObject*
				4407	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4408	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4409	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4410
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4411	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4412	return NULL;
				4413
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4414	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4415	}
				4416
				4417	static
				4418	PyObject unicode_str(PyUnicodeObject self)
				4419	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4420	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4421	}
				4422
				4423	static char strip__doc__[] =
				4424	"S.strip() -> unicode\n\
				4425	\n\
				4426	Return a copy of S with leading and trailing whitespace removed.";
				4427
				4428	static PyObject *
				4429	unicode_strip(PyUnicodeObject self, PyObject args)
				4430	{
				4431	if (!PyArg_NoArgs(args))
				4432	return NULL;
				4433	return strip(self, 1, 1);
				4434	}
				4435
				4436	static char swapcase__doc__[] =
				4437	"S.swapcase() -> unicode\n\
				4438	\n\
				4439	Return a copy of S with uppercase characters converted to lowercase\n\
				4440	and vice versa.";
				4441
				4442	static PyObject*
				4443	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4444	{
				4445	if (!PyArg_NoArgs(args))
				4446	return NULL;
				4447	return fixup(self, fixswapcase);
				4448	}
				4449
				4450	static char translate__doc__[] =
				4451	"S.translate(table) -> unicode\n\
				4452	\n\
				4453	Return a copy of the string S, where all characters have been mapped\n\
				4454	through the given translation table, which must be a mapping of\n\
				4455	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4456	are left untouched. Characters mapped to None are deleted.";
				4457
				4458	static PyObject*
				4459	unicode_translate(PyUnicodeObject self, PyObject args)
				4460	{
				4461	PyObject *table;
				4462
				4463	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4464	return NULL;
				4465	return PyUnicode_TranslateCharmap(self->str,
				4466	self->length,
				4467	table,
				4468	"ignore");
				4469	}
				4470
				4471	static char upper__doc__[] =
				4472	"S.upper() -> unicode\n\
				4473	\n\
				4474	Return a copy of S converted to uppercase.";
				4475
				4476	static PyObject*
				4477	unicode_upper(PyUnicodeObject self, PyObject args)
				4478	{
				4479	if (!PyArg_NoArgs(args))
				4480	return NULL;
				4481	return fixup(self, fixupper);
				4482	}
				4483
				#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implements S.zfill(width) (currently compiled out).

   Strings already at least width long are returned unchanged (new
   reference to self).  Otherwise the string is left-padded with '0'
   via pad(); if the original text started with '+' or '-', that sign
   is moved in front of the padding.  Returns NULL if argument parsing
   or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; do not touch u->str below. */
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
				4519
				#if 0
/* Debugging aid (compiled out): reports unicode_freelist_size as a
   Python int.  Accepts no arguments. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
				4529
				4530	static char startswith__doc__[] =
				4531	"S.startswith(prefix[, start[, end]]) -> int\n\
				4532	\n\
				4533	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4534	optional start, test S beginning at that position. With optional end, stop\n\
				4535	comparing S at that position.";
				4536
				4537	static PyObject *
				4538	unicode_startswith(PyUnicodeObject *self,
				4539	PyObject *args)
				4540	{
				4541	PyUnicodeObject *substring;
				4542	int start = 0;
				4543	int end = INT_MAX;
				4544	PyObject *result;
				4545
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4546	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4547	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4548	return NULL;
				4549	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4550	(PyObject *)substring);
				4551	if (substring == NULL)
				4552	return NULL;
				4553
				4554	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4555
				4556	Py_DECREF(substring);
				4557	return result;
				4558	}
				4559
				4560
				4561	static char endswith__doc__[] =
				4562	"S.endswith(suffix[, start[, end]]) -> int\n\
				4563	\n\
				4564	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4565	optional start, test S beginning at that position. With optional end, stop\n\
				4566	comparing S at that position.";
				4567
				4568	static PyObject *
				4569	unicode_endswith(PyUnicodeObject *self,
				4570	PyObject *args)
				4571	{
				4572	PyUnicodeObject *substring;
				4573	int start = 0;
				4574	int end = INT_MAX;
				4575	PyObject *result;
				4576
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4577	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4578	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4579	return NULL;
				4580	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4581	(PyObject *)substring);
				4582	if (substring == NULL)
				4583	return NULL;
				4584
				4585	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4586
				4587	Py_DECREF(substring);
				4588	return result;
				4589	}
				4590
				4591
				4592	static PyMethodDef unicode_methods[] = {
				4593
				4594	/* Order is according to common usage: often used methods should
				4595	appear first, since lookup is done sequentially. */
				4596
				4597	{"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
				4598	{"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
				4599	{"split", (PyCFunction) unicode_split, 1, split__doc__},
				4600	{"join", (PyCFunction) unicode_join, 1, join__doc__},
				4601	{"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
				4602	{"title", (PyCFunction) unicode_title, 0, title__doc__},
				4603	{"center", (PyCFunction) unicode_center, 1, center__doc__},
				4604	{"count", (PyCFunction) unicode_count, 1, count__doc__},
				4605	{"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
				4606	{"find", (PyCFunction) unicode_find, 1, find__doc__},
				4607	{"index", (PyCFunction) unicode_index, 1, index__doc__},
				4608	{"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
				4609	{"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
				4610	{"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
				4611	/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
				4612	{"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
				4613	{"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
				4614	{"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
				4615	{"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
				4616	{"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
				4617	{"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
				4618	{"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
				4619	{"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
				4620	{"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
				4621	{"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
				4622	{"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
				4623	{"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
				4624	{"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
				4625	{"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
				4626	{"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
				4627	{"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
				4628	{"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
				4629	{"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4630	{"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
				4631	{"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4632	#if 0
				4633	{"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
				4634	{"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
				4635	#endif
				4636
				4637	#if 0
				4638	/* This one is just used for debugging the implementation. */
				4639	{"freelistsize", (PyCFunction) unicode_freelistsize, 0},
				4640	#endif
				4641
				4642	{NULL, NULL}
				4643	};
				4644
				4645	static PyObject *
				4646	unicode_getattr(PyUnicodeObject self, char name)
				4647	{
				4648	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4649	}
				4650
				4651	static PySequenceMethods unicode_as_sequence = {
				4652	(inquiry) unicode_length, /* sq_length */
				4653	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4654	(intargfunc) unicode_repeat, /* sq_repeat */
				4655	(intargfunc) unicode_getitem, /* sq_item */
				4656	(intintargfunc) unicode_slice, /* sq_slice */
				4657	0, /* sq_ass_item */
				4658	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4659	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4660	};
				4661
				4662	static int
				4663	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4664	int index,
				4665	const void **ptr)
				4666	{
				4667	if (index != 0) {
				4668	PyErr_SetString(PyExc_SystemError,
				4669	"accessing non-existent unicode segment");
				4670	return -1;
				4671	}
				4672	ptr = (void ) self->str;
				4673	return PyUnicode_GET_DATA_SIZE(self);
				4674	}
				4675
				4676	static int
				4677	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4678	const void **ptr)
				4679	{
				4680	PyErr_SetString(PyExc_TypeError,
				4681	"cannot use unicode as modifyable buffer");
				4682	return -1;
				4683	}
				4684
				4685	static int
				4686	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4687	int *lenp)
				4688	{
				4689	if (lenp)
				4690	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4691	return 1;
				4692	}
				4693
				4694	static int
				4695	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4696	int index,
				4697	const void **ptr)
				4698	{
				4699	PyObject *str;
				4700
				4701	if (index != 0) {
				4702	PyErr_SetString(PyExc_SystemError,
				4703	"accessing non-existent unicode segment");
				4704	return -1;
				4705	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4706	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4707	if (str == NULL)
				4708	return -1;
				4709	ptr = (void ) PyString_AS_STRING(str);
				4710	return PyString_GET_SIZE(str);
				4711	}
				4712
				4713	/* Helpers for PyUnicode_Format() */
				4714
				4715	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4716	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4717	{
				4718	int argidx = *p_argidx;
				4719	if (argidx < arglen) {
				4720	(*p_argidx)++;
				4721	if (arglen < 0)
				4722	return args;
				4723	else
				4724	return PyTuple_GetItem(args, argidx);
				4725	}
				4726	PyErr_SetString(PyExc_TypeError,
				4727	"not enough arguments for format string");
				4728	return NULL;
				4729	}
				4730
				/* Conversion flag bits, set by PyUnicode_Format() while scanning a
   format specifier; each corresponds to one flag character. */
#define F_LJUST (1<<0)  /* '-' */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
				4736
				4737	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4738	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4739	{
				4740	register int i;
				4741	int len;
				4742	va_list va;
				4743	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4744	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4745
				4746	/* First, format the string as char array, then expand to Py_UNICODE
				4747	array. */
				4748	charbuffer = (char *)buffer;
				4749	len = vsprintf(charbuffer, format, va);
				4750	for (i = len - 1; i >= 0; i--)
				4751	buffer[i] = (Py_UNICODE) charbuffer[i];
				4752
				4753	va_end(va);
				4754	return len;
				4755	}
				4756
				4757	static int
				4758	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4759	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4760	int flags,
				4761	int prec,
				4762	int type,
				4763	PyObject *v)
				4764	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4765	/* fmt = '%#.' + `prec` + `type`
				4766	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4767	char fmt[20];
				4768	double x;
				4769
				4770	x = PyFloat_AsDouble(v);
				4771	if (x == -1.0 && PyErr_Occurred())
				4772	return -1;
				4773	if (prec < 0)
				4774	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4775	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4776	type = 'g';
				4777	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4778	/* worst case length calc to ensure no buffer overrun:
				4779	fmt = %#.<prec>g
				4780	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4781	for any double rep.)
				4782	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4783	If prec=0 the effective precision is 1 (the leading digit is
				4784	always given), therefore increase by one to 10+prec. */
				4785	if (buflen <= (size_t)10 + (size_t)prec) {
				4786	PyErr_SetString(PyExc_OverflowError,
				4787	"formatted float is too long (precision too long?)");
				4788	return -1;
				4789	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4790	return usprintf(buf, fmt, x);
				4791	}
				4792
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4793	static PyObject*
				4794	formatlong(PyObject *val, int flags, int prec, int type)
				4795	{
				4796	char *buf;
				4797	int i, len;
				4798	PyObject str; / temporary string object. */
				4799	PyUnicodeObject *result;
				4800
				4801	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4802	if (!str)
				4803	return NULL;
				4804	result = _PyUnicode_New(len);
				4805	for (i = 0; i < len; i++)
				4806	result->str[i] = buf[i];
				4807	result->str[len] = 0;
				4808	Py_DECREF(str);
				4809	return (PyObject*)result;
				4810	}
				4811
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4812	static int
				4813	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4814	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4815	int flags,
				4816	int prec,
				4817	int type,
				4818	PyObject *v)
				4819	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4820	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4821	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4822	+ 1 + 1 = 24*/
				4823	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4824	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4825	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4826
				4827	x = PyInt_AsLong(v);
				4828	if (x == -1 && PyErr_Occurred())
				4829	return -1;
				4830	if (prec < 0)
				4831	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4832	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4833	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4834	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4835	PyErr_SetString(PyExc_OverflowError,
				4836	"formatted integer is too long (precision too long?)");
				4837	return -1;
				4838	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4839	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				4840	* but we want it (for consistency with other %#x conversions, and
				4841	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4842	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				4843	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				4844	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4845	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4846	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				4847	/* Only way to know what the platform does is to try it. */
				4848	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				4849	if (fmt[1] != (char)type) {
				4850	/* Supply our own leading 0x/0X -- needed under std C */
				4851	use_native_c_format = 0;
				4852	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				4853	}
				4854	}
				4855	if (use_native_c_format)
				4856	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4857	return usprintf(buf, fmt, x);
				4858	}
				4859
				4860	static int
				4861	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4862	size_t buflen,
				4863	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4864	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4865	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4866	if (PyUnicode_Check(v)) {
				4867	if (PyUnicode_GET_SIZE(v) != 1)
				4868	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4869	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4870	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4871
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4872	else if (PyString_Check(v)) {
				4873	if (PyString_GET_SIZE(v) != 1)
				4874	goto onError;
				4875	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4876	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4877
				4878	else {
				4879	/* Integer input truncated to a character */
				4880	long x;
				4881	x = PyInt_AsLong(v);
				4882	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4883	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4884	buf[0] = (char) x;
				4885	}
				4886	buf[1] = '\0';
				4887	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4888
				4889	onError:
				4890	PyErr_SetString(PyExc_TypeError,
				4891	"%c requires int or char");
				4892	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4893	}
				4894
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4895	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				4896
				4897	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				4898	chars are formatted. XXX This is a magic number. Each formatting
				4899	routine does bounds checking to ensure no overflow, but a better
				4900	solution may be to malloc a buffer of appropriate size for each
				4901	format. For now, the current solution is sufficient.
				4902	*/
				4903	#define FORMATBUFLEN (size_t)120
				4904
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4905	PyObject PyUnicode_Format(PyObject format,
				4906	PyObject *args)
				4907	{
				4908	Py_UNICODE fmt, res;
				4909	int fmtcnt, rescnt, reslen, arglen, argidx;
				4910	int args_owned = 0;
				4911	PyUnicodeObject *result = NULL;
				4912	PyObject *dict = NULL;
				4913	PyObject *uformat;
				4914
				4915	if (format == NULL \|\| args == NULL) {
				4916	PyErr_BadInternalCall();
				4917	return NULL;
				4918	}
				4919	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4920	if (uformat == NULL)
				4921	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4922	fmt = PyUnicode_AS_UNICODE(uformat);
				4923	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4924
				4925	reslen = rescnt = fmtcnt + 100;
				4926	result = _PyUnicode_New(reslen);
				4927	if (result == NULL)
				4928	goto onError;
				4929	res = PyUnicode_AS_UNICODE(result);
				4930
				4931	if (PyTuple_Check(args)) {
				4932	arglen = PyTuple_Size(args);
				4933	argidx = 0;
				4934	}
				4935	else {
				4936	arglen = -1;
				4937	argidx = -2;
				4938	}
				4939	if (args->ob_type->tp_as_mapping)
				4940	dict = args;
				4941
				4942	while (--fmtcnt >= 0) {
				4943	if (*fmt != '%') {
				4944	if (--rescnt < 0) {
				4945	rescnt = fmtcnt + 100;
				4946	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	4947	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4948	return NULL;
				4949	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4950	--rescnt;
				4951	}
				4952	res++ = fmt++;
				4953	}
				4954	else {
				4955	/* Got a format specifier */
				4956	int flags = 0;
				4957	int width = -1;
				4958	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4959	Py_UNICODE c = '\0';
				4960	Py_UNICODE fill;
				4961	PyObject *v = NULL;
				4962	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4963	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4964	Py_UNICODE sign;
				4965	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4966	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4967
				4968	fmt++;
				4969	if (*fmt == '(') {
				4970	Py_UNICODE *keystart;
				4971	int keylen;
				4972	PyObject *key;
				4973	int pcount = 1;
				4974
				4975	if (dict == NULL) {
				4976	PyErr_SetString(PyExc_TypeError,
				4977	"format requires a mapping");
				4978	goto onError;
				4979	}
				4980	++fmt;
				4981	--fmtcnt;
				4982	keystart = fmt;
				4983	/* Skip over balanced parentheses */
				4984	while (pcount > 0 && --fmtcnt >= 0) {
				4985	if (*fmt == ')')
				4986	--pcount;
				4987	else if (*fmt == '(')
				4988	++pcount;
				4989	fmt++;
				4990	}
				4991	keylen = fmt - keystart - 1;
				4992	if (fmtcnt < 0 \|\| pcount > 0) {
				4993	PyErr_SetString(PyExc_ValueError,
				4994	"incomplete format key");
				4995	goto onError;
				4996	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4997	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4998	then looked up since Python uses strings to hold
				4999	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5000	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5001	key = PyUnicode_EncodeUTF8(keystart,
				5002	keylen,
				5003	NULL);
				5004	if (key == NULL)
				5005	goto onError;
				5006	if (args_owned) {
				5007	Py_DECREF(args);
				5008	args_owned = 0;
				5009	}
				5010	args = PyObject_GetItem(dict, key);
				5011	Py_DECREF(key);
				5012	if (args == NULL) {
				5013	goto onError;
				5014	}
				5015	args_owned = 1;
				5016	arglen = -1;
				5017	argidx = -2;
				5018	}
				5019	while (--fmtcnt >= 0) {
				5020	switch (c = *fmt++) {
				5021	case '-': flags \|= F_LJUST; continue;
				5022	case '+': flags \|= F_SIGN; continue;
				5023	case ' ': flags \|= F_BLANK; continue;
				5024	case '#': flags \|= F_ALT; continue;
				5025	case '0': flags \|= F_ZERO; continue;
				5026	}
				5027	break;
				5028	}
				5029	if (c == '*') {
				5030	v = getnextarg(args, arglen, &argidx);
				5031	if (v == NULL)
				5032	goto onError;
				5033	if (!PyInt_Check(v)) {
				5034	PyErr_SetString(PyExc_TypeError,
				5035	"* wants int");
				5036	goto onError;
				5037	}
				5038	width = PyInt_AsLong(v);
				5039	if (width < 0) {
				5040	flags \|= F_LJUST;
				5041	width = -width;
				5042	}
				5043	if (--fmtcnt >= 0)
				5044	c = *fmt++;
				5045	}
				5046	else if (c >= '0' && c <= '9') {
				5047	width = c - '0';
				5048	while (--fmtcnt >= 0) {
				5049	c = *fmt++;
				5050	if (c < '0' \|\| c > '9')
				5051	break;
				5052	if ((width*10) / 10 != width) {
				5053	PyErr_SetString(PyExc_ValueError,
				5054	"width too big");
				5055	goto onError;
				5056	}
				5057	width = width*10 + (c - '0');
				5058	}
				5059	}
				5060	if (c == '.') {
				5061	prec = 0;
				5062	if (--fmtcnt >= 0)
				5063	c = *fmt++;
				5064	if (c == '*') {
				5065	v = getnextarg(args, arglen, &argidx);
				5066	if (v == NULL)
				5067	goto onError;
				5068	if (!PyInt_Check(v)) {
				5069	PyErr_SetString(PyExc_TypeError,
				5070	"* wants int");
				5071	goto onError;
				5072	}
				5073	prec = PyInt_AsLong(v);
				5074	if (prec < 0)
				5075	prec = 0;
				5076	if (--fmtcnt >= 0)
				5077	c = *fmt++;
				5078	}
				5079	else if (c >= '0' && c <= '9') {
				5080	prec = c - '0';
				5081	while (--fmtcnt >= 0) {
				5082	c = Py_CHARMASK(*fmt++);
				5083	if (c < '0' \|\| c > '9')
				5084	break;
				5085	if ((prec*10) / 10 != prec) {
				5086	PyErr_SetString(PyExc_ValueError,
				5087	"prec too big");
				5088	goto onError;
				5089	}
				5090	prec = prec*10 + (c - '0');
				5091	}
				5092	}
				5093	} /* prec */
				5094	if (fmtcnt >= 0) {
				5095	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5096	if (--fmtcnt >= 0)
				5097	c = *fmt++;
				5098	}
				5099	}
				5100	if (fmtcnt < 0) {
				5101	PyErr_SetString(PyExc_ValueError,
				5102	"incomplete format");
				5103	goto onError;
				5104	}
				5105	if (c != '%') {
				5106	v = getnextarg(args, arglen, &argidx);
				5107	if (v == NULL)
				5108	goto onError;
				5109	}
				5110	sign = 0;
				5111	fill = ' ';
				5112	switch (c) {
				5113
				5114	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5115	pbuf = formatbuf;
				5116	/* presume that buffer length is at least 1 */
				5117	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5118	len = 1;
				5119	break;
				5120
				5121	case 's':
				5122	case 'r':
				5123	if (PyUnicode_Check(v) && c == 's') {
				5124	temp = v;
				5125	Py_INCREF(temp);
				5126	}
				5127	else {
				5128	PyObject *unicode;
				5129	if (c == 's')
				5130	temp = PyObject_Str(v);
				5131	else
				5132	temp = PyObject_Repr(v);
				5133	if (temp == NULL)
				5134	goto onError;
				5135	if (!PyString_Check(temp)) {
				5136	/* XXX Note: this should never happen, since
				5137	PyObject_Repr() and PyObject_Str() assure
				5138	this */
				5139	Py_DECREF(temp);
				5140	PyErr_SetString(PyExc_TypeError,
				5141	"%s argument has non-string str()");
				5142	goto onError;
				5143	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5144	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5145	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5146	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5147	"strict");
				5148	Py_DECREF(temp);
				5149	temp = unicode;
				5150	if (temp == NULL)
				5151	goto onError;
				5152	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5153	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5154	len = PyUnicode_GET_SIZE(temp);
				5155	if (prec >= 0 && len > prec)
				5156	len = prec;
				5157	break;
				5158
				5159	case 'i':
				5160	case 'd':
				5161	case 'u':
				5162	case 'o':
				5163	case 'x':
				5164	case 'X':
				5165	if (c == 'i')
				5166	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5167	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5168	temp = formatlong(v, flags, prec, c);
				5169	if (!temp)
				5170	goto onError;
				5171	pbuf = PyUnicode_AS_UNICODE(temp);
				5172	len = PyUnicode_GET_SIZE(temp);
				5173	/* unbounded ints can always produce
				5174	a sign character! */
				5175	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5176	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5177	else {
				5178	pbuf = formatbuf;
				5179	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5180	flags, prec, c, v);
				5181	if (len < 0)
				5182	goto onError;
				5183	/* only d conversion is signed */
				5184	sign = c == 'd';
				5185	}
				5186	if (flags & F_ZERO)
				5187	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5188	break;
				5189
				5190	case 'e':
				5191	case 'E':
				5192	case 'f':
				5193	case 'g':
				5194	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5195	pbuf = formatbuf;
				5196	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5197	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5198	if (len < 0)
				5199	goto onError;
				5200	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5201	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5202	fill = '0';
				5203	break;
				5204
				5205	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5206	pbuf = formatbuf;
				5207	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5208	if (len < 0)
				5209	goto onError;
				5210	break;
				5211
				5212	default:
				5213	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5214	"unsupported format character '%c' (0x%x) "
				5215	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5216	(31<=c && c<=126) ? c : '?',
				5217	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5218	goto onError;
				5219	}
				5220	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5221	if (pbuf == '-' \|\| pbuf == '+') {
				5222	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5223	len--;
				5224	}
				5225	else if (flags & F_SIGN)
				5226	sign = '+';
				5227	else if (flags & F_BLANK)
				5228	sign = ' ';
				5229	else
				5230	sign = 0;
				5231	}
				5232	if (width < len)
				5233	width = len;
				5234	if (rescnt < width + (sign != 0)) {
				5235	reslen -= rescnt;
				5236	rescnt = width + fmtcnt + 100;
				5237	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5238	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5239	return NULL;
				5240	res = PyUnicode_AS_UNICODE(result)
				5241	+ reslen - rescnt;
				5242	}
				5243	if (sign) {
				5244	if (fill != ' ')
				5245	*res++ = sign;
				5246	rescnt--;
				5247	if (width > len)
				5248	width--;
				5249	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5250	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5251	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5252	assert(pbuf[1] == c);
				5253	if (fill != ' ') {
				5254	res++ = pbuf++;
				5255	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5256	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5257	rescnt -= 2;
				5258	width -= 2;
				5259	if (width < 0)
				5260	width = 0;
				5261	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5262	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5263	if (width > len && !(flags & F_LJUST)) {
				5264	do {
				5265	--rescnt;
				5266	*res++ = fill;
				5267	} while (--width > len);
				5268	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5269	if (fill == ' ') {
				5270	if (sign)
				5271	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5272	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5273	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5274	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5275	res++ = pbuf++;
				5276	res++ = pbuf++;
				5277	}
				5278	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5279	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5280	res += len;
				5281	rescnt -= len;
				5282	while (--width >= len) {
				5283	--rescnt;
				5284	*res++ = ' ';
				5285	}
				5286	if (dict && (argidx < arglen) && c != '%') {
				5287	PyErr_SetString(PyExc_TypeError,
				5288	"not all arguments converted");
				5289	goto onError;
				5290	}
				5291	Py_XDECREF(temp);
				5292	} /* '%' */
				5293	} /* until end */
				5294	if (argidx < arglen && !dict) {
				5295	PyErr_SetString(PyExc_TypeError,
				5296	"not all arguments converted");
				5297	goto onError;
				5298	}
				5299
				5300	if (args_owned) {
				5301	Py_DECREF(args);
				5302	}
				5303	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5304	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5305	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5306	return (PyObject *)result;
				5307
				5308	onError:
				5309	Py_XDECREF(result);
				5310	Py_DECREF(uformat);
				5311	if (args_owned) {
				5312	Py_DECREF(args);
				5313	}
				5314	return NULL;
				5315	}
				5316
/* Buffer-interface slot table for the unicode type.  PyUnicode_Type
   refers to it through its tp_as_buffer slot.  The entries are
   positional, so their order must match the member order of
   PyBufferProcs; each helper is cast to the slot's function-pointer
   type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
				5323
/* Type object for unicode strings.

   The slots are filled positionally, so entries must stay in the
   member order of PyTypeObject.  A 0 / NULL entry leaves that slot
   unset.  The initializer ends at tp_flags; any members declared
   after it are zero-initialized, as C requires for a partially
   initialized static aggregate. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
				5348
				5349	/* Initialize the Unicode implementation */
				5350
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5351	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5352	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5353	int i;
				5354
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5355	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5356	unicode_freelist = NULL;
				5357	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5358	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5359	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5360	for (i = 0; i < 256; i++)
				5361	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5362	}
				5363
				5364	/* Finalize the Unicode implementation */
				5365
				5366	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5367	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5368	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5369	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5370	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5371
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5372	Py_XDECREF(unicode_empty);
				5373	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5374
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5375	for (i = 0; i < 256; i++) {
				5376	if (unicode_latin1[i]) {
				5377	Py_DECREF(unicode_latin1[i]);
				5378	unicode_latin1[i] = NULL;
				5379	}
				5380	}
				5381
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5382	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5383	PyUnicodeObject *v = u;
				5384	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5385	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5386	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5387	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5388	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5389	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5390	unicode_freelist = NULL;
				5391	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5392	}