Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 08ba0659289867165ccd6c79fd35f01ac43dfa6c [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Limit for the Unicode object free list: _PyUnicode_Free() stops
   parking objects on the list once it holds this many entries. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in roughly MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
				78
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects.  Entries are pushed by
   _PyUnicode_Free() and popped by _PyUnicode_New();
   unicode_freelist_size counts the entries currently on the list. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Indexed by character ordinal; a slot stays NULL
   until PyUnicode_FromUnicode() first creates that character. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
				227	void _PyUnicode_Free(register PyUnicodeObject *unicode)
				228	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	229	if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	230	/* Keep-Alive optimization */
				231	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	232	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	233	unicode->str = NULL;
				234	unicode->length = 0;
				235	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	236	if (unicode->defenc) {
				237	Py_DECREF(unicode->defenc);
				238	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	239	}
				240	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	241	(PyUnicodeObject *)unicode = unicode_freelist;
				242	unicode_freelist = unicode;
				243	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	244	}
				245	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	246	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	247	Py_XDECREF(unicode->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	248	PyObject_DEL(unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	249	}
				250	}
				251
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	252	int PyUnicode_Resize(PyObject **unicode,
				253	int length)
				254	{
				255	register PyUnicodeObject *v;
				256
				257	/* Argument checks */
				258	if (unicode == NULL) {
				259	PyErr_BadInternalCall();
				260	return -1;
				261	}
				262	v = (PyUnicodeObject )unicode;
				263	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				264	PyErr_BadInternalCall();
				265	return -1;
				266	}
				267
				268	/* Resizing unicode_empty and single character objects is not
				269	possible since these are being shared. We simply return a fresh
				270	copy with the same Unicode content. */
				271	if (v->length != length &&
				272	(v == unicode_empty \|\| v->length == 1)) {
				273	PyUnicodeObject *w = _PyUnicode_New(length);
				274	if (w == NULL)
				275	return -1;
				276	Py_UNICODE_COPY(w->str, v->str,
				277	length < v->length ? length : v->length);
				278	unicode = (PyObject )w;
				279	return 0;
				280	}
				281
				282	/* Note that we don't have to modify *unicode for unshared Unicode
				283	objects, since we can modify them in-place. */
				284	return unicode_resize(v, length);
				285	}
				286
				287	/* Internal API for use in unicodeobject.c only ! */
				288	#define _PyUnicode_Resize(unicodevar, length) \
				289	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				290
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	291	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				292	int size)
				293	{
				294	PyUnicodeObject *unicode;
				295
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	296	/* If the Unicode data is known at construction time, we can apply
				297	some optimizations which share commonly used objects. */
				298	if (u != NULL) {
				299
				300	/* Optimization for empty strings */
				301	if (size == 0 && unicode_empty != NULL) {
				302	Py_INCREF(unicode_empty);
				303	return (PyObject *)unicode_empty;
				304	}
				305
				306	/* Single character Unicode objects in the Latin-1 range are
				307	shared when using this constructor */
				308	if (size == 1 && *u < 256) {
				309	unicode = unicode_latin1[*u];
				310	if (!unicode) {
				311	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	312	if (!unicode)
				313	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	314	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	315	unicode_latin1[*u] = unicode;
				316	}
				317	Py_INCREF(unicode);
				318	return (PyObject *)unicode;
				319	}
				320	}
				321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	322	unicode = _PyUnicode_New(size);
				323	if (!unicode)
				324	return NULL;
				325
				326	/* Copy the Unicode data into the new object */
				327	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	328	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	329
				330	return (PyObject *)unicode;
				331	}
				332
				333	#ifdef HAVE_WCHAR_H
				334
				335	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				336	int size)
				337	{
				338	PyUnicodeObject *unicode;
				339
				340	if (w == NULL) {
				341	PyErr_BadInternalCall();
				342	return NULL;
				343	}
				344
				345	unicode = _PyUnicode_New(size);
				346	if (!unicode)
				347	return NULL;
				348
				349	/* Copy the wchar_t data into the new object */
				350	#ifdef HAVE_USABLE_WCHAR_T
				351	memcpy(unicode->str, w, size * sizeof(wchar_t));
				352	#else
				353	{
				354	register Py_UNICODE *u;
				355	register int i;
				356	u = PyUnicode_AS_UNICODE(unicode);
				357	for (i = size; i >= 0; i--)
				358	u++ = w++;
				359	}
				360	#endif
				361
				362	return (PyObject *)unicode;
				363	}
				364
				365	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				366	register wchar_t *w,
				367	int size)
				368	{
				369	if (unicode == NULL) {
				370	PyErr_BadInternalCall();
				371	return -1;
				372	}
				373	if (size > PyUnicode_GET_SIZE(unicode))
				374	size = PyUnicode_GET_SIZE(unicode);
				375	#ifdef HAVE_USABLE_WCHAR_T
				376	memcpy(w, unicode->str, size * sizeof(wchar_t));
				377	#else
				378	{
				379	register Py_UNICODE *u;
				380	register int i;
				381	u = PyUnicode_AS_UNICODE(unicode);
				382	for (i = size; i >= 0; i--)
				383	w++ = u++;
				384	}
				385	#endif
				386
				387	return size;
				388	}
				389
				390	#endif
				391
				392	PyObject PyUnicode_FromObject(register PyObject obj)
				393	{
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	394	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				395	}
				396
				397	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				398	const char *encoding,
				399	const char *errors)
				400	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	401	const char *s;
				402	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	403	int owned = 0;
				404	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	405
				406	if (obj == NULL) {
				407	PyErr_BadInternalCall();
				408	return NULL;
				409	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	410
				411	/* Coerce object */
				412	if (PyInstance_Check(obj)) {
				413	PyObject *func;
				414	func = PyObject_GetAttrString(obj, "__str__");
				415	if (func == NULL) {
				416	PyErr_SetString(PyExc_TypeError,
				417	"coercing to Unicode: instance doesn't define __str__");
				418	return NULL;
				419	}
				420	obj = PyEval_CallObject(func, NULL);
				421	Py_DECREF(func);
				422	if (obj == NULL)
				423	return NULL;
				424	owned = 1;
				425	}
				426	if (PyUnicode_Check(obj)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	427	Py_INCREF(obj);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	428	v = obj;
				429	if (encoding) {
				430	PyErr_SetString(PyExc_TypeError,
				431	"decoding Unicode is not supported");
				432	return NULL;
				433	}
				434	goto done;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	435	}
				436	else if (PyString_Check(obj)) {
				437	s = PyString_AS_STRING(obj);
				438	len = PyString_GET_SIZE(obj);
				439	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	440	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				441	/* Overwrite the error message with something more useful in
				442	case of a TypeError. */
				443	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	566d8a6	2000-07-11 09:47:04 +0000	[diff] [blame]	444	PyErr_Format(PyExc_TypeError,
				445	"coercing to Unicode: need string or buffer, "
				446	"%.80s found",
				447	obj->ob_type->tp_name);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	448	goto onError;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	449	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	450
				451	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	452	if (len == 0) {
				453	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	454	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	455	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	456	else
				457	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	458
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	459	done:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	460	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	461	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	462	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	463	return v;
				464
				465	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	466	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	467	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	468	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
				471
				472	PyObject PyUnicode_Decode(const char s,
				473	int size,
				474	const char *encoding,
				475	const char *errors)
				476	{
				477	PyObject buffer = NULL, unicode;
				478
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	479	if (encoding == NULL)
				480	encoding = PyUnicode_GetDefaultEncoding();
				481
				482	/* Shortcuts for common default encodings */
				483	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	485	else if (strcmp(encoding, "latin-1") == 0)
				486	return PyUnicode_DecodeLatin1(s, size, errors);
				487	else if (strcmp(encoding, "ascii") == 0)
				488	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	489
				490	/* Decode via the codec registry */
				491	buffer = PyBuffer_FromMemory((void *)s, size);
				492	if (buffer == NULL)
				493	goto onError;
				494	unicode = PyCodec_Decode(buffer, encoding, errors);
				495	if (unicode == NULL)
				496	goto onError;
				497	if (!PyUnicode_Check(unicode)) {
				498	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	499	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	500	unicode->ob_type->tp_name);
				501	Py_DECREF(unicode);
				502	goto onError;
				503	}
				504	Py_DECREF(buffer);
				505	return unicode;
				506
				507	onError:
				508	Py_XDECREF(buffer);
				509	return NULL;
				510	}
				511
				512	PyObject PyUnicode_Encode(const Py_UNICODE s,
				513	int size,
				514	const char *encoding,
				515	const char *errors)
				516	{
				517	PyObject v, unicode;
				518
				519	unicode = PyUnicode_FromUnicode(s, size);
				520	if (unicode == NULL)
				521	return NULL;
				522	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				523	Py_DECREF(unicode);
				524	return v;
				525	}
				526
				527	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject *v;
				532
				533	if (!PyUnicode_Check(unicode)) {
				534	PyErr_BadArgument();
				535	goto onError;
				536	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	537
				538	if (encoding == NULL)
				539	encoding = PyUnicode_GetDefaultEncoding();
				540
				541	/* Shortcuts for common default encodings */
				542	if (errors == NULL) {
				543	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	544	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	545	else if (strcmp(encoding, "latin-1") == 0)
				546	return PyUnicode_AsLatin1String(unicode);
				547	else if (strcmp(encoding, "ascii") == 0)
				548	return PyUnicode_AsASCIIString(unicode);
				549	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	550
				551	/* Encode via the codec registry */
				552	v = PyCodec_Encode(unicode, encoding, errors);
				553	if (v == NULL)
				554	goto onError;
				555	/* XXX Should we really enforce this ? */
				556	if (!PyString_Check(v)) {
				557	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	558	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	559	v->ob_type->tp_name);
				560	Py_DECREF(v);
				561	goto onError;
				562	}
				563	return v;
				564
				565	onError:
				566	return NULL;
				567	}
				568
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	569	/* Return a Python string holding the default encoded value of the
				570	Unicode object.
				571
				572	The resulting string is cached in the Unicode object for subsequent
				573	usage by this function. The cached version is needed to implement
				574	the character buffer interface and will live (at least) as long as
				575	the Unicode object itself.
				576
				577	The refcount of the string is not incremented.
				578
				579	* Exported for internal use by the interpreter only !!! *
				580
				581	*/
				582
				583	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				584	const char *errors)
				585	{
				586	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				587
				588	if (v)
				589	return v;
				590	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				591	if (v && errors == NULL)
				592	((PyUnicodeObject *)unicode)->defenc = v;
				593	return v;
				594	}
				595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	596	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_AS_UNICODE(unicode);
				603
				604	onError:
				605	return NULL;
				606	}
				607
				608	int PyUnicode_GetSize(PyObject *unicode)
				609	{
				610	if (!PyUnicode_Check(unicode)) {
				611	PyErr_BadArgument();
				612	goto onError;
				613	}
				614	return PyUnicode_GET_SIZE(unicode);
				615
				616	onError:
				617	return -1;
				618	}
				619
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	620	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	621	{
				622	return unicode_default_encoding;
				623	}
				624
				625	int PyUnicode_SetDefaultEncoding(const char *encoding)
				626	{
				627	PyObject *v;
				628
				629	/* Make sure the encoding is valid. As side effect, this also
				630	loads the encoding into the codec registry cache. */
				631	v = _PyCodec_Lookup(encoding);
				632	if (v == NULL)
				633	goto onError;
				634	Py_DECREF(v);
				635	strncpy(unicode_default_encoding,
				636	encoding,
				637	sizeof(unicode_default_encoding));
				638	return 0;
				639
				640	onError:
				641	return -1;
				642	}
				643
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	644	/* --- UTF-8 Codec -------------------------------------------------------- */
				645
				646	static
				647	char utf8_code_length[256] = {
				648	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				649	illegal prefix. see RFC 2279 for details */
				650	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				651	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				652	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				653	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				654	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				655	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				656	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				657	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				658	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				659	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				660	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				661	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				662	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				663	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				664	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				665	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				666	};
				667
				668	static
				669	int utf8_decoding_error(const char **source,
				670	Py_UNICODE **dest,
				671	const char *errors,
				672	const char *details)
				673	{
				674	if ((errors == NULL) \|\|
				675	(strcmp(errors,"strict") == 0)) {
				676	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	677	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	678	details);
				679	return -1;
				680	}
				681	else if (strcmp(errors,"ignore") == 0) {
				682	(*source)++;
				683	return 0;
				684	}
				685	else if (strcmp(errors,"replace") == 0) {
				686	(*source)++;
				687	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				688	(*dest)++;
				689	return 0;
				690	}
				691	else {
				692	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	693	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	694	errors);
				695	return -1;
				696	}
				697	}
				698
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	699	PyObject PyUnicode_DecodeUTF8(const char s,
				700	int size,
				701	const char *errors)
				702	{
				703	int n;
				704	const char *e;
				705	PyUnicodeObject *unicode;
				706	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	707	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	708
				709	/* Note: size will always be longer than the resulting Unicode
				710	character count */
				711	unicode = _PyUnicode_New(size);
				712	if (!unicode)
				713	return NULL;
				714	if (size == 0)
				715	return (PyObject *)unicode;
				716
				717	/* Unpack UTF-8 encoded data */
				718	p = unicode->str;
				719	e = s + size;
				720
				721	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	722	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	723
				724	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	725	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	726	s++;
				727	continue;
				728	}
				729
				730	n = utf8_code_length[ch];
				731
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	732	if (s + n > e) {
				733	errmsg = "unexpected end of data";
				734	goto utf8Error;
				735	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	736
				737	switch (n) {
				738
				739	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	740	errmsg = "unexpected code byte";
				741	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	742
				743	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	744	errmsg = "internal error";
				745	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	746
				747	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	748	if ((s[1] & 0xc0) != 0x80) {
				749	errmsg = "invalid data";
				750	goto utf8Error;
				751	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	752	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	753	if (ch < 0x80) {
				754	errmsg = "illegal encoding";
				755	goto utf8Error;
				756	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	757	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	758	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	759	break;
				760
				761	case 3:
				762	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	763	(s[2] & 0xc0) != 0x80) {
				764	errmsg = "invalid data";
				765	goto utf8Error;
				766	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	767	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	768	if (ch < 0x800 \|\| (ch >= 0xd800 && ch < 0xe000)) {
				769	errmsg = "illegal encoding";
				770	goto utf8Error;
				771	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	772	else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	773	*p++ = (Py_UNICODE)ch;
				774	break;
				775
				776	case 4:
				777	if ((s[1] & 0xc0) != 0x80 \|\|
				778	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	779	(s[3] & 0xc0) != 0x80) {
				780	errmsg = "invalid data";
				781	goto utf8Error;
				782	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	783	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				784	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				785	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	786	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	787	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	788	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	789	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	790	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	791	errmsg = "illegal encoding";
				792	goto utf8Error;
				793	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	794	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	795	*p++ = (Py_UNICODE)ch;
				796	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	797	/* compute and append the two surrogates: */
				798
				799	/* translate from 10000..10FFFF to 0..FFFF */
				800	ch -= 0x10000;
				801
				802	/* high surrogate = top 10 bits added to D800 */
				803	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				804
				805	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	806	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	807	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	808	break;
				809
				810	default:
				811	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	812	errmsg = "unsupported Unicode code range";
				813	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	814	}
				815	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	816	continue;
				817
				818	utf8Error:
				819	if (utf8_decoding_error(&s, &p, errors, errmsg))
				820	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	821	}
				822
				823	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	824	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	825	goto onError;
				826
				827	return (PyObject *)unicode;
				828
				829	onError:
				830	Py_DECREF(unicode);
				831	return NULL;
				832	}
				833
/* No longer referenced now that the encoder handles UTF-16 surrogates
   itself; kept compiled out. */
#if 0
/* Apply the `errors` policy to a UTF-8 encoding problem described by
   `details`.  Returns 0 if encoding may continue, -1 with an exception
   set otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    /* NULL selects the same behaviour as "strict" */
    if (errors == NULL || strcmp(errors,"strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (strcmp(errors,"ignore") == 0)
        return 0;
    if (strcmp(errors,"replace") == 0) {
        /* Emit a '?' in place of the offending character */
        *(*dest)++ = '?';
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	867
				868	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				869	int size,
				870	const char *errors)
				871	{
				872	PyObject *v;
				873	char *p;
				874	char *q;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	875	Py_UCS4 ch2;
				876	unsigned int cbAllocated = 3 * size;
				877	unsigned int cbWritten = 0;
				878	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	879
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	880	v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	881	if (v == NULL)
				882	return NULL;
				883	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	884	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	885
				886	p = q = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	887	while (i < size) {
				888	Py_UCS4 ch = s[i++];
				889	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	890	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	891	cbWritten++;
				892	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	893	else if (ch < 0x0800) {
				894	*p++ = 0xc0 \| (ch >> 6);
				895	*p++ = 0x80 \| (ch & 0x3f);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	896	cbWritten += 2;
				897	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	898	else if (ch < 0x10000) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	899	/* Check for high surrogate */
				900	if (0xD800 <= ch && ch <= 0xDBFF) {
				901	if (i != size) {
				902	ch2 = s[i];
				903	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
				904
				905	if (cbWritten >= (cbAllocated - 4)) {
				906	/* Provide enough room for some more
				907	surrogates */
				908	cbAllocated += 4*10;
				909	if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	910	goto onError;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	911	}
				912
				913	/* combine the two values */
				914	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
				915
				916	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	917	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	918	i++;
				919	cbWritten += 4;
				920	}
				921	}
				922	}
				923	else {
				924	*p++ = (char)(0xe0 \| (ch >> 12));
				925	cbWritten += 3;
				926	}
				927	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				928	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	929	} else {
				930	*p++ = 0xf0 \| (ch>>18);
				931	*p++ = 0x80 \| ((ch>>12) & 0x3f);
				932	*p++ = 0x80 \| ((ch>>6) & 0x3f);
				933	*p++ = 0x80 \| (ch & 0x3f);
				934	cbWritten += 4;
				935	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	936	}
				937	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	938	if (_PyString_Resize(&v, p - q))
				939	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	940	return v;
				941
				942	onError:
				943	Py_DECREF(v);
				944	return NULL;
				945	}
				946
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	947	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				948	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	949	if (!PyUnicode_Check(unicode)) {
				950	PyErr_BadArgument();
				951	return NULL;
				952	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	953	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				954	PyUnicode_GET_SIZE(unicode),
				955	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	956	}
				957
				958	/* --- UTF-16 Codec ------------------------------------------------------- */
				959
				960	static
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	961	int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	962	Py_UNICODE **dest,
				963	const char *errors,
				964	const char *details)
				965	{
				966	if ((errors == NULL) \|\|
				967	(strcmp(errors,"strict") == 0)) {
				968	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	969	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	970	details);
				971	return -1;
				972	}
				973	else if (strcmp(errors,"ignore") == 0) {
				974	return 0;
				975	}
				976	else if (strcmp(errors,"replace") == 0) {
				977	if (dest) {
				978	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				979	(*dest)++;
				980	}
				981	return 0;
				982	}
				983	else {
				984	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	985	"UTF-16 decoding error; "
				986	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	987	errors);
				988	return -1;
				989	}
				990	}
				991
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	992	PyObject PyUnicode_DecodeUTF16(const char s,
				993	int size,
				994	const char *errors,
				995	int *byteorder)
				996	{
				997	PyUnicodeObject *unicode;
				998	Py_UNICODE *p;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	999	const Py_UCS2 q, e;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1000	int bo = 0;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1001	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1002
				1003	/* size should be an even number */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1004	if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1005	if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
				1006	return NULL;
				1007	/* The remaining input chars are ignored if we fall through
				1008	here... */
				1009	}
				1010
				1011	/* Note: size will always be longer than the resulting Unicode
				1012	character count */
				1013	unicode = _PyUnicode_New(size);
				1014	if (!unicode)
				1015	return NULL;
				1016	if (size == 0)
				1017	return (PyObject *)unicode;
				1018
				1019	/* Unpack UTF-16 encoded data */
				1020	p = unicode->str;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1021	q = (Py_UCS2 *)s;
				1022	e = q + (size / sizeof(Py_UCS2));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023
				1024	if (byteorder)
				1025	bo = *byteorder;
				1026
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1027	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1028	byte order setting accordingly. In native mode, the leading BOM
				1029	mark is skipped, in all other modes, it is copied to the output
				1030	stream as-is (giving a ZWNBSP character). */
				1031	if (bo == 0) {
				1032	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1033	if (*q == 0xFEFF) {
				1034	q++;
				1035	bo = -1;
				1036	} else if (*q == 0xFFFE) {
				1037	q++;
				1038	bo = 1;
				1039	}
				1040	#else
				1041	if (*q == 0xFEFF) {
				1042	q++;
				1043	bo = 1;
				1044	} else if (*q == 0xFFFE) {
				1045	q++;
				1046	bo = -1;
				1047	}
				1048	#endif
				1049	}
				1050
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1051	while (q < e) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1052	register Py_UCS2 ch = *q++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1053
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1054	/* Swap input bytes if needed. (This assumes
				1055	sizeof(Py_UNICODE) == 2 !) */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1056	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	if (bo == 1)
				1058	ch = (ch >> 8) \| (ch << 8);
				1059	#else
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1060	if (bo == -1)
				1061	ch = (ch >> 8) \| (ch << 8);
				1062	#endif
				1063	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1064	*p++ = ch;
				1065	continue;
				1066	}
				1067
				1068	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1069	if (q >= e) {
				1070	errmsg = "unexpected end of data";
				1071	goto utf16Error;
				1072	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1073	if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1074	Py_UCS2 ch2 = *q++;
				1075	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1076	if (bo == 1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1077	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1078	#else
				1079	if (bo == -1)
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1080	ch2 = (ch2 >> 8) \| (ch2 << 8);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1081	#endif
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1082	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1083	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1084	*p++ = ch;
				1085	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1086	#else
				1087	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1088	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1089	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1090	}
				1091	else {
				1092	errmsg = "illegal UTF-16 surrogate";
				1093	goto utf16Error;
				1094	}
				1095
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1096	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1097	errmsg = "illegal encoding";
				1098	/* Fall through to report the error */
				1099
				1100	utf16Error:
				1101	if (utf16_decoding_error(&q, &p, errors, errmsg))
				1102	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1103	}
				1104
				1105	if (byteorder)
				1106	*byteorder = bo;
				1107
				1108	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1109	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1110	goto onError;
				1111
				1112	return (PyObject *)unicode;
				1113
				1114	onError:
				1115	Py_DECREF(unicode);
				1116	return NULL;
				1117	}
				1118
				1119	#undef UTF16_ERROR
				1120
				1121	PyObject PyUnicode_EncodeUTF16(const Py_UNICODE s,
				1122	int size,
				1123	const char *errors,
				1124	int byteorder)
				1125	{
				1126	PyObject *v;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1127	Py_UCS2 *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	char *q;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1129	int i, pairs, doswap = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1130
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1131	for (i = pairs = 0; i < size; i++)
				1132	if (s[i] >= 0x10000)
				1133	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1134	v = PyString_FromStringAndSize(NULL,
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1135	sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1136	if (v == NULL)
				1137	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1138
				1139	q = PyString_AS_STRING(v);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1140	p = (Py_UCS2 *)q;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1141	if (byteorder == 0)
				1142	*p++ = 0xFEFF;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1143	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1144	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1145	if (byteorder == 0 \|\|
				1146	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1147	byteorder == -1
				1148	#else
				1149	byteorder == 1
				1150	#endif
				1151	)
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1152	doswap = 0;
				1153	while (size-- > 0) {
				1154	Py_UNICODE ch = *s++;
				1155	Py_UNICODE ch2 = 0;
				1156	if (ch >= 0x10000) {
				1157	ch2 = 0xDC00\|((ch-0x10000) & 0x3FF);
				1158	ch = 0xD800\|((ch-0x10000)>>10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1159	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1160	if (doswap){
				1161	*p++ = (ch >> 8) \| (ch << 8);
				1162	if (ch2)
				1163	*p++ = (ch2 >> 8) \| (ch2 << 8);
				1164	}else{
				1165	*p++ = ch;
				1166	if(ch2)
				1167	*p++ = ch2;
				1168	}
				1169	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1170	return v;
				1171	}
				1172
				1173	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1174	{
				1175	if (!PyUnicode_Check(unicode)) {
				1176	PyErr_BadArgument();
				1177	return NULL;
				1178	}
				1179	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1180	PyUnicode_GET_SIZE(unicode),
				1181	NULL,
				1182	0);
				1183	}
				1184
				1185	/* --- Unicode Escape Codec ----------------------------------------------- */
				1186
				1187	static
				1188	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1189	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1190	const char *errors,
				1191	const char *details)
				1192	{
				1193	if ((errors == NULL) \|\|
				1194	(strcmp(errors,"strict") == 0)) {
				1195	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1196	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1197	details);
				1198	return -1;
				1199	}
				1200	else if (strcmp(errors,"ignore") == 0) {
				1201	return 0;
				1202	}
				1203	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1204	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1205	return 0;
				1206	}
				1207	else {
				1208	PyErr_Format(PyExc_ValueError,
				1209	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1210	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1211	errors);
				1212	return -1;
				1213	}
				1214	}
				1215
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1216	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1217
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1218	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1219	int size,
				1220	const char *errors)
				1221	{
				1222	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1223	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1224	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1225	char* message;
				1226	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1227
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1228	/* Escaped strings will always be longer than the resulting
				1229	Unicode string, so we start with size here and then reduce the
				1230	length after conversion to the true value. */
				1231	v = _PyUnicode_New(size);
				1232	if (v == NULL)
				1233	goto onError;
				1234	if (size == 0)
				1235	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1236
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1237	p = buf = PyUnicode_AS_UNICODE(v);
				1238	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1239
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1240	while (s < end) {
				1241	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1242	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1243	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1244
				1245	/* Non-escape characters are interpreted as Unicode ordinals */
				1246	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1247	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1248	continue;
				1249	}
				1250
				1251	/* \ - Escapes */
				1252	s++;
				1253	switch (*s++) {
				1254
				1255	/* \x escapes */
				1256	case '\n': break;
				1257	case '\\': *p++ = '\\'; break;
				1258	case '\'': *p++ = '\''; break;
				1259	case '\"': *p++ = '\"'; break;
				1260	case 'b': *p++ = '\b'; break;
				1261	case 'f': p++ = '\014'; break; / FF */
				1262	case 't': *p++ = '\t'; break;
				1263	case 'n': *p++ = '\n'; break;
				1264	case 'r': *p++ = '\r'; break;
				1265	case 'v': p++ = '\013'; break; / VT */
				1266	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1267
				1268	/* \OOO (octal) escapes */
				1269	case '0': case '1': case '2': case '3':
				1270	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1271	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1272	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1273	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1274	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1275	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1276	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1277	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1278	break;
				1279
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1280	/* hex escapes */
				1281	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1282	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1283	digits = 2;
				1284	message = "truncated \\xXX escape";
				1285	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1286
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1287	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1289	digits = 4;
				1290	message = "truncated \\uXXXX escape";
				1291	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1292
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1293	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1294	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1295	digits = 8;
				1296	message = "truncated \\UXXXXXXXX escape";
				1297	hexescape:
				1298	chr = 0;
				1299	for (i = 0; i < digits; i++) {
				1300	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1301	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1302	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1303	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1304	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1305	i++;
				1306	break;
				1307	}
				1308	chr = (chr<<4) & ~0xF;
				1309	if (c >= '0' && c <= '9')
				1310	chr += c - '0';
				1311	else if (c >= 'a' && c <= 'f')
				1312	chr += 10 + c - 'a';
				1313	else
				1314	chr += 10 + c - 'A';
				1315	}
				1316	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1317	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1318	/* when we get here, chr is a 32-bit unicode character */
				1319	if (chr <= 0xffff)
				1320	/* UCS-2 character */
				1321	*p++ = (Py_UNICODE) chr;
				1322	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1323	/* UCS-4 character. Either store directly, or as
				1324	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1325	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1326	*p++ = chr;
				1327	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1328	chr -= 0x10000L;
				1329	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1330	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1331	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1332	} else {
				1333	if (unicodeescape_decoding_error(
				1334	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1335	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1336	)
				1337	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1338	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1339	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1340	break;
				1341
				1342	/* \N{name} */
				1343	case 'N':
				1344	message = "malformed \\N character escape";
				1345	if (ucnhash_CAPI == NULL) {
				1346	/* load the unicode data module */
				1347	PyObject m, v;
				1348	m = PyImport_ImportModule("unicodedata");
				1349	if (m == NULL)
				1350	goto ucnhashError;
				1351	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1352	Py_DECREF(m);
				1353	if (v == NULL)
				1354	goto ucnhashError;
				1355	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1356	Py_DECREF(v);
				1357	if (ucnhash_CAPI == NULL)
				1358	goto ucnhashError;
				1359	}
				1360	if (*s == '{') {
				1361	const char *start = s+1;
				1362	/* look for the closing brace */
				1363	while (*s != '}' && s < end)
				1364	s++;
				1365	if (s > start && s < end && *s == '}') {
				1366	/* found a name. look it up in the unicode database */
				1367	message = "unknown Unicode character name";
				1368	s++;
				1369	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1370	goto store;
				1371	}
				1372	}
				1373	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1374	goto onError;
				1375	*p++ = x;
				1376	break;
				1377
				1378	default:
				1379	*p++ = '\\';
				1380	*p++ = (unsigned char)s[-1];
				1381	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1382	}
				1383	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1384	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1385	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1386	return (PyObject *)v;
				1387
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1388	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1389	PyErr_SetString(
				1390	PyExc_UnicodeError,
				1391	"\\N escapes not supported (can't load unicodedata module)"
				1392	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1393	return NULL;
				1394
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1395	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1396	Py_XDECREF(v);
				1397	return NULL;
				1398	}
				1399
				1400	/* Return a Unicode-Escape string version of the Unicode object.
				1401
				1402	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1403	appropriate.
				1404
				1405	*/
				1406
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1407	static const Py_UNICODE findchar(const Py_UNICODE s,
				1408	int size,
				1409	Py_UNICODE ch);
				1410
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1411	static
				1412	PyObject unicodeescape_string(const Py_UNICODE s,
				1413	int size,
				1414	int quotes)
				1415	{
				1416	PyObject *repr;
				1417	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1418
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1419	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1420
				1421	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1422	if (repr == NULL)
				1423	return NULL;
				1424
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1425	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1426
				1427	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1428	*p++ = 'u';
				1429	*p++ = (findchar(s, size, '\'') &&
				1430	!findchar(s, size, '"')) ? '"' : '\'';
				1431	}
				1432	while (size-- > 0) {
				1433	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1434
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1435	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1436	if (quotes &&
				1437	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1438	*p++ = '\\';
				1439	*p++ = (char) ch;
				1440	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1441
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1442	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1443	/* Map 21-bit characters to '\U00xxxxxx' */
				1444	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1445	int offset = p - PyString_AS_STRING(repr);
				1446
				1447	/* Resize the string if necessary */
				1448	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1449	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1450	goto onError;
				1451	p = PyString_AS_STRING(repr) + offset;
				1452	}
				1453
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1454	*p++ = '\\';
				1455	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1456	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1457	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1458	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1459	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1460	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1461	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1462	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1463	*p++ = hexdigit[ch & 0x0000000F];
				1464	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1465	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1466	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1467	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1468	else if (ch >= 0xD800 && ch < 0xDC00) {
				1469	Py_UNICODE ch2;
				1470	Py_UCS4 ucs;
				1471
				1472	ch2 = *s++;
				1473	size--;
				1474	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1475	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1476	*p++ = '\\';
				1477	*p++ = 'U';
				1478	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1479	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1480	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1481	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1482	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1483	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1484	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1485	*p++ = hexdigit[ucs & 0x0000000F];
				1486	continue;
				1487	}
				1488	/* Fall through: isolated surrogates are copied as-is */
				1489	s--;
				1490	size++;
				1491	}
				1492
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1493	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1494	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1495	*p++ = '\\';
				1496	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1497	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1498	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1499	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1500	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1501	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1502
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1503	/* Map special whitespace to '\t', \n', '\r' */
				1504	else if (ch == '\t') {
				1505	*p++ = '\\';
				1506	*p++ = 't';
				1507	}
				1508	else if (ch == '\n') {
				1509	*p++ = '\\';
				1510	*p++ = 'n';
				1511	}
				1512	else if (ch == '\r') {
				1513	*p++ = '\\';
				1514	*p++ = 'r';
				1515	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1516
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1517	/* Map non-printable US ASCII to '\xhh' */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1518	else if (ch < ' ' \|\| ch >= 128) {
				1519	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1520	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1521	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1522	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1523	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1524
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1525	/* Copy everything else as-is */
				1526	else
				1527	*p++ = (char) ch;
				1528	}
				1529	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1530	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1531
				1532	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame^]	1533	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1534	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1535
				1536	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1537
				1538	onError:
				1539	Py_DECREF(repr);
				1540	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1541	}
				1542
				1543	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1544	int size)
				1545	{
				1546	return unicodeescape_string(s, size, 0);
				1547	}
				1548
				1549	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1550	{
				1551	if (!PyUnicode_Check(unicode)) {
				1552	PyErr_BadArgument();
				1553	return NULL;
				1554	}
				1555	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1556	PyUnicode_GET_SIZE(unicode));
				1557	}
				1558
				1559	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1560
				1561	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1562	int size,
				1563	const char *errors)
				1564	{
				1565	PyUnicodeObject *v;
				1566	Py_UNICODE p, buf;
				1567	const char *end;
				1568	const char *bs;
				1569
				1570	/* Escaped strings will always be longer than the resulting
				1571	Unicode string, so we start with size here and then reduce the
				1572	length after conversion to the true value. */
				1573	v = _PyUnicode_New(size);
				1574	if (v == NULL)
				1575	goto onError;
				1576	if (size == 0)
				1577	return (PyObject *)v;
				1578	p = buf = PyUnicode_AS_UNICODE(v);
				1579	end = s + size;
				1580	while (s < end) {
				1581	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1582	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1583	int i;
				1584
				1585	/* Non-escape characters are interpreted as Unicode ordinals */
				1586	if (*s != '\\') {
				1587	p++ = (unsigned char)s++;
				1588	continue;
				1589	}
				1590
				1591	/* \u-escapes are only interpreted iff the number of leading
				1592	backslashes if odd */
				1593	bs = s;
				1594	for (;s < end;) {
				1595	if (*s != '\\')
				1596	break;
				1597	p++ = (unsigned char)s++;
				1598	}
				1599	if (((s - bs) & 1) == 0 \|\|
				1600	s >= end \|\|
				1601	*s != 'u') {
				1602	continue;
				1603	}
				1604	p--;
				1605	s++;
				1606
				1607	/* \uXXXX with 4 hex digits */
				1608	for (x = 0, i = 0; i < 4; i++) {
				1609	c = (unsigned char)s[i];
				1610	if (!isxdigit(c)) {
				1611	if (unicodeescape_decoding_error(&s, &x, errors,
				1612	"truncated \\uXXXX"))
				1613	goto onError;
				1614	i++;
				1615	break;
				1616	}
				1617	x = (x<<4) & ~0xF;
				1618	if (c >= '0' && c <= '9')
				1619	x += c - '0';
				1620	else if (c >= 'a' && c <= 'f')
				1621	x += 10 + c - 'a';
				1622	else
				1623	x += 10 + c - 'A';
				1624	}
				1625	s += i;
				1626	*p++ = x;
				1627	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1628	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1629	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1630	return (PyObject *)v;
				1631
				1632	onError:
				1633	Py_XDECREF(v);
				1634	return NULL;
				1635	}
				1636
				1637	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1638	int size)
				1639	{
				1640	PyObject *repr;
				1641	char *p;
				1642	char *q;
				1643
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1644	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1645
				1646	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1647	if (repr == NULL)
				1648	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1649	if (size == 0)
				1650	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1651
				1652	p = q = PyString_AS_STRING(repr);
				1653	while (size-- > 0) {
				1654	Py_UNICODE ch = *s++;
				1655	/* Map 16-bit characters to '\uxxxx' */
				1656	if (ch >= 256) {
				1657	*p++ = '\\';
				1658	*p++ = 'u';
				1659	*p++ = hexdigit[(ch >> 12) & 0xf];
				1660	*p++ = hexdigit[(ch >> 8) & 0xf];
				1661	*p++ = hexdigit[(ch >> 4) & 0xf];
				1662	*p++ = hexdigit[ch & 15];
				1663	}
				1664	/* Copy everything else as-is */
				1665	else
				1666	*p++ = (char) ch;
				1667	}
				1668	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1669	if (_PyString_Resize(&repr, p - q))
				1670	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1671
				1672	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1673
				1674	onError:
				1675	Py_DECREF(repr);
				1676	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1677	}
				1678
				1679	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				1680	{
				1681	if (!PyUnicode_Check(unicode)) {
				1682	PyErr_BadArgument();
				1683	return NULL;
				1684	}
				1685	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1686	PyUnicode_GET_SIZE(unicode));
				1687	}
				1688
				1689	/* --- Latin-1 Codec ------------------------------------------------------ */
				1690
				1691	PyObject PyUnicode_DecodeLatin1(const char s,
				1692	int size,
				1693	const char *errors)
				1694	{
				1695	PyUnicodeObject *v;
				1696	Py_UNICODE *p;
				1697
				1698	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1699	if (size == 1 && (unsigned char)s < 256) {
				1700	Py_UNICODE r = (unsigned char)s;
				1701	return PyUnicode_FromUnicode(&r, 1);
				1702	}
				1703
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1704	v = _PyUnicode_New(size);
				1705	if (v == NULL)
				1706	goto onError;
				1707	if (size == 0)
				1708	return (PyObject *)v;
				1709	p = PyUnicode_AS_UNICODE(v);
				1710	while (size-- > 0)
				1711	p++ = (unsigned char)s++;
				1712	return (PyObject *)v;
				1713
				1714	onError:
				1715	Py_XDECREF(v);
				1716	return NULL;
				1717	}
				1718
				1719	static
				1720	int latin1_encoding_error(const Py_UNICODE **source,
				1721	char **dest,
				1722	const char *errors,
				1723	const char *details)
				1724	{
				1725	if ((errors == NULL) \|\|
				1726	(strcmp(errors,"strict") == 0)) {
				1727	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1728	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1729	details);
				1730	return -1;
				1731	}
				1732	else if (strcmp(errors,"ignore") == 0) {
				1733	return 0;
				1734	}
				1735	else if (strcmp(errors,"replace") == 0) {
				1736	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1737	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1738	return 0;
				1739	}
				1740	else {
				1741	PyErr_Format(PyExc_ValueError,
				1742	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1743	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1744	errors);
				1745	return -1;
				1746	}
				1747	}
				1748
				1749	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				1750	int size,
				1751	const char *errors)
				1752	{
				1753	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1754	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1755
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1756	repr = PyString_FromStringAndSize(NULL, size);
				1757	if (repr == NULL)
				1758	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1759	if (size == 0)
				1760	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1761
				1762	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1763	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1764	while (size-- > 0) {
				1765	Py_UNICODE ch = *p++;
				1766	if (ch >= 256) {
				1767	if (latin1_encoding_error(&p, &s, errors,
				1768	"ordinal not in range(256)"))
				1769	goto onError;
				1770	}
				1771	else
				1772	*s++ = (char)ch;
				1773	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1774	/* Resize if error handling skipped some characters */
				1775	if (s - start < PyString_GET_SIZE(repr))
				1776	if (_PyString_Resize(&repr, s - start))
				1777	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1778	return repr;
				1779
				1780	onError:
				1781	Py_DECREF(repr);
				1782	return NULL;
				1783	}
				1784
				1785	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				1786	{
				1787	if (!PyUnicode_Check(unicode)) {
				1788	PyErr_BadArgument();
				1789	return NULL;
				1790	}
				1791	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				1792	PyUnicode_GET_SIZE(unicode),
				1793	NULL);
				1794	}
				1795
				1796	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				1797
				1798	static
				1799	int ascii_decoding_error(const char **source,
				1800	Py_UNICODE **dest,
				1801	const char *errors,
				1802	const char *details)
				1803	{
				1804	if ((errors == NULL) \|\|
				1805	(strcmp(errors,"strict") == 0)) {
				1806	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1807	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1808	details);
				1809	return -1;
				1810	}
				1811	else if (strcmp(errors,"ignore") == 0) {
				1812	return 0;
				1813	}
				1814	else if (strcmp(errors,"replace") == 0) {
				1815	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1816	(*dest)++;
				1817	return 0;
				1818	}
				1819	else {
				1820	PyErr_Format(PyExc_ValueError,
				1821	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1822	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1823	errors);
				1824	return -1;
				1825	}
				1826	}
				1827
				1828	PyObject PyUnicode_DecodeASCII(const char s,
				1829	int size,
				1830	const char *errors)
				1831	{
				1832	PyUnicodeObject *v;
				1833	Py_UNICODE *p;
				1834
				1835	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1836	if (size == 1 && (unsigned char)s < 128) {
				1837	Py_UNICODE r = (unsigned char)s;
				1838	return PyUnicode_FromUnicode(&r, 1);
				1839	}
				1840
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1841	v = _PyUnicode_New(size);
				1842	if (v == NULL)
				1843	goto onError;
				1844	if (size == 0)
				1845	return (PyObject *)v;
				1846	p = PyUnicode_AS_UNICODE(v);
				1847	while (size-- > 0) {
				1848	register unsigned char c;
				1849
				1850	c = (unsigned char)*s++;
				1851	if (c < 128)
				1852	*p++ = c;
				1853	else if (ascii_decoding_error(&s, &p, errors,
				1854	"ordinal not in range(128)"))
				1855	goto onError;
				1856	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1857	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1858	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1859	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1860	return (PyObject *)v;
				1861
				1862	onError:
				1863	Py_XDECREF(v);
				1864	return NULL;
				1865	}
				1866
				1867	static
				1868	int ascii_encoding_error(const Py_UNICODE **source,
				1869	char **dest,
				1870	const char *errors,
				1871	const char *details)
				1872	{
				1873	if ((errors == NULL) \|\|
				1874	(strcmp(errors,"strict") == 0)) {
				1875	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1876	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1877	details);
				1878	return -1;
				1879	}
				1880	else if (strcmp(errors,"ignore") == 0) {
				1881	return 0;
				1882	}
				1883	else if (strcmp(errors,"replace") == 0) {
				1884	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1885	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1886	return 0;
				1887	}
				1888	else {
				1889	PyErr_Format(PyExc_ValueError,
				1890	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1891	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1892	errors);
				1893	return -1;
				1894	}
				1895	}
				1896
				1897	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				1898	int size,
				1899	const char *errors)
				1900	{
				1901	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1902	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1903
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1904	repr = PyString_FromStringAndSize(NULL, size);
				1905	if (repr == NULL)
				1906	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1907	if (size == 0)
				1908	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1909
				1910	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1911	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1912	while (size-- > 0) {
				1913	Py_UNICODE ch = *p++;
				1914	if (ch >= 128) {
				1915	if (ascii_encoding_error(&p, &s, errors,
				1916	"ordinal not in range(128)"))
				1917	goto onError;
				1918	}
				1919	else
				1920	*s++ = (char)ch;
				1921	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1922	/* Resize if error handling skipped some characters */
				1923	if (s - start < PyString_GET_SIZE(repr))
				1924	if (_PyString_Resize(&repr, s - start))
				1925	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1926	return repr;
				1927
				1928	onError:
				1929	Py_DECREF(repr);
				1930	return NULL;
				1931	}
				1932
				1933	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				1934	{
				1935	if (!PyUnicode_Check(unicode)) {
				1936	PyErr_BadArgument();
				1937	return NULL;
				1938	}
				1939	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				1940	PyUnicode_GET_SIZE(unicode),
				1941	NULL);
				1942	}
				1943
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	1944	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1945
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1946	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	1947
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1948	PyObject PyUnicode_DecodeMBCS(const char s,
				1949	int size,
				1950	const char *errors)
				1951	{
				1952	PyUnicodeObject *v;
				1953	Py_UNICODE *p;
				1954
				1955	/* First get the size of the result */
				1956	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1957	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1958	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1959
				1960	v = _PyUnicode_New(usize);
				1961	if (v == NULL)
				1962	return NULL;
				1963	if (usize == 0)
				1964	return (PyObject *)v;
				1965	p = PyUnicode_AS_UNICODE(v);
				1966	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				1967	Py_DECREF(v);
				1968	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1969	}
				1970
				1971	return (PyObject *)v;
				1972	}
				1973
				1974	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				1975	int size,
				1976	const char *errors)
				1977	{
				1978	PyObject *repr;
				1979	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1980	DWORD mbcssize;
				1981
				1982	/* If there are no characters, bail now! */
				1983	if (size==0)
				1984	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1985
				1986	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	1987	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1988	if (mbcssize==0)
				1989	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				1990
				1991	repr = PyString_FromStringAndSize(NULL, mbcssize);
				1992	if (repr == NULL)
				1993	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1994	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	1995	return repr;
				1996
				1997	/* Do the conversion */
				1998	s = PyString_AS_STRING(repr);
				1999	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2000	Py_DECREF(repr);
				2001	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2002	}
				2003	return repr;
				2004	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2005
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2006	#endif /* MS_WIN32 */
				2007
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2008	/* --- Character Mapping Codec -------------------------------------------- */
				2009
				2010	static
				2011	int charmap_decoding_error(const char **source,
				2012	Py_UNICODE **dest,
				2013	const char *errors,
				2014	const char *details)
				2015	{
				2016	if ((errors == NULL) \|\|
				2017	(strcmp(errors,"strict") == 0)) {
				2018	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2019	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2020	details);
				2021	return -1;
				2022	}
				2023	else if (strcmp(errors,"ignore") == 0) {
				2024	return 0;
				2025	}
				2026	else if (strcmp(errors,"replace") == 0) {
				2027	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2028	(*dest)++;
				2029	return 0;
				2030	}
				2031	else {
				2032	PyErr_Format(PyExc_ValueError,
				2033	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2034	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2035	errors);
				2036	return -1;
				2037	}
				2038	}
				2039
				2040	PyObject PyUnicode_DecodeCharmap(const char s,
				2041	int size,
				2042	PyObject *mapping,
				2043	const char *errors)
				2044	{
				2045	PyUnicodeObject *v;
				2046	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2047	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2048
				2049	/* Default to Latin-1 */
				2050	if (mapping == NULL)
				2051	return PyUnicode_DecodeLatin1(s, size, errors);
				2052
				2053	v = _PyUnicode_New(size);
				2054	if (v == NULL)
				2055	goto onError;
				2056	if (size == 0)
				2057	return (PyObject *)v;
				2058	p = PyUnicode_AS_UNICODE(v);
				2059	while (size-- > 0) {
				2060	unsigned char ch = *s++;
				2061	PyObject w, x;
				2062
				2063	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2064	w = PyInt_FromLong((long)ch);
				2065	if (w == NULL)
				2066	goto onError;
				2067	x = PyObject_GetItem(mapping, w);
				2068	Py_DECREF(w);
				2069	if (x == NULL) {
				2070	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2071	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2072	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2073	x = Py_None;
				2074	Py_INCREF(x);
				2075	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2076	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2077	}
				2078
				2079	/* Apply mapping */
				2080	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2081	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2082	if (value < 0 \|\| value > 65535) {
				2083	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2084	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2085	Py_DECREF(x);
				2086	goto onError;
				2087	}
				2088	*p++ = (Py_UNICODE)value;
				2089	}
				2090	else if (x == Py_None) {
				2091	/* undefined mapping */
				2092	if (charmap_decoding_error(&s, &p, errors,
				2093	"character maps to <undefined>")) {
				2094	Py_DECREF(x);
				2095	goto onError;
				2096	}
				2097	}
				2098	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2099	int targetsize = PyUnicode_GET_SIZE(x);
				2100
				2101	if (targetsize == 1)
				2102	/* 1-1 mapping */
				2103	p++ = PyUnicode_AS_UNICODE(x);
				2104
				2105	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2106	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2107	if (targetsize > extrachars) {
				2108	/* resize first */
				2109	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2110	int needed = (targetsize - extrachars) + \
				2111	(targetsize << 2);
				2112	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2113	if (_PyUnicode_Resize(&v,
				2114	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2115	Py_DECREF(x);
				2116	goto onError;
				2117	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2118	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2119	}
				2120	Py_UNICODE_COPY(p,
				2121	PyUnicode_AS_UNICODE(x),
				2122	targetsize);
				2123	p += targetsize;
				2124	extrachars -= targetsize;
				2125	}
				2126	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2127	}
				2128	else {
				2129	/* wrong return value */
				2130	PyErr_SetString(PyExc_TypeError,
				2131	"character mapping must return integer, None or unicode");
				2132	Py_DECREF(x);
				2133	goto onError;
				2134	}
				2135	Py_DECREF(x);
				2136	}
				2137	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2138	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2139	goto onError;
				2140	return (PyObject *)v;
				2141
				2142	onError:
				2143	Py_XDECREF(v);
				2144	return NULL;
				2145	}
				2146
				2147	static
				2148	int charmap_encoding_error(const Py_UNICODE **source,
				2149	char **dest,
				2150	const char *errors,
				2151	const char *details)
				2152	{
				2153	if ((errors == NULL) \|\|
				2154	(strcmp(errors,"strict") == 0)) {
				2155	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2156	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2157	details);
				2158	return -1;
				2159	}
				2160	else if (strcmp(errors,"ignore") == 0) {
				2161	return 0;
				2162	}
				2163	else if (strcmp(errors,"replace") == 0) {
				2164	**dest = '?';
				2165	(*dest)++;
				2166	return 0;
				2167	}
				2168	else {
				2169	PyErr_Format(PyExc_ValueError,
				2170	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2171	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2172	errors);
				2173	return -1;
				2174	}
				2175	}
				2176
				2177	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2178	int size,
				2179	PyObject *mapping,
				2180	const char *errors)
				2181	{
				2182	PyObject *v;
				2183	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2184	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2185
				2186	/* Default to Latin-1 */
				2187	if (mapping == NULL)
				2188	return PyUnicode_EncodeLatin1(p, size, errors);
				2189
				2190	v = PyString_FromStringAndSize(NULL, size);
				2191	if (v == NULL)
				2192	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2193	if (size == 0)
				2194	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2195	s = PyString_AS_STRING(v);
				2196	while (size-- > 0) {
				2197	Py_UNICODE ch = *p++;
				2198	PyObject w, x;
				2199
				2200	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2201	w = PyInt_FromLong((long)ch);
				2202	if (w == NULL)
				2203	goto onError;
				2204	x = PyObject_GetItem(mapping, w);
				2205	Py_DECREF(w);
				2206	if (x == NULL) {
				2207	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2208	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2209	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2210	x = Py_None;
				2211	Py_INCREF(x);
				2212	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2213	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2214	}
				2215
				2216	/* Apply mapping */
				2217	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2218	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2219	if (value < 0 \|\| value > 255) {
				2220	PyErr_SetString(PyExc_TypeError,
				2221	"character mapping must be in range(256)");
				2222	Py_DECREF(x);
				2223	goto onError;
				2224	}
				2225	*s++ = (char)value;
				2226	}
				2227	else if (x == Py_None) {
				2228	/* undefined mapping */
				2229	if (charmap_encoding_error(&p, &s, errors,
				2230	"character maps to <undefined>")) {
				2231	Py_DECREF(x);
				2232	goto onError;
				2233	}
				2234	}
				2235	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2236	int targetsize = PyString_GET_SIZE(x);
				2237
				2238	if (targetsize == 1)
				2239	/* 1-1 mapping */
				2240	s++ = PyString_AS_STRING(x);
				2241
				2242	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2243	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2244	if (targetsize > extrachars) {
				2245	/* resize first */
				2246	int oldpos = (int)(s - PyString_AS_STRING(v));
				2247	int needed = (targetsize - extrachars) + \
				2248	(targetsize << 2);
				2249	extrachars += needed;
				2250	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2251	Py_DECREF(x);
				2252	goto onError;
				2253	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2254	s = PyString_AS_STRING(v) + oldpos;
				2255	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2256	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2257	s += targetsize;
				2258	extrachars -= targetsize;
				2259	}
				2260	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2261	}
				2262	else {
				2263	/* wrong return value */
				2264	PyErr_SetString(PyExc_TypeError,
				2265	"character mapping must return integer, None or unicode");
				2266	Py_DECREF(x);
				2267	goto onError;
				2268	}
				2269	Py_DECREF(x);
				2270	}
				2271	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2272	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2273	goto onError;
				2274	return v;
				2275
				2276	onError:
				2277	Py_DECREF(v);
				2278	return NULL;
				2279	}
				2280
				2281	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2282	PyObject *mapping)
				2283	{
				2284	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2285	PyErr_BadArgument();
				2286	return NULL;
				2287	}
				2288	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2289	PyUnicode_GET_SIZE(unicode),
				2290	mapping,
				2291	NULL);
				2292	}
				2293
				2294	static
				2295	int translate_error(const Py_UNICODE **source,
				2296	Py_UNICODE **dest,
				2297	const char *errors,
				2298	const char *details)
				2299	{
				2300	if ((errors == NULL) \|\|
				2301	(strcmp(errors,"strict") == 0)) {
				2302	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2303	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2304	details);
				2305	return -1;
				2306	}
				2307	else if (strcmp(errors,"ignore") == 0) {
				2308	return 0;
				2309	}
				2310	else if (strcmp(errors,"replace") == 0) {
				2311	**dest = '?';
				2312	(*dest)++;
				2313	return 0;
				2314	}
				2315	else {
				2316	PyErr_Format(PyExc_ValueError,
				2317	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2318	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2319	errors);
				2320	return -1;
				2321	}
				2322	}
				2323
				2324	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2325	int size,
				2326	PyObject *mapping,
				2327	const char *errors)
				2328	{
				2329	PyUnicodeObject *v;
				2330	Py_UNICODE *p;
				2331
				2332	if (mapping == NULL) {
				2333	PyErr_BadArgument();
				2334	return NULL;
				2335	}
				2336
				2337	/* Output will never be longer than input */
				2338	v = _PyUnicode_New(size);
				2339	if (v == NULL)
				2340	goto onError;
				2341	if (size == 0)
				2342	goto done;
				2343	p = PyUnicode_AS_UNICODE(v);
				2344	while (size-- > 0) {
				2345	Py_UNICODE ch = *s++;
				2346	PyObject w, x;
				2347
				2348	/* Get mapping */
				2349	w = PyInt_FromLong(ch);
				2350	if (w == NULL)
				2351	goto onError;
				2352	x = PyObject_GetItem(mapping, w);
				2353	Py_DECREF(w);
				2354	if (x == NULL) {
				2355	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2356	/* No mapping found: default to 1-1 mapping */
				2357	PyErr_Clear();
				2358	*p++ = ch;
				2359	continue;
				2360	}
				2361	goto onError;
				2362	}
				2363
				2364	/* Apply mapping */
				2365	if (PyInt_Check(x))
				2366	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2367	else if (x == Py_None) {
				2368	/* undefined mapping */
				2369	if (translate_error(&s, &p, errors,
				2370	"character maps to <undefined>")) {
				2371	Py_DECREF(x);
				2372	goto onError;
				2373	}
				2374	}
				2375	else if (PyUnicode_Check(x)) {
				2376	if (PyUnicode_GET_SIZE(x) != 1) {
				2377	/* 1-n mapping */
				2378	PyErr_SetString(PyExc_NotImplementedError,
				2379	"1-n mappings are currently not implemented");
				2380	Py_DECREF(x);
				2381	goto onError;
				2382	}
				2383	p++ = PyUnicode_AS_UNICODE(x);
				2384	}
				2385	else {
				2386	/* wrong return value */
				2387	PyErr_SetString(PyExc_TypeError,
				2388	"translate mapping must return integer, None or unicode");
				2389	Py_DECREF(x);
				2390	goto onError;
				2391	}
				2392	Py_DECREF(x);
				2393	}
				2394	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2395	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2396	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2397
				2398	done:
				2399	return (PyObject *)v;
				2400
				2401	onError:
				2402	Py_XDECREF(v);
				2403	return NULL;
				2404	}
				2405
				2406	PyObject PyUnicode_Translate(PyObject str,
				2407	PyObject *mapping,
				2408	const char *errors)
				2409	{
				2410	PyObject *result;
				2411
				2412	str = PyUnicode_FromObject(str);
				2413	if (str == NULL)
				2414	goto onError;
				2415	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2416	PyUnicode_GET_SIZE(str),
				2417	mapping,
				2418	errors);
				2419	Py_DECREF(str);
				2420	return result;
				2421
				2422	onError:
				2423	Py_XDECREF(str);
				2424	return NULL;
				2425	}
				2426
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2427	/* --- Decimal Encoder ---------------------------------------------------- */
				2428
				2429	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2430	int length,
				2431	char *output,
				2432	const char *errors)
				2433	{
				2434	Py_UNICODE p, end;
				2435
				2436	if (output == NULL) {
				2437	PyErr_BadArgument();
				2438	return -1;
				2439	}
				2440
				2441	p = s;
				2442	end = s + length;
				2443	while (p < end) {
				2444	register Py_UNICODE ch = *p++;
				2445	int decimal;
				2446
				2447	if (Py_UNICODE_ISSPACE(ch)) {
				2448	*output++ = ' ';
				2449	continue;
				2450	}
				2451	decimal = Py_UNICODE_TODECIMAL(ch);
				2452	if (decimal >= 0) {
				2453	*output++ = '0' + decimal;
				2454	continue;
				2455	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2456	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2457	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2458	continue;
				2459	}
				2460	/* All other characters are considered invalid */
				2461	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2462	PyErr_SetString(PyExc_ValueError,
				2463	"invalid decimal Unicode string");
				2464	goto onError;
				2465	}
				2466	else if (strcmp(errors, "ignore") == 0)
				2467	continue;
				2468	else if (strcmp(errors, "replace") == 0) {
				2469	*output++ = '?';
				2470	continue;
				2471	}
				2472	}
				2473	/* 0-terminate the output string */
				2474	*output++ = '\0';
				2475	return 0;
				2476
				2477	onError:
				2478	return -1;
				2479	}
				2480
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2481	/* --- Helpers ------------------------------------------------------------ */
				2482
				2483	static
				2484	int count(PyUnicodeObject *self,
				2485	int start,
				2486	int end,
				2487	PyUnicodeObject *substring)
				2488	{
				2489	int count = 0;
				2490
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2491	if (start < 0)
				2492	start += self->length;
				2493	if (start < 0)
				2494	start = 0;
				2495	if (end > self->length)
				2496	end = self->length;
				2497	if (end < 0)
				2498	end += self->length;
				2499	if (end < 0)
				2500	end = 0;
				2501
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2502	if (substring->length == 0)
				2503	return (end - start + 1);
				2504
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2505	end -= substring->length;
				2506
				2507	while (start <= end)
				2508	if (Py_UNICODE_MATCH(self, start, substring)) {
				2509	count++;
				2510	start += substring->length;
				2511	} else
				2512	start++;
				2513
				2514	return count;
				2515	}
				2516
				2517	int PyUnicode_Count(PyObject *str,
				2518	PyObject *substr,
				2519	int start,
				2520	int end)
				2521	{
				2522	int result;
				2523
				2524	str = PyUnicode_FromObject(str);
				2525	if (str == NULL)
				2526	return -1;
				2527	substr = PyUnicode_FromObject(substr);
				2528	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2529	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2530	return -1;
				2531	}
				2532
				2533	result = count((PyUnicodeObject *)str,
				2534	start, end,
				2535	(PyUnicodeObject *)substr);
				2536
				2537	Py_DECREF(str);
				2538	Py_DECREF(substr);
				2539	return result;
				2540	}
				2541
				2542	static
				2543	int findstring(PyUnicodeObject *self,
				2544	PyUnicodeObject *substring,
				2545	int start,
				2546	int end,
				2547	int direction)
				2548	{
				2549	if (start < 0)
				2550	start += self->length;
				2551	if (start < 0)
				2552	start = 0;
				2553
				2554	if (substring->length == 0)
				2555	return start;
				2556
				2557	if (end > self->length)
				2558	end = self->length;
				2559	if (end < 0)
				2560	end += self->length;
				2561	if (end < 0)
				2562	end = 0;
				2563
				2564	end -= substring->length;
				2565
				2566	if (direction < 0) {
				2567	for (; end >= start; end--)
				2568	if (Py_UNICODE_MATCH(self, end, substring))
				2569	return end;
				2570	} else {
				2571	for (; start <= end; start++)
				2572	if (Py_UNICODE_MATCH(self, start, substring))
				2573	return start;
				2574	}
				2575
				2576	return -1;
				2577	}
				2578
				2579	int PyUnicode_Find(PyObject *str,
				2580	PyObject *substr,
				2581	int start,
				2582	int end,
				2583	int direction)
				2584	{
				2585	int result;
				2586
				2587	str = PyUnicode_FromObject(str);
				2588	if (str == NULL)
				2589	return -1;
				2590	substr = PyUnicode_FromObject(substr);
				2591	if (substr == NULL) {
				2592	Py_DECREF(substr);
				2593	return -1;
				2594	}
				2595
				2596	result = findstring((PyUnicodeObject *)str,
				2597	(PyUnicodeObject *)substr,
				2598	start, end, direction);
				2599	Py_DECREF(str);
				2600	Py_DECREF(substr);
				2601	return result;
				2602	}
				2603
				2604	static
				2605	int tailmatch(PyUnicodeObject *self,
				2606	PyUnicodeObject *substring,
				2607	int start,
				2608	int end,
				2609	int direction)
				2610	{
				2611	if (start < 0)
				2612	start += self->length;
				2613	if (start < 0)
				2614	start = 0;
				2615
				2616	if (substring->length == 0)
				2617	return 1;
				2618
				2619	if (end > self->length)
				2620	end = self->length;
				2621	if (end < 0)
				2622	end += self->length;
				2623	if (end < 0)
				2624	end = 0;
				2625
				2626	end -= substring->length;
				2627	if (end < start)
				2628	return 0;
				2629
				2630	if (direction > 0) {
				2631	if (Py_UNICODE_MATCH(self, end, substring))
				2632	return 1;
				2633	} else {
				2634	if (Py_UNICODE_MATCH(self, start, substring))
				2635	return 1;
				2636	}
				2637
				2638	return 0;
				2639	}
				2640
				2641	int PyUnicode_Tailmatch(PyObject *str,
				2642	PyObject *substr,
				2643	int start,
				2644	int end,
				2645	int direction)
				2646	{
				2647	int result;
				2648
				2649	str = PyUnicode_FromObject(str);
				2650	if (str == NULL)
				2651	return -1;
				2652	substr = PyUnicode_FromObject(substr);
				2653	if (substr == NULL) {
				2654	Py_DECREF(substr);
				2655	return -1;
				2656	}
				2657
				2658	result = tailmatch((PyUnicodeObject *)str,
				2659	(PyUnicodeObject *)substr,
				2660	start, end, direction);
				2661	Py_DECREF(str);
				2662	Py_DECREF(substr);
				2663	return result;
				2664	}
				2665
				2666	static
				2667	const Py_UNICODE findchar(const Py_UNICODE s,
				2668	int size,
				2669	Py_UNICODE ch)
				2670	{
				2671	/* like wcschr, but doesn't stop at NULL characters */
				2672
				2673	while (size-- > 0) {
				2674	if (*s == ch)
				2675	return s;
				2676	s++;
				2677	}
				2678
				2679	return NULL;
				2680	}
				2681
				2682	/* Apply fixfct filter to the Unicode object self and return a
				2683	reference to the modified object */
				2684
				2685	static
				2686	PyObject fixup(PyUnicodeObject self,
				2687	int (fixfct)(PyUnicodeObject s))
				2688	{
				2689
				2690	PyUnicodeObject *u;
				2691
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2692	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2693	if (u == NULL)
				2694	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2695
				2696	Py_UNICODE_COPY(u->str, self->str, self->length);
				2697
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2698	if (!fixfct(u)) {
				2699	/* fixfct should return TRUE if it modified the buffer. If
				2700	FALSE, return a reference to the original buffer instead
				2701	(to save space, not time) */
				2702	Py_INCREF(self);
				2703	Py_DECREF(u);
				2704	return (PyObject*) self;
				2705	}
				2706	return (PyObject*) u;
				2707	}
				2708
				2709	static
				2710	int fixupper(PyUnicodeObject *self)
				2711	{
				2712	int len = self->length;
				2713	Py_UNICODE *s = self->str;
				2714	int status = 0;
				2715
				2716	while (len-- > 0) {
				2717	register Py_UNICODE ch;
				2718
				2719	ch = Py_UNICODE_TOUPPER(*s);
				2720	if (ch != *s) {
				2721	status = 1;
				2722	*s = ch;
				2723	}
				2724	s++;
				2725	}
				2726
				2727	return status;
				2728	}
				2729
				2730	static
				2731	int fixlower(PyUnicodeObject *self)
				2732	{
				2733	int len = self->length;
				2734	Py_UNICODE *s = self->str;
				2735	int status = 0;
				2736
				2737	while (len-- > 0) {
				2738	register Py_UNICODE ch;
				2739
				2740	ch = Py_UNICODE_TOLOWER(*s);
				2741	if (ch != *s) {
				2742	status = 1;
				2743	*s = ch;
				2744	}
				2745	s++;
				2746	}
				2747
				2748	return status;
				2749	}
				2750
				2751	static
				2752	int fixswapcase(PyUnicodeObject *self)
				2753	{
				2754	int len = self->length;
				2755	Py_UNICODE *s = self->str;
				2756	int status = 0;
				2757
				2758	while (len-- > 0) {
				2759	if (Py_UNICODE_ISUPPER(*s)) {
				2760	s = Py_UNICODE_TOLOWER(s);
				2761	status = 1;
				2762	} else if (Py_UNICODE_ISLOWER(*s)) {
				2763	s = Py_UNICODE_TOUPPER(s);
				2764	status = 1;
				2765	}
				2766	s++;
				2767	}
				2768
				2769	return status;
				2770	}
				2771
				2772	static
				2773	int fixcapitalize(PyUnicodeObject *self)
				2774	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2775	int len = self->length;
				2776	Py_UNICODE *s = self->str;
				2777	int status = 0;
				2778
				2779	if (len == 0)
				2780	return 0;
				2781	if (Py_UNICODE_ISLOWER(*s)) {
				2782	s = Py_UNICODE_TOUPPER(s);
				2783	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2784	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	2785	s++;
				2786	while (--len > 0) {
				2787	if (Py_UNICODE_ISUPPER(*s)) {
				2788	s = Py_UNICODE_TOLOWER(s);
				2789	status = 1;
				2790	}
				2791	s++;
				2792	}
				2793	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2794	}
				2795
				2796	static
				2797	int fixtitle(PyUnicodeObject *self)
				2798	{
				2799	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				2800	register Py_UNICODE *e;
				2801	int previous_is_cased;
				2802
				2803	/* Shortcut for single character strings */
				2804	if (PyUnicode_GET_SIZE(self) == 1) {
				2805	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				2806	if (*p != ch) {
				2807	*p = ch;
				2808	return 1;
				2809	}
				2810	else
				2811	return 0;
				2812	}
				2813
				2814	e = p + PyUnicode_GET_SIZE(self);
				2815	previous_is_cased = 0;
				2816	for (; p < e; p++) {
				2817	register const Py_UNICODE ch = *p;
				2818
				2819	if (previous_is_cased)
				2820	*p = Py_UNICODE_TOLOWER(ch);
				2821	else
				2822	*p = Py_UNICODE_TOTITLE(ch);
				2823
				2824	if (Py_UNICODE_ISLOWER(ch) \|\|
				2825	Py_UNICODE_ISUPPER(ch) \|\|
				2826	Py_UNICODE_ISTITLE(ch))
				2827	previous_is_cased = 1;
				2828	else
				2829	previous_is_cased = 0;
				2830	}
				2831	return 1;
				2832	}
				2833
				2834	PyObject PyUnicode_Join(PyObject separator,
				2835	PyObject *seq)
				2836	{
				2837	Py_UNICODE *sep;
				2838	int seplen;
				2839	PyUnicodeObject *res = NULL;
				2840	int reslen = 0;
				2841	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2842	int sz = 100;
				2843	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2844	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2845
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2846	it = PyObject_GetIter(seq);
				2847	if (it == NULL)
				2848	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2849
				2850	if (separator == NULL) {
				2851	Py_UNICODE blank = ' ';
				2852	sep = &blank;
				2853	seplen = 1;
				2854	}
				2855	else {
				2856	separator = PyUnicode_FromObject(separator);
				2857	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2858	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2859	sep = PyUnicode_AS_UNICODE(separator);
				2860	seplen = PyUnicode_GET_SIZE(separator);
				2861	}
				2862
				2863	res = _PyUnicode_New(sz);
				2864	if (res == NULL)
				2865	goto onError;
				2866	p = PyUnicode_AS_UNICODE(res);
				2867	reslen = 0;
				2868
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2869	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2870	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2871	PyObject *item = PyIter_Next(it);
				2872	if (item == NULL) {
				2873	if (PyErr_Occurred())
				2874	goto onError;
				2875	break;
				2876	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2877	if (!PyUnicode_Check(item)) {
				2878	PyObject *v;
				2879	v = PyUnicode_FromObject(item);
				2880	Py_DECREF(item);
				2881	item = v;
				2882	if (item == NULL)
				2883	goto onError;
				2884	}
				2885	itemlen = PyUnicode_GET_SIZE(item);
				2886	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2887	if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2888	goto onError;
				2889	sz *= 2;
				2890	p = PyUnicode_AS_UNICODE(res) + reslen;
				2891	}
				2892	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2893	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2894	p += seplen;
				2895	reslen += seplen;
				2896	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2897	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2898	p += itemlen;
				2899	reslen += itemlen;
				2900	Py_DECREF(item);
				2901	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2902	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2903	goto onError;
				2904
				2905	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2906	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2907	return (PyObject *)res;
				2908
				2909	onError:
				2910	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	2911	Py_XDECREF(res);
				2912	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2913	return NULL;
				2914	}
				2915
				2916	static
				2917	PyUnicodeObject pad(PyUnicodeObject self,
				2918	int left,
				2919	int right,
				2920	Py_UNICODE fill)
				2921	{
				2922	PyUnicodeObject *u;
				2923
				2924	if (left < 0)
				2925	left = 0;
				2926	if (right < 0)
				2927	right = 0;
				2928
				2929	if (left == 0 && right == 0) {
				2930	Py_INCREF(self);
				2931	return self;
				2932	}
				2933
				2934	u = _PyUnicode_New(left + self->length + right);
				2935	if (u) {
				2936	if (left)
				2937	Py_UNICODE_FILL(u->str, fill, left);
				2938	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				2939	if (right)
				2940	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				2941	}
				2942
				2943	return u;
				2944	}
				2945
				2946	#define SPLIT_APPEND(data, left, right) \
				2947	str = PyUnicode_FromUnicode(data + left, right - left); \
				2948	if (!str) \
				2949	goto onError; \
				2950	if (PyList_Append(list, str)) { \
				2951	Py_DECREF(str); \
				2952	goto onError; \
				2953	} \
				2954	else \
				2955	Py_DECREF(str);
				2956
				2957	static
				2958	PyObject split_whitespace(PyUnicodeObject self,
				2959	PyObject *list,
				2960	int maxcount)
				2961	{
				2962	register int i;
				2963	register int j;
				2964	int len = self->length;
				2965	PyObject *str;
				2966
				2967	for (i = j = 0; i < len; ) {
				2968	/* find a token */
				2969	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2970	i++;
				2971	j = i;
				2972	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				2973	i++;
				2974	if (j < i) {
				2975	if (maxcount-- <= 0)
				2976	break;
				2977	SPLIT_APPEND(self->str, j, i);
				2978	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				2979	i++;
				2980	j = i;
				2981	}
				2982	}
				2983	if (j < len) {
				2984	SPLIT_APPEND(self->str, j, len);
				2985	}
				2986	return list;
				2987
				2988	onError:
				2989	Py_DECREF(list);
				2990	return NULL;
				2991	}
				2992
				2993	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	2994	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2995	{
				2996	register int i;
				2997	register int j;
				2998	int len;
				2999	PyObject *list;
				3000	PyObject *str;
				3001	Py_UNICODE *data;
				3002
				3003	string = PyUnicode_FromObject(string);
				3004	if (string == NULL)
				3005	return NULL;
				3006	data = PyUnicode_AS_UNICODE(string);
				3007	len = PyUnicode_GET_SIZE(string);
				3008
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3009	list = PyList_New(0);
				3010	if (!list)
				3011	goto onError;
				3012
				3013	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3014	int eol;
				3015
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3016	/* Find a line and append it */
				3017	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3018	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3019
				3020	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3021	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3022	if (i < len) {
				3023	if (data[i] == '\r' && i + 1 < len &&
				3024	data[i+1] == '\n')
				3025	i += 2;
				3026	else
				3027	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3028	if (keepends)
				3029	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3030	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3031	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3032	j = i;
				3033	}
				3034	if (j < len) {
				3035	SPLIT_APPEND(data, j, len);
				3036	}
				3037
				3038	Py_DECREF(string);
				3039	return list;
				3040
				3041	onError:
				3042	Py_DECREF(list);
				3043	Py_DECREF(string);
				3044	return NULL;
				3045	}
				3046
				3047	static
				3048	PyObject split_char(PyUnicodeObject self,
				3049	PyObject *list,
				3050	Py_UNICODE ch,
				3051	int maxcount)
				3052	{
				3053	register int i;
				3054	register int j;
				3055	int len = self->length;
				3056	PyObject *str;
				3057
				3058	for (i = j = 0; i < len; ) {
				3059	if (self->str[i] == ch) {
				3060	if (maxcount-- <= 0)
				3061	break;
				3062	SPLIT_APPEND(self->str, j, i);
				3063	i = j = i + 1;
				3064	} else
				3065	i++;
				3066	}
				3067	if (j <= len) {
				3068	SPLIT_APPEND(self->str, j, len);
				3069	}
				3070	return list;
				3071
				3072	onError:
				3073	Py_DECREF(list);
				3074	return NULL;
				3075	}
				3076
				3077	static
				3078	PyObject split_substring(PyUnicodeObject self,
				3079	PyObject *list,
				3080	PyUnicodeObject *substring,
				3081	int maxcount)
				3082	{
				3083	register int i;
				3084	register int j;
				3085	int len = self->length;
				3086	int sublen = substring->length;
				3087	PyObject *str;
				3088
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3089	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3090	if (Py_UNICODE_MATCH(self, i, substring)) {
				3091	if (maxcount-- <= 0)
				3092	break;
				3093	SPLIT_APPEND(self->str, j, i);
				3094	i = j = i + sublen;
				3095	} else
				3096	i++;
				3097	}
				3098	if (j <= len) {
				3099	SPLIT_APPEND(self->str, j, len);
				3100	}
				3101	return list;
				3102
				3103	onError:
				3104	Py_DECREF(list);
				3105	return NULL;
				3106	}
				3107
				3108	#undef SPLIT_APPEND
				3109
				3110	static
				3111	PyObject split(PyUnicodeObject self,
				3112	PyUnicodeObject *substring,
				3113	int maxcount)
				3114	{
				3115	PyObject *list;
				3116
				3117	if (maxcount < 0)
				3118	maxcount = INT_MAX;
				3119
				3120	list = PyList_New(0);
				3121	if (!list)
				3122	return NULL;
				3123
				3124	if (substring == NULL)
				3125	return split_whitespace(self,list,maxcount);
				3126
				3127	else if (substring->length == 1)
				3128	return split_char(self,list,substring->str[0],maxcount);
				3129
				3130	else if (substring->length == 0) {
				3131	Py_DECREF(list);
				3132	PyErr_SetString(PyExc_ValueError, "empty separator");
				3133	return NULL;
				3134	}
				3135	else
				3136	return split_substring(self,list,substring,maxcount);
				3137	}
				3138
				3139	static
				3140	PyObject strip(PyUnicodeObject self,
				3141	int left,
				3142	int right)
				3143	{
				3144	Py_UNICODE *p = self->str;
				3145	int start = 0;
				3146	int end = self->length;
				3147
				3148	if (left)
				3149	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3150	start++;
				3151
				3152	if (right)
				3153	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3154	end--;
				3155
				3156	if (start == 0 && end == self->length) {
				3157	/* couldn't strip anything off, return original string */
				3158	Py_INCREF(self);
				3159	return (PyObject*) self;
				3160	}
				3161
				3162	return (PyObject*) PyUnicode_FromUnicode(
				3163	self->str + start,
				3164	end - start
				3165	);
				3166	}
				3167
				3168	static
				3169	PyObject replace(PyUnicodeObject self,
				3170	PyUnicodeObject *str1,
				3171	PyUnicodeObject *str2,
				3172	int maxcount)
				3173	{
				3174	PyUnicodeObject *u;
				3175
				3176	if (maxcount < 0)
				3177	maxcount = INT_MAX;
				3178
				3179	if (str1->length == 1 && str2->length == 1) {
				3180	int i;
				3181
				3182	/* replace characters */
				3183	if (!findchar(self->str, self->length, str1->str[0])) {
				3184	/* nothing to replace, return original string */
				3185	Py_INCREF(self);
				3186	u = self;
				3187	} else {
				3188	Py_UNICODE u1 = str1->str[0];
				3189	Py_UNICODE u2 = str2->str[0];
				3190
				3191	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3192	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3193	self->length
				3194	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3195	if (u != NULL) {
				3196	Py_UNICODE_COPY(u->str, self->str,
				3197	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3198	for (i = 0; i < u->length; i++)
				3199	if (u->str[i] == u1) {
				3200	if (--maxcount < 0)
				3201	break;
				3202	u->str[i] = u2;
				3203	}
				3204	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3205	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3206
				3207	} else {
				3208	int n, i;
				3209	Py_UNICODE *p;
				3210
				3211	/* replace strings */
				3212	n = count(self, 0, self->length, str1);
				3213	if (n > maxcount)
				3214	n = maxcount;
				3215	if (n == 0) {
				3216	/* nothing to replace, return original string */
				3217	Py_INCREF(self);
				3218	u = self;
				3219	} else {
				3220	u = _PyUnicode_New(
				3221	self->length + n * (str2->length - str1->length));
				3222	if (u) {
				3223	i = 0;
				3224	p = u->str;
				3225	while (i <= self->length - str1->length)
				3226	if (Py_UNICODE_MATCH(self, i, str1)) {
				3227	/* replace string segment */
				3228	Py_UNICODE_COPY(p, str2->str, str2->length);
				3229	p += str2->length;
				3230	i += str1->length;
				3231	if (--n <= 0) {
				3232	/* copy remaining part */
				3233	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3234	break;
				3235	}
				3236	} else
				3237	*p++ = self->str[i++];
				3238	}
				3239	}
				3240	}
				3241
				3242	return (PyObject *) u;
				3243	}
				3244
				3245	/* --- Unicode Object Methods --------------------------------------------- */
				3246
				3247	static char title__doc__[] =
				3248	"S.title() -> unicode\n\
				3249	\n\
				3250	Return a titlecased version of S, i.e. words start with title case\n\
				3251	characters, all remaining cased characters have lower case.";
				3252
				3253	static PyObject*
				3254	unicode_title(PyUnicodeObject self, PyObject args)
				3255	{
				3256	if (!PyArg_NoArgs(args))
				3257	return NULL;
				3258	return fixup(self, fixtitle);
				3259	}
				3260
				3261	static char capitalize__doc__[] =
				3262	"S.capitalize() -> unicode\n\
				3263	\n\
				3264	Return a capitalized version of S, i.e. make the first character\n\
				3265	have upper case.";
				3266
				3267	static PyObject*
				3268	unicode_capitalize(PyUnicodeObject self, PyObject args)
				3269	{
				3270	if (!PyArg_NoArgs(args))
				3271	return NULL;
				3272	return fixup(self, fixcapitalize);
				3273	}
				3274
				3275	#if 0
				3276	static char capwords__doc__[] =
				3277	"S.capwords() -> unicode\n\
				3278	\n\
				3279	Apply .capitalize() to all words in S and return the result with\n\
				3280	normalized whitespace (all whitespace strings are replaced by ' ').";
				3281
				3282	static PyObject*
				3283	unicode_capwords(PyUnicodeObject self, PyObject args)
				3284	{
				3285	PyObject *list;
				3286	PyObject *item;
				3287	int i;
				3288
				3289	if (!PyArg_NoArgs(args))
				3290	return NULL;
				3291
				3292	/* Split into words */
				3293	list = split(self, NULL, -1);
				3294	if (!list)
				3295	return NULL;
				3296
				3297	/* Capitalize each word */
				3298	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3299	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3300	fixcapitalize);
				3301	if (item == NULL)
				3302	goto onError;
				3303	Py_DECREF(PyList_GET_ITEM(list, i));
				3304	PyList_SET_ITEM(list, i, item);
				3305	}
				3306
				3307	/* Join the words to form a new string */
				3308	item = PyUnicode_Join(NULL, list);
				3309
				3310	onError:
				3311	Py_DECREF(list);
				3312	return (PyObject *)item;
				3313	}
				3314	#endif
				3315
				3316	static char center__doc__[] =
				3317	"S.center(width) -> unicode\n\
				3318	\n\
				3319	Return S centered in a Unicode string of length width. Padding is done\n\
				3320	using spaces.";
				3321
				3322	static PyObject *
				3323	unicode_center(PyUnicodeObject self, PyObject args)
				3324	{
				3325	int marg, left;
				3326	int width;
				3327
				3328	if (!PyArg_ParseTuple(args, "i:center", &width))
				3329	return NULL;
				3330
				3331	if (self->length >= width) {
				3332	Py_INCREF(self);
				3333	return (PyObject*) self;
				3334	}
				3335
				3336	marg = width - self->length;
				3337	left = marg / 2 + (marg & width & 1);
				3338
				3339	return (PyObject*) pad(self, left, marg - left, ' ');
				3340	}
				3341
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3342	#if 0
				3343
				3344	/* This code should go into some future Unicode collation support
				3345	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3346	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3347
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3348	/* speedy UTF-16 code point order comparison */
				3349	/* gleaned from: */
				3350	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3351
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3352	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3353	{
				3354	0, 0, 0, 0, 0, 0, 0, 0,
				3355	0, 0, 0, 0, 0, 0, 0, 0,
				3356	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3357	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3358	};
				3359
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3360	static int
				3361	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3362	{
				3363	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3364
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3365	Py_UNICODE *s1 = str1->str;
				3366	Py_UNICODE *s2 = str2->str;
				3367
				3368	len1 = str1->length;
				3369	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3370
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3371	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3372	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3373
				3374	c1 = *s1++;
				3375	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3376
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3377	if (c1 > (1<<11) * 26)
				3378	c1 += utf16Fixup[c1>>11];
				3379	if (c2 > (1<<11) * 26)
				3380	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3381	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3382
				3383	if (c1 != c2)
				3384	return (c1 < c2) ? -1 : 1;
				3385
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3386	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3387	}
				3388
				3389	return (len1 < len2) ? -1 : (len1 != len2);
				3390	}
				3391
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3392	#else
				3393
				3394	static int
				3395	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3396	{
				3397	register int len1, len2;
				3398
				3399	Py_UNICODE *s1 = str1->str;
				3400	Py_UNICODE *s2 = str2->str;
				3401
				3402	len1 = str1->length;
				3403	len2 = str2->length;
				3404
				3405	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3406	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3407
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3408	c1 = *s1++;
				3409	c2 = *s2++;
				3410
				3411	if (c1 != c2)
				3412	return (c1 < c2) ? -1 : 1;
				3413
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3414	len1--; len2--;
				3415	}
				3416
				3417	return (len1 < len2) ? -1 : (len1 != len2);
				3418	}
				3419
				3420	#endif
				3421
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3422	int PyUnicode_Compare(PyObject *left,
				3423	PyObject *right)
				3424	{
				3425	PyUnicodeObject u = NULL, v = NULL;
				3426	int result;
				3427
				3428	/* Coerce the two arguments */
				3429	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3430	if (u == NULL)
				3431	goto onError;
				3432	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3433	if (v == NULL)
				3434	goto onError;
				3435
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3436	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3437	if (v == u) {
				3438	Py_DECREF(u);
				3439	Py_DECREF(v);
				3440	return 0;
				3441	}
				3442
				3443	result = unicode_compare(u, v);
				3444
				3445	Py_DECREF(u);
				3446	Py_DECREF(v);
				3447	return result;
				3448
				3449	onError:
				3450	Py_XDECREF(u);
				3451	Py_XDECREF(v);
				3452	return -1;
				3453	}
				3454
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3455	int PyUnicode_Contains(PyObject *container,
				3456	PyObject *element)
				3457	{
				3458	PyUnicodeObject u = NULL, v = NULL;
				3459	int result;
				3460	register const Py_UNICODE p, e;
				3461	register Py_UNICODE ch;
				3462
				3463	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3464	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3465	if (v == NULL) {
				3466	PyErr_SetString(PyExc_TypeError,
				3467	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3468	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3469	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3470	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3471	if (u == NULL) {
				3472	Py_DECREF(v);
				3473	goto onError;
				3474	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3475
				3476	/* Check v in u */
				3477	if (PyUnicode_GET_SIZE(v) != 1) {
				3478	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3479	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3480	goto onError;
				3481	}
				3482	ch = *PyUnicode_AS_UNICODE(v);
				3483	p = PyUnicode_AS_UNICODE(u);
				3484	e = p + PyUnicode_GET_SIZE(u);
				3485	result = 0;
				3486	while (p < e) {
				3487	if (*p++ == ch) {
				3488	result = 1;
				3489	break;
				3490	}
				3491	}
				3492
				3493	Py_DECREF(u);
				3494	Py_DECREF(v);
				3495	return result;
				3496
				3497	onError:
				3498	Py_XDECREF(u);
				3499	Py_XDECREF(v);
				3500	return -1;
				3501	}
				3502
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3503	/* Concat to string or Unicode object giving a new Unicode object. */
				3504
				3505	PyObject PyUnicode_Concat(PyObject left,
				3506	PyObject *right)
				3507	{
				3508	PyUnicodeObject u = NULL, v = NULL, *w;
				3509
				3510	/* Coerce the two arguments */
				3511	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3512	if (u == NULL)
				3513	goto onError;
				3514	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3515	if (v == NULL)
				3516	goto onError;
				3517
				3518	/* Shortcuts */
				3519	if (v == unicode_empty) {
				3520	Py_DECREF(v);
				3521	return (PyObject *)u;
				3522	}
				3523	if (u == unicode_empty) {
				3524	Py_DECREF(u);
				3525	return (PyObject *)v;
				3526	}
				3527
				3528	/* Concat the two Unicode strings */
				3529	w = _PyUnicode_New(u->length + v->length);
				3530	if (w == NULL)
				3531	goto onError;
				3532	Py_UNICODE_COPY(w->str, u->str, u->length);
				3533	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3534
				3535	Py_DECREF(u);
				3536	Py_DECREF(v);
				3537	return (PyObject *)w;
				3538
				3539	onError:
				3540	Py_XDECREF(u);
				3541	Py_XDECREF(v);
				3542	return NULL;
				3543	}
				3544
				3545	static char count__doc__[] =
				3546	"S.count(sub[, start[, end]]) -> int\n\
				3547	\n\
				3548	Return the number of occurrences of substring sub in Unicode string\n\
				3549	S[start:end]. Optional arguments start and end are\n\
				3550	interpreted as in slice notation.";
				3551
				3552	static PyObject *
				3553	unicode_count(PyUnicodeObject self, PyObject args)
				3554	{
				3555	PyUnicodeObject *substring;
				3556	int start = 0;
				3557	int end = INT_MAX;
				3558	PyObject *result;
				3559
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3560	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3561	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3562	return NULL;
				3563
				3564	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3565	(PyObject *)substring);
				3566	if (substring == NULL)
				3567	return NULL;
				3568
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3569	if (start < 0)
				3570	start += self->length;
				3571	if (start < 0)
				3572	start = 0;
				3573	if (end > self->length)
				3574	end = self->length;
				3575	if (end < 0)
				3576	end += self->length;
				3577	if (end < 0)
				3578	end = 0;
				3579
				3580	result = PyInt_FromLong((long) count(self, start, end, substring));
				3581
				3582	Py_DECREF(substring);
				3583	return result;
				3584	}
				3585
				3586	static char encode__doc__[] =
				3587	"S.encode([encoding[,errors]]) -> string\n\
				3588	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3589	Return an encoded string version of S. Default encoding is the current\n\
				3590	default string encoding. errors may be given to set a different error\n\
				3591	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3592	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3593
				3594	static PyObject *
				3595	unicode_encode(PyUnicodeObject self, PyObject args)
				3596	{
				3597	char *encoding = NULL;
				3598	char *errors = NULL;
				3599	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3600	return NULL;
				3601	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3602	}
				3603
				3604	static char expandtabs__doc__[] =
				3605	"S.expandtabs([tabsize]) -> unicode\n\
				3606	\n\
				3607	Return a copy of S where all tab characters are expanded using spaces.\n\
				3608	If tabsize is not given, a tab size of 8 characters is assumed.";
				3609
				3610	static PyObject*
				3611	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3612	{
				3613	Py_UNICODE *e;
				3614	Py_UNICODE *p;
				3615	Py_UNICODE *q;
				3616	int i, j;
				3617	PyUnicodeObject *u;
				3618	int tabsize = 8;
				3619
				3620	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3621	return NULL;
				3622
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3623	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3624	i = j = 0;
				3625	e = self->str + self->length;
				3626	for (p = self->str; p < e; p++)
				3627	if (*p == '\t') {
				3628	if (tabsize > 0)
				3629	j += tabsize - (j % tabsize);
				3630	}
				3631	else {
				3632	j++;
				3633	if (p == '\n' \|\| p == '\r') {
				3634	i += j;
				3635	j = 0;
				3636	}
				3637	}
				3638
				3639	/* Second pass: create output string and fill it */
				3640	u = _PyUnicode_New(i + j);
				3641	if (!u)
				3642	return NULL;
				3643
				3644	j = 0;
				3645	q = u->str;
				3646
				3647	for (p = self->str; p < e; p++)
				3648	if (*p == '\t') {
				3649	if (tabsize > 0) {
				3650	i = tabsize - (j % tabsize);
				3651	j += i;
				3652	while (i--)
				3653	*q++ = ' ';
				3654	}
				3655	}
				3656	else {
				3657	j++;
				3658	q++ = p;
				3659	if (p == '\n' \|\| p == '\r')
				3660	j = 0;
				3661	}
				3662
				3663	return (PyObject*) u;
				3664	}
				3665
				3666	static char find__doc__[] =
				3667	"S.find(sub [,start [,end]]) -> int\n\
				3668	\n\
				3669	Return the lowest index in S where substring sub is found,\n\
				3670	such that sub is contained within s[start,end]. Optional\n\
				3671	arguments start and end are interpreted as in slice notation.\n\
				3672	\n\
				3673	Return -1 on failure.";
				3674
				3675	static PyObject *
				3676	unicode_find(PyUnicodeObject self, PyObject args)
				3677	{
				3678	PyUnicodeObject *substring;
				3679	int start = 0;
				3680	int end = INT_MAX;
				3681	PyObject *result;
				3682
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3683	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				3684	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3685	return NULL;
				3686	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3687	(PyObject *)substring);
				3688	if (substring == NULL)
				3689	return NULL;
				3690
				3691	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				3692
				3693	Py_DECREF(substring);
				3694	return result;
				3695	}
				3696
				3697	static PyObject *
				3698	unicode_getitem(PyUnicodeObject *self, int index)
				3699	{
				3700	if (index < 0 \|\| index >= self->length) {
				3701	PyErr_SetString(PyExc_IndexError, "string index out of range");
				3702	return NULL;
				3703	}
				3704
				3705	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				3706	}
				3707
				3708	static long
				3709	unicode_hash(PyUnicodeObject *self)
				3710	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3711	/* Since Unicode objects compare equal to their ASCII string
				3712	counterparts, they should use the individual character values
				3713	as basis for their hash value. This is needed to assure that
				3714	strings and Unicode objects behave in the same way as
				3715	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3716
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3717	register int len;
				3718	register Py_UNICODE *p;
				3719	register long x;
				3720
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3721	if (self->hash != -1)
				3722	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	3723	len = PyUnicode_GET_SIZE(self);
				3724	p = PyUnicode_AS_UNICODE(self);
				3725	x = *p << 7;
				3726	while (--len >= 0)
				3727	x = (1000003x) ^ p++;
				3728	x ^= PyUnicode_GET_SIZE(self);
				3729	if (x == -1)
				3730	x = -2;
				3731	self->hash = x;
				3732	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3733	}
				3734
				3735	static char index__doc__[] =
				3736	"S.index(sub [,start [,end]]) -> int\n\
				3737	\n\
				3738	Like S.find() but raise ValueError when the substring is not found.";
				3739
				3740	static PyObject *
				3741	unicode_index(PyUnicodeObject self, PyObject args)
				3742	{
				3743	int result;
				3744	PyUnicodeObject *substring;
				3745	int start = 0;
				3746	int end = INT_MAX;
				3747
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3748	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				3749	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3750	return NULL;
				3751
				3752	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3753	(PyObject *)substring);
				3754	if (substring == NULL)
				3755	return NULL;
				3756
				3757	result = findstring(self, substring, start, end, 1);
				3758
				3759	Py_DECREF(substring);
				3760	if (result < 0) {
				3761	PyErr_SetString(PyExc_ValueError, "substring not found");
				3762	return NULL;
				3763	}
				3764	return PyInt_FromLong(result);
				3765	}
				3766
				3767	static char islower__doc__[] =
				3768	"S.islower() -> int\n\
				3769	\n\
				3770	Return 1 if all cased characters in S are lowercase and there is\n\
				3771	at least one cased character in S, 0 otherwise.";
				3772
				3773	static PyObject*
				3774	unicode_islower(PyUnicodeObject self, PyObject args)
				3775	{
				3776	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3777	register const Py_UNICODE *e;
				3778	int cased;
				3779
				3780	if (!PyArg_NoArgs(args))
				3781	return NULL;
				3782
				3783	/* Shortcut for single character strings */
				3784	if (PyUnicode_GET_SIZE(self) == 1)
				3785	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				3786
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3787	/* Special case for empty strings */
				3788	if (PyString_GET_SIZE(self) == 0)
				3789	return PyInt_FromLong(0);
				3790
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3791	e = p + PyUnicode_GET_SIZE(self);
				3792	cased = 0;
				3793	for (; p < e; p++) {
				3794	register const Py_UNICODE ch = *p;
				3795
				3796	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3797	return PyInt_FromLong(0);
				3798	else if (!cased && Py_UNICODE_ISLOWER(ch))
				3799	cased = 1;
				3800	}
				3801	return PyInt_FromLong(cased);
				3802	}
				3803
				3804	static char isupper__doc__[] =
				3805	"S.isupper() -> int\n\
				3806	\n\
				3807	Return 1 if all cased characters in S are uppercase and there is\n\
				3808	at least one cased character in S, 0 otherwise.";
				3809
				3810	static PyObject*
				3811	unicode_isupper(PyUnicodeObject self, PyObject args)
				3812	{
				3813	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3814	register const Py_UNICODE *e;
				3815	int cased;
				3816
				3817	if (!PyArg_NoArgs(args))
				3818	return NULL;
				3819
				3820	/* Shortcut for single character strings */
				3821	if (PyUnicode_GET_SIZE(self) == 1)
				3822	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				3823
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3824	/* Special case for empty strings */
				3825	if (PyString_GET_SIZE(self) == 0)
				3826	return PyInt_FromLong(0);
				3827
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3828	e = p + PyUnicode_GET_SIZE(self);
				3829	cased = 0;
				3830	for (; p < e; p++) {
				3831	register const Py_UNICODE ch = *p;
				3832
				3833	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				3834	return PyInt_FromLong(0);
				3835	else if (!cased && Py_UNICODE_ISUPPER(ch))
				3836	cased = 1;
				3837	}
				3838	return PyInt_FromLong(cased);
				3839	}
				3840
				3841	static char istitle__doc__[] =
				3842	"S.istitle() -> int\n\
				3843	\n\
				3844	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				3845	may only follow uncased characters and lowercase characters only cased\n\
				3846	ones. Return 0 otherwise.";
				3847
				3848	static PyObject*
				3849	unicode_istitle(PyUnicodeObject self, PyObject args)
				3850	{
				3851	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3852	register const Py_UNICODE *e;
				3853	int cased, previous_is_cased;
				3854
				3855	if (!PyArg_NoArgs(args))
				3856	return NULL;
				3857
				3858	/* Shortcut for single character strings */
				3859	if (PyUnicode_GET_SIZE(self) == 1)
				3860	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				3861	(Py_UNICODE_ISUPPER(*p) != 0));
				3862
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3863	/* Special case for empty strings */
				3864	if (PyString_GET_SIZE(self) == 0)
				3865	return PyInt_FromLong(0);
				3866
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3867	e = p + PyUnicode_GET_SIZE(self);
				3868	cased = 0;
				3869	previous_is_cased = 0;
				3870	for (; p < e; p++) {
				3871	register const Py_UNICODE ch = *p;
				3872
				3873	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				3874	if (previous_is_cased)
				3875	return PyInt_FromLong(0);
				3876	previous_is_cased = 1;
				3877	cased = 1;
				3878	}
				3879	else if (Py_UNICODE_ISLOWER(ch)) {
				3880	if (!previous_is_cased)
				3881	return PyInt_FromLong(0);
				3882	previous_is_cased = 1;
				3883	cased = 1;
				3884	}
				3885	else
				3886	previous_is_cased = 0;
				3887	}
				3888	return PyInt_FromLong(cased);
				3889	}
				3890
				3891	static char isspace__doc__[] =
				3892	"S.isspace() -> int\n\
				3893	\n\
				3894	Return 1 if there are only whitespace characters in S,\n\
				3895	0 otherwise.";
				3896
				3897	static PyObject*
				3898	unicode_isspace(PyUnicodeObject self, PyObject args)
				3899	{
				3900	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3901	register const Py_UNICODE *e;
				3902
				3903	if (!PyArg_NoArgs(args))
				3904	return NULL;
				3905
				3906	/* Shortcut for single character strings */
				3907	if (PyUnicode_GET_SIZE(self) == 1 &&
				3908	Py_UNICODE_ISSPACE(*p))
				3909	return PyInt_FromLong(1);
				3910
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	3911	/* Special case for empty strings */
				3912	if (PyString_GET_SIZE(self) == 0)
				3913	return PyInt_FromLong(0);
				3914
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3915	e = p + PyUnicode_GET_SIZE(self);
				3916	for (; p < e; p++) {
				3917	if (!Py_UNICODE_ISSPACE(*p))
				3918	return PyInt_FromLong(0);
				3919	}
				3920	return PyInt_FromLong(1);
				3921	}
				3922
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	3923	static char isalpha__doc__[] =
				3924	"S.isalpha() -> int\n\
				3925	\n\
				3926	Return 1 if all characters in S are alphabetic\n\
				3927	and there is at least one character in S, 0 otherwise.";
				3928
				3929	static PyObject*
				3930	unicode_isalpha(PyUnicodeObject self, PyObject args)
				3931	{
				3932	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3933	register const Py_UNICODE *e;
				3934
				3935	if (!PyArg_NoArgs(args))
				3936	return NULL;
				3937
				3938	/* Shortcut for single character strings */
				3939	if (PyUnicode_GET_SIZE(self) == 1 &&
				3940	Py_UNICODE_ISALPHA(*p))
				3941	return PyInt_FromLong(1);
				3942
				3943	/* Special case for empty strings */
				3944	if (PyString_GET_SIZE(self) == 0)
				3945	return PyInt_FromLong(0);
				3946
				3947	e = p + PyUnicode_GET_SIZE(self);
				3948	for (; p < e; p++) {
				3949	if (!Py_UNICODE_ISALPHA(*p))
				3950	return PyInt_FromLong(0);
				3951	}
				3952	return PyInt_FromLong(1);
				3953	}
				3954
				3955	static char isalnum__doc__[] =
				3956	"S.isalnum() -> int\n\
				3957	\n\
				3958	Return 1 if all characters in S are alphanumeric\n\
				3959	and there is at least one character in S, 0 otherwise.";
				3960
				3961	static PyObject*
				3962	unicode_isalnum(PyUnicodeObject self, PyObject args)
				3963	{
				3964	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3965	register const Py_UNICODE *e;
				3966
				3967	if (!PyArg_NoArgs(args))
				3968	return NULL;
				3969
				3970	/* Shortcut for single character strings */
				3971	if (PyUnicode_GET_SIZE(self) == 1 &&
				3972	Py_UNICODE_ISALNUM(*p))
				3973	return PyInt_FromLong(1);
				3974
				3975	/* Special case for empty strings */
				3976	if (PyString_GET_SIZE(self) == 0)
				3977	return PyInt_FromLong(0);
				3978
				3979	e = p + PyUnicode_GET_SIZE(self);
				3980	for (; p < e; p++) {
				3981	if (!Py_UNICODE_ISALNUM(*p))
				3982	return PyInt_FromLong(0);
				3983	}
				3984	return PyInt_FromLong(1);
				3985	}
				3986
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3987	static char isdecimal__doc__[] =
				3988	"S.isdecimal() -> int\n\
				3989	\n\
				3990	Return 1 if there are only decimal characters in S,\n\
				3991	0 otherwise.";
				3992
				3993	static PyObject*
				3994	unicode_isdecimal(PyUnicodeObject self, PyObject args)
				3995	{
				3996	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3997	register const Py_UNICODE *e;
				3998
				3999	if (!PyArg_NoArgs(args))
				4000	return NULL;
				4001
				4002	/* Shortcut for single character strings */
				4003	if (PyUnicode_GET_SIZE(self) == 1 &&
				4004	Py_UNICODE_ISDECIMAL(*p))
				4005	return PyInt_FromLong(1);
				4006
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4007	/* Special case for empty strings */
				4008	if (PyString_GET_SIZE(self) == 0)
				4009	return PyInt_FromLong(0);
				4010
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4011	e = p + PyUnicode_GET_SIZE(self);
				4012	for (; p < e; p++) {
				4013	if (!Py_UNICODE_ISDECIMAL(*p))
				4014	return PyInt_FromLong(0);
				4015	}
				4016	return PyInt_FromLong(1);
				4017	}
				4018
				4019	static char isdigit__doc__[] =
				4020	"S.isdigit() -> int\n\
				4021	\n\
				4022	Return 1 if there are only digit characters in S,\n\
				4023	0 otherwise.";
				4024
				4025	static PyObject*
				4026	unicode_isdigit(PyUnicodeObject self, PyObject args)
				4027	{
				4028	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4029	register const Py_UNICODE *e;
				4030
				4031	if (!PyArg_NoArgs(args))
				4032	return NULL;
				4033
				4034	/* Shortcut for single character strings */
				4035	if (PyUnicode_GET_SIZE(self) == 1 &&
				4036	Py_UNICODE_ISDIGIT(*p))
				4037	return PyInt_FromLong(1);
				4038
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4039	/* Special case for empty strings */
				4040	if (PyString_GET_SIZE(self) == 0)
				4041	return PyInt_FromLong(0);
				4042
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4043	e = p + PyUnicode_GET_SIZE(self);
				4044	for (; p < e; p++) {
				4045	if (!Py_UNICODE_ISDIGIT(*p))
				4046	return PyInt_FromLong(0);
				4047	}
				4048	return PyInt_FromLong(1);
				4049	}
				4050
				4051	static char isnumeric__doc__[] =
				4052	"S.isnumeric() -> int\n\
				4053	\n\
				4054	Return 1 if there are only numeric characters in S,\n\
				4055	0 otherwise.";
				4056
				4057	static PyObject*
				4058	unicode_isnumeric(PyUnicodeObject self, PyObject args)
				4059	{
				4060	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4061	register const Py_UNICODE *e;
				4062
				4063	if (!PyArg_NoArgs(args))
				4064	return NULL;
				4065
				4066	/* Shortcut for single character strings */
				4067	if (PyUnicode_GET_SIZE(self) == 1 &&
				4068	Py_UNICODE_ISNUMERIC(*p))
				4069	return PyInt_FromLong(1);
				4070
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4071	/* Special case for empty strings */
				4072	if (PyString_GET_SIZE(self) == 0)
				4073	return PyInt_FromLong(0);
				4074
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4075	e = p + PyUnicode_GET_SIZE(self);
				4076	for (; p < e; p++) {
				4077	if (!Py_UNICODE_ISNUMERIC(*p))
				4078	return PyInt_FromLong(0);
				4079	}
				4080	return PyInt_FromLong(1);
				4081	}
				4082
				4083	static char join__doc__[] =
				4084	"S.join(sequence) -> unicode\n\
				4085	\n\
				4086	Return a string which is the concatenation of the strings in the\n\
				4087	sequence. The separator between elements is S.";
				4088
				4089	static PyObject*
				4090	unicode_join(PyUnicodeObject self, PyObject args)
				4091	{
				4092	PyObject *data;
				4093	if (!PyArg_ParseTuple(args, "O:join", &data))
				4094	return NULL;
				4095
				4096	return PyUnicode_Join((PyObject *)self, data);
				4097	}
				4098
				4099	static int
				4100	unicode_length(PyUnicodeObject *self)
				4101	{
				4102	return self->length;
				4103	}
				4104
				4105	static char ljust__doc__[] =
				4106	"S.ljust(width) -> unicode\n\
				4107	\n\
				4108	Return S left justified in a Unicode string of length width. Padding is\n\
				4109	done using spaces.";
				4110
				4111	static PyObject *
				4112	unicode_ljust(PyUnicodeObject self, PyObject args)
				4113	{
				4114	int width;
				4115	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4116	return NULL;
				4117
				4118	if (self->length >= width) {
				4119	Py_INCREF(self);
				4120	return (PyObject*) self;
				4121	}
				4122
				4123	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4124	}
				4125
				4126	static char lower__doc__[] =
				4127	"S.lower() -> unicode\n\
				4128	\n\
				4129	Return a copy of the string S converted to lowercase.";
				4130
				4131	static PyObject*
				4132	unicode_lower(PyUnicodeObject self, PyObject args)
				4133	{
				4134	if (!PyArg_NoArgs(args))
				4135	return NULL;
				4136	return fixup(self, fixlower);
				4137	}
				4138
				4139	static char lstrip__doc__[] =
				4140	"S.lstrip() -> unicode\n\
				4141	\n\
				4142	Return a copy of the string S with leading whitespace removed.";
				4143
				4144	static PyObject *
				4145	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4146	{
				4147	if (!PyArg_NoArgs(args))
				4148	return NULL;
				4149	return strip(self, 1, 0);
				4150	}
				4151
				4152	static PyObject*
				4153	unicode_repeat(PyUnicodeObject *str, int len)
				4154	{
				4155	PyUnicodeObject *u;
				4156	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4157	int nchars;
				4158	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4159
				4160	if (len < 0)
				4161	len = 0;
				4162
				4163	if (len == 1) {
				4164	/* no repeat, return original string */
				4165	Py_INCREF(str);
				4166	return (PyObject*) str;
				4167	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4168
				4169	/* ensure # of chars needed doesn't overflow int and # of bytes
				4170	* needed doesn't overflow size_t
				4171	*/
				4172	nchars = len * str->length;
				4173	if (len && nchars / len != str->length) {
				4174	PyErr_SetString(PyExc_OverflowError,
				4175	"repeated string is too long");
				4176	return NULL;
				4177	}
				4178	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4179	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4180	PyErr_SetString(PyExc_OverflowError,
				4181	"repeated string is too long");
				4182	return NULL;
				4183	}
				4184	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4185	if (!u)
				4186	return NULL;
				4187
				4188	p = u->str;
				4189
				4190	while (len-- > 0) {
				4191	Py_UNICODE_COPY(p, str->str, str->length);
				4192	p += str->length;
				4193	}
				4194
				4195	return (PyObject*) u;
				4196	}
				4197
				4198	PyObject PyUnicode_Replace(PyObject obj,
				4199	PyObject *subobj,
				4200	PyObject *replobj,
				4201	int maxcount)
				4202	{
				4203	PyObject *self;
				4204	PyObject *str1;
				4205	PyObject *str2;
				4206	PyObject *result;
				4207
				4208	self = PyUnicode_FromObject(obj);
				4209	if (self == NULL)
				4210	return NULL;
				4211	str1 = PyUnicode_FromObject(subobj);
				4212	if (str1 == NULL) {
				4213	Py_DECREF(self);
				4214	return NULL;
				4215	}
				4216	str2 = PyUnicode_FromObject(replobj);
				4217	if (str2 == NULL) {
				4218	Py_DECREF(self);
				4219	Py_DECREF(str1);
				4220	return NULL;
				4221	}
				4222	result = replace((PyUnicodeObject *)self,
				4223	(PyUnicodeObject *)str1,
				4224	(PyUnicodeObject *)str2,
				4225	maxcount);
				4226	Py_DECREF(self);
				4227	Py_DECREF(str1);
				4228	Py_DECREF(str2);
				4229	return result;
				4230	}
				4231
				4232	static char replace__doc__[] =
				4233	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4234	\n\
				4235	Return a copy of S with all occurrences of substring\n\
				4236	old replaced by new. If the optional argument maxsplit is\n\
				4237	given, only the first maxsplit occurrences are replaced.";
				4238
				4239	static PyObject*
				4240	unicode_replace(PyUnicodeObject self, PyObject args)
				4241	{
				4242	PyUnicodeObject *str1;
				4243	PyUnicodeObject *str2;
				4244	int maxcount = -1;
				4245	PyObject *result;
				4246
				4247	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4248	return NULL;
				4249	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4250	if (str1 == NULL)
				4251	return NULL;
				4252	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4253	if (str2 == NULL)
				4254	return NULL;
				4255
				4256	result = replace(self, str1, str2, maxcount);
				4257
				4258	Py_DECREF(str1);
				4259	Py_DECREF(str2);
				4260	return result;
				4261	}
				4262
				4263	static
				4264	PyObject unicode_repr(PyObject unicode)
				4265	{
				4266	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4267	PyUnicode_GET_SIZE(unicode),
				4268	1);
				4269	}
				4270
				4271	static char rfind__doc__[] =
				4272	"S.rfind(sub [,start [,end]]) -> int\n\
				4273	\n\
				4274	Return the highest index in S where substring sub is found,\n\
				4275	such that sub is contained within s[start,end]. Optional\n\
				4276	arguments start and end are interpreted as in slice notation.\n\
				4277	\n\
				4278	Return -1 on failure.";
				4279
				4280	static PyObject *
				4281	unicode_rfind(PyUnicodeObject self, PyObject args)
				4282	{
				4283	PyUnicodeObject *substring;
				4284	int start = 0;
				4285	int end = INT_MAX;
				4286	PyObject *result;
				4287
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4288	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4289	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4290	return NULL;
				4291	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4292	(PyObject *)substring);
				4293	if (substring == NULL)
				4294	return NULL;
				4295
				4296	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4297
				4298	Py_DECREF(substring);
				4299	return result;
				4300	}
				4301
				4302	static char rindex__doc__[] =
				4303	"S.rindex(sub [,start [,end]]) -> int\n\
				4304	\n\
				4305	Like S.rfind() but raise ValueError when the substring is not found.";
				4306
				4307	static PyObject *
				4308	unicode_rindex(PyUnicodeObject self, PyObject args)
				4309	{
				4310	int result;
				4311	PyUnicodeObject *substring;
				4312	int start = 0;
				4313	int end = INT_MAX;
				4314
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4315	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4316	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4317	return NULL;
				4318	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4319	(PyObject *)substring);
				4320	if (substring == NULL)
				4321	return NULL;
				4322
				4323	result = findstring(self, substring, start, end, -1);
				4324
				4325	Py_DECREF(substring);
				4326	if (result < 0) {
				4327	PyErr_SetString(PyExc_ValueError, "substring not found");
				4328	return NULL;
				4329	}
				4330	return PyInt_FromLong(result);
				4331	}
				4332
				4333	static char rjust__doc__[] =
				4334	"S.rjust(width) -> unicode\n\
				4335	\n\
				4336	Return S right justified in a Unicode string of length width. Padding is\n\
				4337	done using spaces.";
				4338
				4339	static PyObject *
				4340	unicode_rjust(PyUnicodeObject self, PyObject args)
				4341	{
				4342	int width;
				4343	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4344	return NULL;
				4345
				4346	if (self->length >= width) {
				4347	Py_INCREF(self);
				4348	return (PyObject*) self;
				4349	}
				4350
				4351	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4352	}
				4353
				4354	static char rstrip__doc__[] =
				4355	"S.rstrip() -> unicode\n\
				4356	\n\
				4357	Return a copy of the string S with trailing whitespace removed.";
				4358
				4359	static PyObject *
				4360	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4361	{
				4362	if (!PyArg_NoArgs(args))
				4363	return NULL;
				4364	return strip(self, 0, 1);
				4365	}
				4366
				4367	static PyObject*
				4368	unicode_slice(PyUnicodeObject *self, int start, int end)
				4369	{
				4370	/* standard clamping */
				4371	if (start < 0)
				4372	start = 0;
				4373	if (end < 0)
				4374	end = 0;
				4375	if (end > self->length)
				4376	end = self->length;
				4377	if (start == 0 && end == self->length) {
				4378	/* full slice, return original string */
				4379	Py_INCREF(self);
				4380	return (PyObject*) self;
				4381	}
				4382	if (start > end)
				4383	start = end;
				4384	/* copy slice */
				4385	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4386	end - start);
				4387	}
				4388
				4389	PyObject PyUnicode_Split(PyObject s,
				4390	PyObject *sep,
				4391	int maxsplit)
				4392	{
				4393	PyObject *result;
				4394
				4395	s = PyUnicode_FromObject(s);
				4396	if (s == NULL)
				4397	return NULL;
				4398	if (sep != NULL) {
				4399	sep = PyUnicode_FromObject(sep);
				4400	if (sep == NULL) {
				4401	Py_DECREF(s);
				4402	return NULL;
				4403	}
				4404	}
				4405
				4406	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4407
				4408	Py_DECREF(s);
				4409	Py_XDECREF(sep);
				4410	return result;
				4411	}
				4412
				4413	static char split__doc__[] =
				4414	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4415	\n\
				4416	Return a list of the words in S, using sep as the\n\
				4417	delimiter string. If maxsplit is given, at most maxsplit\n\
				4418	splits are done. If sep is not specified, any whitespace string\n\
				4419	is a separator.";
				4420
				4421	static PyObject*
				4422	unicode_split(PyUnicodeObject self, PyObject args)
				4423	{
				4424	PyObject *substring = Py_None;
				4425	int maxcount = -1;
				4426
				4427	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4428	return NULL;
				4429
				4430	if (substring == Py_None)
				4431	return split(self, NULL, maxcount);
				4432	else if (PyUnicode_Check(substring))
				4433	return split(self, (PyUnicodeObject *)substring, maxcount);
				4434	else
				4435	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4436	}
				4437
				4438	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4439	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4440	\n\
				4441	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4442	Line breaks are not included in the resulting list unless keepends\n\
				4443	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4444
				4445	static PyObject*
				4446	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4447	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4448	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4449
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4450	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4451	return NULL;
				4452
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4453	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4454	}
				4455
				4456	static
				4457	PyObject unicode_str(PyUnicodeObject self)
				4458	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4459	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4460	}
				4461
				4462	static char strip__doc__[] =
				4463	"S.strip() -> unicode\n\
				4464	\n\
				4465	Return a copy of S with leading and trailing whitespace removed.";
				4466
				4467	static PyObject *
				4468	unicode_strip(PyUnicodeObject self, PyObject args)
				4469	{
				4470	if (!PyArg_NoArgs(args))
				4471	return NULL;
				4472	return strip(self, 1, 1);
				4473	}
				4474
				4475	static char swapcase__doc__[] =
				4476	"S.swapcase() -> unicode\n\
				4477	\n\
				4478	Return a copy of S with uppercase characters converted to lowercase\n\
				4479	and vice versa.";
				4480
				4481	static PyObject*
				4482	unicode_swapcase(PyUnicodeObject self, PyObject args)
				4483	{
				4484	if (!PyArg_NoArgs(args))
				4485	return NULL;
				4486	return fixup(self, fixswapcase);
				4487	}
				4488
				4489	static char translate__doc__[] =
				4490	"S.translate(table) -> unicode\n\
				4491	\n\
				4492	Return a copy of the string S, where all characters have been mapped\n\
				4493	through the given translation table, which must be a mapping of\n\
				4494	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4495	are left untouched. Characters mapped to None are deleted.";
				4496
				4497	static PyObject*
				4498	unicode_translate(PyUnicodeObject self, PyObject args)
				4499	{
				4500	PyObject *table;
				4501
				4502	if (!PyArg_ParseTuple(args, "O:translate", &table))
				4503	return NULL;
				4504	return PyUnicode_TranslateCharmap(self->str,
				4505	self->length,
				4506	table,
				4507	"ignore");
				4508	}
				4509
				4510	static char upper__doc__[] =
				4511	"S.upper() -> unicode\n\
				4512	\n\
				4513	Return a copy of S converted to uppercase.";
				4514
				4515	static PyObject*
				4516	unicode_upper(PyUnicodeObject self, PyObject args)
				4517	{
				4518	if (!PyArg_NoArgs(args))
				4519	return NULL;
				4520	return fixup(self, fixupper);
				4521	}
				4522
				4523	#if 0
				4524	static char zfill__doc__[] =
				4525	"S.zfill(width) -> unicode\n\
				4526	\n\
				4527	Pad a numeric string x with zeros on the left, to fill a field\n\
				4528	of the specified width. The string x is never truncated.";
				4529
				4530	static PyObject *
				4531	unicode_zfill(PyUnicodeObject self, PyObject args)
				4532	{
				4533	int fill;
				4534	PyUnicodeObject *u;
				4535
				4536	int width;
				4537	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4538	return NULL;
				4539
				4540	if (self->length >= width) {
				4541	Py_INCREF(self);
				4542	return (PyObject*) self;
				4543	}
				4544
				4545	fill = width - self->length;
				4546
				4547	u = pad(self, fill, 0, '0');
				4548
				4549	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4550	/* move sign to beginning of string */
				4551	u->str[0] = u->str[fill];
				4552	u->str[fill] = '0';
				4553	}
				4554
				4555	return (PyObject*) u;
				4556	}
				4557	#endif
				4558
				4559	#if 0
				4560	static PyObject*
				4561	unicode_freelistsize(PyUnicodeObject self, PyObject args)
				4562	{
				4563	if (!PyArg_NoArgs(args))
				4564	return NULL;
				4565	return PyInt_FromLong(unicode_freelist_size);
				4566	}
				4567	#endif
				4568
				4569	static char startswith__doc__[] =
				4570	"S.startswith(prefix[, start[, end]]) -> int\n\
				4571	\n\
				4572	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4573	optional start, test S beginning at that position. With optional end, stop\n\
				4574	comparing S at that position.";
				4575
				4576	static PyObject *
				4577	unicode_startswith(PyUnicodeObject *self,
				4578	PyObject *args)
				4579	{
				4580	PyUnicodeObject *substring;
				4581	int start = 0;
				4582	int end = INT_MAX;
				4583	PyObject *result;
				4584
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4585	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4586	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4587	return NULL;
				4588	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4589	(PyObject *)substring);
				4590	if (substring == NULL)
				4591	return NULL;
				4592
				4593	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4594
				4595	Py_DECREF(substring);
				4596	return result;
				4597	}
				4598
				4599
				4600	static char endswith__doc__[] =
				4601	"S.endswith(suffix[, start[, end]]) -> int\n\
				4602	\n\
				4603	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4604	optional start, test S beginning at that position. With optional end, stop\n\
				4605	comparing S at that position.";
				4606
				4607	static PyObject *
				4608	unicode_endswith(PyUnicodeObject *self,
				4609	PyObject *args)
				4610	{
				4611	PyUnicodeObject *substring;
				4612	int start = 0;
				4613	int end = INT_MAX;
				4614	PyObject *result;
				4615
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4616	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4617	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4618	return NULL;
				4619	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4620	(PyObject *)substring);
				4621	if (substring == NULL)
				4622	return NULL;
				4623
				4624	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4625
				4626	Py_DECREF(substring);
				4627	return result;
				4628	}
				4629
				4630
/* Method table for unicode objects, searched by unicode_getattr() via
   Py_FindMethod().  Each entry is {name, C function, flag, doc string}
   and the table ends with a {NULL, NULL} sentinel.

   For the handlers defined nearby, entries with flag 1 parse their
   arguments with PyArg_ParseTuple() and entries with flag 0 check
   them with PyArg_NoArgs(). */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled entries; unicode_zfill is likewise compiled out. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
				4683
				4684	static PyObject *
				4685	unicode_getattr(PyUnicodeObject self, char name)
				4686	{
				4687	return Py_FindMethod(unicode_methods, (PyObject*) self, name);
				4688	}
				4689
/* Sequence protocol slots for unicode objects.  Item and slice
   assignment are left as 0 (no handler is installed for them). */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
				4700
				4701	static int
				4702	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4703	int index,
				4704	const void **ptr)
				4705	{
				4706	if (index != 0) {
				4707	PyErr_SetString(PyExc_SystemError,
				4708	"accessing non-existent unicode segment");
				4709	return -1;
				4710	}
				4711	ptr = (void ) self->str;
				4712	return PyUnicode_GET_DATA_SIZE(self);
				4713	}
				4714
				4715	static int
				4716	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4717	const void **ptr)
				4718	{
				4719	PyErr_SetString(PyExc_TypeError,
				4720	"cannot use unicode as modifyable buffer");
				4721	return -1;
				4722	}
				4723
				4724	static int
				4725	unicode_buffer_getsegcount(PyUnicodeObject *self,
				4726	int *lenp)
				4727	{
				4728	if (lenp)
				4729	*lenp = PyUnicode_GET_DATA_SIZE(self);
				4730	return 1;
				4731	}
				4732
				4733	static int
				4734	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				4735	int index,
				4736	const void **ptr)
				4737	{
				4738	PyObject *str;
				4739
				4740	if (index != 0) {
				4741	PyErr_SetString(PyExc_SystemError,
				4742	"accessing non-existent unicode segment");
				4743	return -1;
				4744	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	4745	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4746	if (str == NULL)
				4747	return -1;
				4748	ptr = (void ) PyString_AS_STRING(str);
				4749	return PyString_GET_SIZE(str);
				4750	}
				4751
				4752	/* Helpers for PyUnicode_Format() */
				4753
				4754	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	4755	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4756	{
				4757	int argidx = *p_argidx;
				4758	if (argidx < arglen) {
				4759	(*p_argidx)++;
				4760	if (arglen < 0)
				4761	return args;
				4762	else
				4763	return PyTuple_GetItem(args, argidx);
				4764	}
				4765	PyErr_SetString(PyExc_TypeError,
				4766	"not enough arguments for format string");
				4767	return NULL;
				4768	}
				4769
/* Format flag bits, OR-ed into the `flags` variable while
   PyUnicode_Format() scans a format specifier. */
#define F_LJUST (1<<0)		/* set by '-' */
#define F_SIGN (1<<1)		/* set by '+' */
#define F_BLANK (1<<2)		/* set by ' ' */
#define F_ALT (1<<3)		/* set by '#' */
#define F_ZERO (1<<4)		/* set by '0' */
				4775
				4776	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4777	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4778	{
				4779	register int i;
				4780	int len;
				4781	va_list va;
				4782	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4783	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4784
				4785	/* First, format the string as char array, then expand to Py_UNICODE
				4786	array. */
				4787	charbuffer = (char *)buffer;
				4788	len = vsprintf(charbuffer, format, va);
				4789	for (i = len - 1; i >= 0; i--)
				4790	buffer[i] = (Py_UNICODE) charbuffer[i];
				4791
				4792	va_end(va);
				4793	return len;
				4794	}
				4795
				4796	static int
				4797	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4798	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4799	int flags,
				4800	int prec,
				4801	int type,
				4802	PyObject *v)
				4803	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4804	/* fmt = '%#.' + `prec` + `type`
				4805	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4806	char fmt[20];
				4807	double x;
				4808
				4809	x = PyFloat_AsDouble(v);
				4810	if (x == -1.0 && PyErr_Occurred())
				4811	return -1;
				4812	if (prec < 0)
				4813	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4814	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				4815	type = 'g';
				4816	sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4817	/* worst case length calc to ensure no buffer overrun:
				4818	fmt = %#.<prec>g
				4819	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				4820	for any double rep.)
				4821	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				4822	If prec=0 the effective precision is 1 (the leading digit is
				4823	always given), therefore increase by one to 10+prec. */
				4824	if (buflen <= (size_t)10 + (size_t)prec) {
				4825	PyErr_SetString(PyExc_OverflowError,
				4826	"formatted float is too long (precision too long?)");
				4827	return -1;
				4828	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4829	return usprintf(buf, fmt, x);
				4830	}
				4831
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4832	static PyObject*
				4833	formatlong(PyObject *val, int flags, int prec, int type)
				4834	{
				4835	char *buf;
				4836	int i, len;
				4837	PyObject str; / temporary string object. */
				4838	PyUnicodeObject *result;
				4839
				4840	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				4841	if (!str)
				4842	return NULL;
				4843	result = _PyUnicode_New(len);
				4844	for (i = 0; i < len; i++)
				4845	result->str[i] = buf[i];
				4846	result->str[len] = 0;
				4847	Py_DECREF(str);
				4848	return (PyObject*)result;
				4849	}
				4850
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4851	static int
				4852	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4853	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4854	int flags,
				4855	int prec,
				4856	int type,
				4857	PyObject *v)
				4858	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4859	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	4860	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				4861	+ 1 + 1 = 24*/
				4862	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4863	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4864	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4865
				4866	x = PyInt_AsLong(v);
				4867	if (x == -1 && PyErr_Occurred())
				4868	return -1;
				4869	if (prec < 0)
				4870	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4871	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				4872	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				4873	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				4874	PyErr_SetString(PyExc_OverflowError,
				4875	"formatted integer is too long (precision too long?)");
				4876	return -1;
				4877	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4878	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				4879	* but we want it (for consistency with other %#x conversions, and
				4880	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4881	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				4882	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				4883	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	4884	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	4885	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				4886	/* Only way to know what the platform does is to try it. */
				4887	sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
				4888	if (fmt[1] != (char)type) {
				4889	/* Supply our own leading 0x/0X -- needed under std C */
				4890	use_native_c_format = 0;
				4891	sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
				4892	}
				4893	}
				4894	if (use_native_c_format)
				4895	sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4896	return usprintf(buf, fmt, x);
				4897	}
				4898
				4899	static int
				4900	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4901	size_t buflen,
				4902	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4903	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	4904	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4905	if (PyUnicode_Check(v)) {
				4906	if (PyUnicode_GET_SIZE(v) != 1)
				4907	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4908	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4909	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4910
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4911	else if (PyString_Check(v)) {
				4912	if (PyString_GET_SIZE(v) != 1)
				4913	goto onError;
				4914	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				4915	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4916
				4917	else {
				4918	/* Integer input truncated to a character */
				4919	long x;
				4920	x = PyInt_AsLong(v);
				4921	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4922	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4923	buf[0] = (char) x;
				4924	}
				4925	buf[1] = '\0';
				4926	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	4927
				4928	onError:
				4929	PyErr_SetString(PyExc_TypeError,
				4930	"%c requires int or char");
				4931	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4932	}
				4933
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE units, of the local
   buffer (formatbuf in PyUnicode_Format) in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
				4943
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4944	PyObject PyUnicode_Format(PyObject format,
				4945	PyObject *args)
				4946	{
				4947	Py_UNICODE fmt, res;
				4948	int fmtcnt, rescnt, reslen, arglen, argidx;
				4949	int args_owned = 0;
				4950	PyUnicodeObject *result = NULL;
				4951	PyObject *dict = NULL;
				4952	PyObject *uformat;
				4953
				4954	if (format == NULL \|\| args == NULL) {
				4955	PyErr_BadInternalCall();
				4956	return NULL;
				4957	}
				4958	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4959	if (uformat == NULL)
				4960	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4961	fmt = PyUnicode_AS_UNICODE(uformat);
				4962	fmtcnt = PyUnicode_GET_SIZE(uformat);
				4963
				4964	reslen = rescnt = fmtcnt + 100;
				4965	result = _PyUnicode_New(reslen);
				4966	if (result == NULL)
				4967	goto onError;
				4968	res = PyUnicode_AS_UNICODE(result);
				4969
				4970	if (PyTuple_Check(args)) {
				4971	arglen = PyTuple_Size(args);
				4972	argidx = 0;
				4973	}
				4974	else {
				4975	arglen = -1;
				4976	argidx = -2;
				4977	}
				4978	if (args->ob_type->tp_as_mapping)
				4979	dict = args;
				4980
				4981	while (--fmtcnt >= 0) {
				4982	if (*fmt != '%') {
				4983	if (--rescnt < 0) {
				4984	rescnt = fmtcnt + 100;
				4985	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	4986	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4987	return NULL;
				4988	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				4989	--rescnt;
				4990	}
				4991	res++ = fmt++;
				4992	}
				4993	else {
				4994	/* Got a format specifier */
				4995	int flags = 0;
				4996	int width = -1;
				4997	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4998	Py_UNICODE c = '\0';
				4999	Py_UNICODE fill;
				5000	PyObject *v = NULL;
				5001	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5002	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5003	Py_UNICODE sign;
				5004	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5005	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5006
				5007	fmt++;
				5008	if (*fmt == '(') {
				5009	Py_UNICODE *keystart;
				5010	int keylen;
				5011	PyObject *key;
				5012	int pcount = 1;
				5013
				5014	if (dict == NULL) {
				5015	PyErr_SetString(PyExc_TypeError,
				5016	"format requires a mapping");
				5017	goto onError;
				5018	}
				5019	++fmt;
				5020	--fmtcnt;
				5021	keystart = fmt;
				5022	/* Skip over balanced parentheses */
				5023	while (pcount > 0 && --fmtcnt >= 0) {
				5024	if (*fmt == ')')
				5025	--pcount;
				5026	else if (*fmt == '(')
				5027	++pcount;
				5028	fmt++;
				5029	}
				5030	keylen = fmt - keystart - 1;
				5031	if (fmtcnt < 0 \|\| pcount > 0) {
				5032	PyErr_SetString(PyExc_ValueError,
				5033	"incomplete format key");
				5034	goto onError;
				5035	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5036	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5037	then looked up since Python uses strings to hold
				5038	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5039	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5040	key = PyUnicode_EncodeUTF8(keystart,
				5041	keylen,
				5042	NULL);
				5043	if (key == NULL)
				5044	goto onError;
				5045	if (args_owned) {
				5046	Py_DECREF(args);
				5047	args_owned = 0;
				5048	}
				5049	args = PyObject_GetItem(dict, key);
				5050	Py_DECREF(key);
				5051	if (args == NULL) {
				5052	goto onError;
				5053	}
				5054	args_owned = 1;
				5055	arglen = -1;
				5056	argidx = -2;
				5057	}
				5058	while (--fmtcnt >= 0) {
				5059	switch (c = *fmt++) {
				5060	case '-': flags \|= F_LJUST; continue;
				5061	case '+': flags \|= F_SIGN; continue;
				5062	case ' ': flags \|= F_BLANK; continue;
				5063	case '#': flags \|= F_ALT; continue;
				5064	case '0': flags \|= F_ZERO; continue;
				5065	}
				5066	break;
				5067	}
				5068	if (c == '*') {
				5069	v = getnextarg(args, arglen, &argidx);
				5070	if (v == NULL)
				5071	goto onError;
				5072	if (!PyInt_Check(v)) {
				5073	PyErr_SetString(PyExc_TypeError,
				5074	"* wants int");
				5075	goto onError;
				5076	}
				5077	width = PyInt_AsLong(v);
				5078	if (width < 0) {
				5079	flags \|= F_LJUST;
				5080	width = -width;
				5081	}
				5082	if (--fmtcnt >= 0)
				5083	c = *fmt++;
				5084	}
				5085	else if (c >= '0' && c <= '9') {
				5086	width = c - '0';
				5087	while (--fmtcnt >= 0) {
				5088	c = *fmt++;
				5089	if (c < '0' \|\| c > '9')
				5090	break;
				5091	if ((width*10) / 10 != width) {
				5092	PyErr_SetString(PyExc_ValueError,
				5093	"width too big");
				5094	goto onError;
				5095	}
				5096	width = width*10 + (c - '0');
				5097	}
				5098	}
				5099	if (c == '.') {
				5100	prec = 0;
				5101	if (--fmtcnt >= 0)
				5102	c = *fmt++;
				5103	if (c == '*') {
				5104	v = getnextarg(args, arglen, &argidx);
				5105	if (v == NULL)
				5106	goto onError;
				5107	if (!PyInt_Check(v)) {
				5108	PyErr_SetString(PyExc_TypeError,
				5109	"* wants int");
				5110	goto onError;
				5111	}
				5112	prec = PyInt_AsLong(v);
				5113	if (prec < 0)
				5114	prec = 0;
				5115	if (--fmtcnt >= 0)
				5116	c = *fmt++;
				5117	}
				5118	else if (c >= '0' && c <= '9') {
				5119	prec = c - '0';
				5120	while (--fmtcnt >= 0) {
				5121	c = Py_CHARMASK(*fmt++);
				5122	if (c < '0' \|\| c > '9')
				5123	break;
				5124	if ((prec*10) / 10 != prec) {
				5125	PyErr_SetString(PyExc_ValueError,
				5126	"prec too big");
				5127	goto onError;
				5128	}
				5129	prec = prec*10 + (c - '0');
				5130	}
				5131	}
				5132	} /* prec */
				5133	if (fmtcnt >= 0) {
				5134	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5135	if (--fmtcnt >= 0)
				5136	c = *fmt++;
				5137	}
				5138	}
				5139	if (fmtcnt < 0) {
				5140	PyErr_SetString(PyExc_ValueError,
				5141	"incomplete format");
				5142	goto onError;
				5143	}
				5144	if (c != '%') {
				5145	v = getnextarg(args, arglen, &argidx);
				5146	if (v == NULL)
				5147	goto onError;
				5148	}
				5149	sign = 0;
				5150	fill = ' ';
				5151	switch (c) {
				5152
				5153	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5154	pbuf = formatbuf;
				5155	/* presume that buffer length is at least 1 */
				5156	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5157	len = 1;
				5158	break;
				5159
				5160	case 's':
				5161	case 'r':
				5162	if (PyUnicode_Check(v) && c == 's') {
				5163	temp = v;
				5164	Py_INCREF(temp);
				5165	}
				5166	else {
				5167	PyObject *unicode;
				5168	if (c == 's')
				5169	temp = PyObject_Str(v);
				5170	else
				5171	temp = PyObject_Repr(v);
				5172	if (temp == NULL)
				5173	goto onError;
				5174	if (!PyString_Check(temp)) {
				5175	/* XXX Note: this should never happen, since
				5176	PyObject_Repr() and PyObject_Str() assure
				5177	this */
				5178	Py_DECREF(temp);
				5179	PyErr_SetString(PyExc_TypeError,
				5180	"%s argument has non-string str()");
				5181	goto onError;
				5182	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5183	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5184	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5185	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5186	"strict");
				5187	Py_DECREF(temp);
				5188	temp = unicode;
				5189	if (temp == NULL)
				5190	goto onError;
				5191	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5192	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5193	len = PyUnicode_GET_SIZE(temp);
				5194	if (prec >= 0 && len > prec)
				5195	len = prec;
				5196	break;
				5197
				5198	case 'i':
				5199	case 'd':
				5200	case 'u':
				5201	case 'o':
				5202	case 'x':
				5203	case 'X':
				5204	if (c == 'i')
				5205	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5206	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5207	temp = formatlong(v, flags, prec, c);
				5208	if (!temp)
				5209	goto onError;
				5210	pbuf = PyUnicode_AS_UNICODE(temp);
				5211	len = PyUnicode_GET_SIZE(temp);
				5212	/* unbounded ints can always produce
				5213	a sign character! */
				5214	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5215	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5216	else {
				5217	pbuf = formatbuf;
				5218	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5219	flags, prec, c, v);
				5220	if (len < 0)
				5221	goto onError;
				5222	/* only d conversion is signed */
				5223	sign = c == 'd';
				5224	}
				5225	if (flags & F_ZERO)
				5226	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5227	break;
				5228
				5229	case 'e':
				5230	case 'E':
				5231	case 'f':
				5232	case 'g':
				5233	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5234	pbuf = formatbuf;
				5235	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5236	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5237	if (len < 0)
				5238	goto onError;
				5239	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5240	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5241	fill = '0';
				5242	break;
				5243
				5244	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5245	pbuf = formatbuf;
				5246	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5247	if (len < 0)
				5248	goto onError;
				5249	break;
				5250
				5251	default:
				5252	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5253	"unsupported format character '%c' (0x%x) "
				5254	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5255	(31<=c && c<=126) ? c : '?',
				5256	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5257	goto onError;
				5258	}
				5259	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5260	if (pbuf == '-' \|\| pbuf == '+') {
				5261	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5262	len--;
				5263	}
				5264	else if (flags & F_SIGN)
				5265	sign = '+';
				5266	else if (flags & F_BLANK)
				5267	sign = ' ';
				5268	else
				5269	sign = 0;
				5270	}
				5271	if (width < len)
				5272	width = len;
				5273	if (rescnt < width + (sign != 0)) {
				5274	reslen -= rescnt;
				5275	rescnt = width + fmtcnt + 100;
				5276	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5277	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5278	return NULL;
				5279	res = PyUnicode_AS_UNICODE(result)
				5280	+ reslen - rescnt;
				5281	}
				5282	if (sign) {
				5283	if (fill != ' ')
				5284	*res++ = sign;
				5285	rescnt--;
				5286	if (width > len)
				5287	width--;
				5288	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5289	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5290	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5291	assert(pbuf[1] == c);
				5292	if (fill != ' ') {
				5293	res++ = pbuf++;
				5294	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5295	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5296	rescnt -= 2;
				5297	width -= 2;
				5298	if (width < 0)
				5299	width = 0;
				5300	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5301	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5302	if (width > len && !(flags & F_LJUST)) {
				5303	do {
				5304	--rescnt;
				5305	*res++ = fill;
				5306	} while (--width > len);
				5307	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5308	if (fill == ' ') {
				5309	if (sign)
				5310	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5311	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5312	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5313	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5314	res++ = pbuf++;
				5315	res++ = pbuf++;
				5316	}
				5317	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5318	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5319	res += len;
				5320	rescnt -= len;
				5321	while (--width >= len) {
				5322	--rescnt;
				5323	*res++ = ' ';
				5324	}
				5325	if (dict && (argidx < arglen) && c != '%') {
				5326	PyErr_SetString(PyExc_TypeError,
				5327	"not all arguments converted");
				5328	goto onError;
				5329	}
				5330	Py_XDECREF(temp);
				5331	} /* '%' */
				5332	} /* until end */
				5333	if (argidx < arglen && !dict) {
				5334	PyErr_SetString(PyExc_TypeError,
				5335	"not all arguments converted");
				5336	goto onError;
				5337	}
				5338
				5339	if (args_owned) {
				5340	Py_DECREF(args);
				5341	}
				5342	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5343	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5344	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5345	return (PyObject *)result;
				5346
				5347	onError:
				5348	Py_XDECREF(result);
				5349	Py_DECREF(uformat);
				5350	if (args_owned) {
				5351	Py_DECREF(args);
				5352	}
				5353	return NULL;
				5354	}
				5355
				5356	static PyBufferProcs unicode_as_buffer = {
				5357	(getreadbufferproc) unicode_buffer_getreadbuf,
				5358	(getwritebufferproc) unicode_buffer_getwritebuf,
				5359	(getsegcountproc) unicode_buffer_getsegcount,
				5360	(getcharbufferproc) unicode_buffer_getcharbuf,
				5361	};
				5362
				5363	PyTypeObject PyUnicode_Type = {
				5364	PyObject_HEAD_INIT(&PyType_Type)
				5365	0, /* ob_size */
				5366	"unicode", /* tp_name */
				5367	sizeof(PyUnicodeObject), /* tp_size */
				5368	0, /* tp_itemsize */
				5369	/* Slots */
				5370	(destructor)_PyUnicode_Free, /* tp_dealloc */
				5371	0, /* tp_print */
				5372	(getattrfunc)unicode_getattr, /* tp_getattr */
				5373	0, /* tp_setattr */
				5374	(cmpfunc) unicode_compare, /* tp_compare */
				5375	(reprfunc) unicode_repr, /* tp_repr */
				5376	0, /* tp_as_number */
				5377	&unicode_as_sequence, /* tp_as_sequence */
				5378	0, /* tp_as_mapping */
				5379	(hashfunc) unicode_hash, /* tp_hash*/
				5380	0, /* tp_call*/
				5381	(reprfunc) unicode_str, /* tp_str */
				5382	(getattrofunc) NULL, /* tp_getattro */
				5383	(setattrofunc) NULL, /* tp_setattro */
				5384	&unicode_as_buffer, /* tp_as_buffer */
				5385	Py_TPFLAGS_DEFAULT, /* tp_flags */
				5386	};
				5387
				5388	/* Initialize the Unicode implementation */
				5389
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5390	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5391	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5392	int i;
				5393
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5394	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5395	unicode_freelist = NULL;
				5396	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5397	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5398	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5399	for (i = 0; i < 256; i++)
				5400	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5401	}
				5402
				5403	/* Finalize the Unicode implementation */
				5404
				5405	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5406	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5407	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5408	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5409	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5410
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5411	Py_XDECREF(unicode_empty);
				5412	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5413
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5414	for (i = 0; i < 256; i++) {
				5415	if (unicode_latin1[i]) {
				5416	Py_DECREF(unicode_latin1[i]);
				5417	unicode_latin1[i] = NULL;
				5418	}
				5419	}
				5420
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5421	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5422	PyUnicodeObject *v = u;
				5423	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5424	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5425	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5426	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5427	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5428	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5429	unicode_freelist = NULL;
				5430	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5431	}