Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 1d0508cc8b3ad36d3a8e0692cc2148ed8080a94d [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
/* Limit for the Unicode object free list: unicode_dealloc() stops parking
   objects for reuse once this many are already on the list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size (in characters) less than
   this limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian.  Exactly one of the
   two BYTEORDER_IS_* macros is defined, chosen by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects: head of a singly linked list of parked
   objects, and the number of objects currently on it. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Indexed by code point; an entry stays NULL until the
   corresponding object is first created. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty \|\|
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
				204	unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
				205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	222	PyObject_DEL(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	227	void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	{
Guido van Rossum	604ddf8	2001-12-06 20:03:56 +0000	[diff] [blame]	229	if (PyUnicode_CheckExact(unicode) &&
				230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	231	/* Keep-Alive optimization */
				232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	233	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	234	unicode->str = NULL;
				235	unicode->length = 0;
				236	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	237	if (unicode->defenc) {
				238	Py_DECREF(unicode->defenc);
				239	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	240	}
				241	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	242	(PyUnicodeObject *)unicode = unicode_freelist;
				243	unicode_freelist = unicode;
				244	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	245	}
				246	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	247	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	248	Py_XDECREF(unicode->defenc);
Guido van Rossum	604ddf8	2001-12-06 20:03:56 +0000	[diff] [blame]	249	unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	250	}
				251	}
				252
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	253	int PyUnicode_Resize(PyObject **unicode,
				254	int length)
				255	{
				256	register PyUnicodeObject *v;
				257
				258	/* Argument checks */
				259	if (unicode == NULL) {
				260	PyErr_BadInternalCall();
				261	return -1;
				262	}
				263	v = (PyUnicodeObject )unicode;
				264	if (v == NULL \|\| !PyUnicode_Check(v) \|\| v->ob_refcnt != 1) {
				265	PyErr_BadInternalCall();
				266	return -1;
				267	}
				268
				269	/* Resizing unicode_empty and single character objects is not
				270	possible since these are being shared. We simply return a fresh
				271	copy with the same Unicode content. */
				272	if (v->length != length &&
				273	(v == unicode_empty \|\| v->length == 1)) {
				274	PyUnicodeObject *w = _PyUnicode_New(length);
				275	if (w == NULL)
				276	return -1;
				277	Py_UNICODE_COPY(w->str, v->str,
				278	length < v->length ? length : v->length);
				279	unicode = (PyObject )w;
				280	return 0;
				281	}
				282
				283	/* Note that we don't have to modify *unicode for unshared Unicode
				284	objects, since we can modify them in-place. */
				285	return unicode_resize(v, length);
				286	}
				287
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize() that casts the address
   of an object variable (unicodevar) to PyObject ** before the call. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				291
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	292	PyObject PyUnicode_FromUnicode(const Py_UNICODE u,
				293	int size)
				294	{
				295	PyUnicodeObject *unicode;
				296
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	297	/* If the Unicode data is known at construction time, we can apply
				298	some optimizations which share commonly used objects. */
				299	if (u != NULL) {
				300
				301	/* Optimization for empty strings */
				302	if (size == 0 && unicode_empty != NULL) {
				303	Py_INCREF(unicode_empty);
				304	return (PyObject *)unicode_empty;
				305	}
				306
				307	/* Single character Unicode objects in the Latin-1 range are
				308	shared when using this constructor */
				309	if (size == 1 && *u < 256) {
				310	unicode = unicode_latin1[*u];
				311	if (!unicode) {
				312	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	313	if (!unicode)
				314	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	315	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	316	unicode_latin1[*u] = unicode;
				317	}
				318	Py_INCREF(unicode);
				319	return (PyObject *)unicode;
				320	}
				321	}
				322
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	323	unicode = _PyUnicode_New(size);
				324	if (!unicode)
				325	return NULL;
				326
				327	/* Copy the Unicode data into the new object */
				328	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	329	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	330
				331	return (PyObject *)unicode;
				332	}
				333
				334	#ifdef HAVE_WCHAR_H
				335
				336	PyObject PyUnicode_FromWideChar(register const wchar_t w,
				337	int size)
				338	{
				339	PyUnicodeObject *unicode;
				340
				341	if (w == NULL) {
				342	PyErr_BadInternalCall();
				343	return NULL;
				344	}
				345
				346	unicode = _PyUnicode_New(size);
				347	if (!unicode)
				348	return NULL;
				349
				350	/* Copy the wchar_t data into the new object */
				351	#ifdef HAVE_USABLE_WCHAR_T
				352	memcpy(unicode->str, w, size * sizeof(wchar_t));
				353	#else
				354	{
				355	register Py_UNICODE *u;
				356	register int i;
				357	u = PyUnicode_AS_UNICODE(unicode);
				358	for (i = size; i >= 0; i--)
				359	u++ = w++;
				360	}
				361	#endif
				362
				363	return (PyObject *)unicode;
				364	}
				365
				366	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				367	register wchar_t *w,
				368	int size)
				369	{
				370	if (unicode == NULL) {
				371	PyErr_BadInternalCall();
				372	return -1;
				373	}
				374	if (size > PyUnicode_GET_SIZE(unicode))
				375	size = PyUnicode_GET_SIZE(unicode);
				376	#ifdef HAVE_USABLE_WCHAR_T
				377	memcpy(w, unicode->str, size * sizeof(wchar_t));
				378	#else
				379	{
				380	register Py_UNICODE *u;
				381	register int i;
				382	u = PyUnicode_AS_UNICODE(unicode);
				383	for (i = size; i >= 0; i--)
				384	w++ = u++;
				385	}
				386	#endif
				387
				388	return size;
				389	}
				390
				391	#endif
				392
				393	PyObject PyUnicode_FromObject(register PyObject obj)
				394	{
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	395	/* XXX Perhaps we should make this API an alias of
				396	PyObject_Unicode() instead ?! */
				397	if (PyUnicode_CheckExact(obj)) {
				398	Py_INCREF(obj);
				399	return obj;
				400	}
				401	if (PyUnicode_Check(obj)) {
				402	/* For a Unicode subtype that's not a Unicode object,
				403	return a true Unicode object with the same data. */
				404	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				405	PyUnicode_GET_SIZE(obj));
				406	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	407	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				408	}
				409
				410	PyObject PyUnicode_FromEncodedObject(register PyObject obj,
				411	const char *encoding,
				412	const char *errors)
				413	{
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	414	const char *s = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	415	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	416	int owned = 0;
				417	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	418
				419	if (obj == NULL) {
				420	PyErr_BadInternalCall();
				421	return NULL;
				422	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	423
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	424	#if 0
				425	/* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburg	b5507ec	2001-10-19 12:02:29 +0000	[diff] [blame]	426	that no encodings is given and then redirect to
				427	PyObject_Unicode() which then applies the additional logic for
				428	Unicode subclasses.
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	429
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	430	NOTE: This API should really only be used for object which
				431	represent encoded Unicode !
				432
				433	*/
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	434	if (PyUnicode_Check(obj)) {
				435	if (encoding) {
				436	PyErr_SetString(PyExc_TypeError,
				437	"decoding Unicode is not supported");
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	438	return NULL;
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	439	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	440	return PyObject_Unicode(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	441	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	442	#else
				443	if (PyUnicode_Check(obj)) {
				444	PyErr_SetString(PyExc_TypeError,
				445	"decoding Unicode is not supported");
				446	return NULL;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	447	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	448	#endif
				449
				450	/* Coerce object */
				451	if (PyString_Check(obj)) {
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	452	s = PyString_AS_STRING(obj);
				453	len = PyString_GET_SIZE(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	454	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	455	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				456	/* Overwrite the error message with something more useful in
				457	case of a TypeError. */
				458	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	459	PyErr_Format(PyExc_TypeError,
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	460	"coercing to Unicode: need string or buffer, "
				461	"%.80s found",
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	462	obj->ob_type->tp_name);
				463	goto onError;
				464	}
				465
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	466	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	467	if (len == 0) {
				468	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	471	else
				472	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	473
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	474	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	475	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	476	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	477	return v;
				478
				479	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	480	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	481	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	482	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	483	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	}
				485
				486	PyObject PyUnicode_Decode(const char s,
				487	int size,
				488	const char *encoding,
				489	const char *errors)
				490	{
				491	PyObject buffer = NULL, unicode;
				492
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	493	if (encoding == NULL)
				494	encoding = PyUnicode_GetDefaultEncoding();
				495
				496	/* Shortcuts for common default encodings */
				497	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	498	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	499	else if (strcmp(encoding, "latin-1") == 0)
				500	return PyUnicode_DecodeLatin1(s, size, errors);
				501	else if (strcmp(encoding, "ascii") == 0)
				502	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	503
				504	/* Decode via the codec registry */
				505	buffer = PyBuffer_FromMemory((void *)s, size);
				506	if (buffer == NULL)
				507	goto onError;
				508	unicode = PyCodec_Decode(buffer, encoding, errors);
				509	if (unicode == NULL)
				510	goto onError;
				511	if (!PyUnicode_Check(unicode)) {
				512	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	513	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	514	unicode->ob_type->tp_name);
				515	Py_DECREF(unicode);
				516	goto onError;
				517	}
				518	Py_DECREF(buffer);
				519	return unicode;
				520
				521	onError:
				522	Py_XDECREF(buffer);
				523	return NULL;
				524	}
				525
				526	PyObject PyUnicode_Encode(const Py_UNICODE s,
				527	int size,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject v, unicode;
				532
				533	unicode = PyUnicode_FromUnicode(s, size);
				534	if (unicode == NULL)
				535	return NULL;
				536	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				537	Py_DECREF(unicode);
				538	return v;
				539	}
				540
				541	PyObject PyUnicode_AsEncodedString(PyObject unicode,
				542	const char *encoding,
				543	const char *errors)
				544	{
				545	PyObject *v;
				546
				547	if (!PyUnicode_Check(unicode)) {
				548	PyErr_BadArgument();
				549	goto onError;
				550	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	551
				552	if (encoding == NULL)
				553	encoding = PyUnicode_GetDefaultEncoding();
				554
				555	/* Shortcuts for common default encodings */
				556	if (errors == NULL) {
				557	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	558	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	559	else if (strcmp(encoding, "latin-1") == 0)
				560	return PyUnicode_AsLatin1String(unicode);
				561	else if (strcmp(encoding, "ascii") == 0)
				562	return PyUnicode_AsASCIIString(unicode);
				563	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	564
				565	/* Encode via the codec registry */
				566	v = PyCodec_Encode(unicode, encoding, errors);
				567	if (v == NULL)
				568	goto onError;
				569	/* XXX Should we really enforce this ? */
				570	if (!PyString_Check(v)) {
				571	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	572	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	573	v->ob_type->tp_name);
				574	Py_DECREF(v);
				575	goto onError;
				576	}
				577	return v;
				578
				579	onError:
				580	return NULL;
				581	}
				582
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	583	PyObject _PyUnicode_AsDefaultEncodedString(PyObject unicode,
				584	const char *errors)
				585	{
				586	PyObject v = ((PyUnicodeObject )unicode)->defenc;
				587
				588	if (v)
				589	return v;
				590	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				591	if (v && errors == NULL)
				592	((PyUnicodeObject *)unicode)->defenc = v;
				593	return v;
				594	}
				595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	596	Py_UNICODE PyUnicode_AsUnicode(PyObject unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_AS_UNICODE(unicode);
				603
				604	onError:
				605	return NULL;
				606	}
				607
				608	int PyUnicode_GetSize(PyObject *unicode)
				609	{
				610	if (!PyUnicode_Check(unicode)) {
				611	PyErr_BadArgument();
				612	goto onError;
				613	}
				614	return PyUnicode_GET_SIZE(unicode);
				615
				616	onError:
				617	return -1;
				618	}
				619
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	620	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	621	{
				622	return unicode_default_encoding;
				623	}
				624
				625	int PyUnicode_SetDefaultEncoding(const char *encoding)
				626	{
				627	PyObject *v;
				628
				629	/* Make sure the encoding is valid. As side effect, this also
				630	loads the encoding into the codec registry cache. */
				631	v = _PyCodec_Lookup(encoding);
				632	if (v == NULL)
				633	goto onError;
				634	Py_DECREF(v);
				635	strncpy(unicode_default_encoding,
				636	encoding,
				637	sizeof(unicode_default_encoding));
				638	return 0;
				639
				640	onError:
				641	return -1;
				642	}
				643
Marc-André Lemburg	c60e6f7	2001-09-20 10:35:46 +0000	[diff] [blame]	644	/* --- UTF-7 Codec -------------------------------------------------------- */
				645
				646	/* see RFC2152 for details */
				647
/* Classification table for the 128 ASCII code points, indexed by
   character value; consulted by the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7F */

};
				666
				667	#define SPECIAL(c, encodeO, encodeWS) \
				668	(((c)>127 \|\| utf7_special[(c)] == 1) \|\| \
				669	(encodeWS && (utf7_special[(c)] == 2)) \|\| \
				670	(encodeO && (utf7_special[(c)] == 3)))
				671
				672	#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
				673	#define B64CHAR(c) (isalnum(c) \|\| (c) == '+' \|\| (c) == '/')
				674	#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
				675	(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
				676
/* Emit base64 characters through `out` while at least 6 bits are
   pending: each round writes the character for bits [bits-6, bits) of
   `ch` and lowers `bits` by 6.  `out` and `bits` are modified in place,
   so both must be lvalues; `ch` itself is not changed.  The expansion
   is a bare while statement, not an expression. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }
				682
/* Extract 16-bit characters from the bit accumulator `ch` while at
   least 16 bits are pending, lowering `bits` by 16 per character and
   storing accepted characters through `out`.

   The expansion site must provide a variable `errmsg` and a label
   `utf7Error`: a character in the range 0xDC00-0xDFFF sets `surrogate`,
   assigns `errmsg` and jumps to `utf7Error`.  While `surrogate` is set,
   the next extracted character is dropped and the flag is cleared.

   NOTE(review): the comments below speak of the *high* surrogate, but
   0xDC00-0xDFFF is the low surrogate range (high surrogates are
   0xD800-0xDBFF) -- confirm whether the tested range is intended. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    } \
				701
				702	static
				703	int utf7_decoding_error(Py_UNICODE **dest,
				704	const char *errors,
				705	const char *details)
				706	{
				707	if ((errors == NULL) \|\|
				708	(strcmp(errors,"strict") == 0)) {
				709	PyErr_Format(PyExc_UnicodeError,
				710	"UTF-7 decoding error: %.400s",
				711	details);
				712	return -1;
				713	}
				714	else if (strcmp(errors,"ignore") == 0) {
				715	return 0;
				716	}
				717	else if (strcmp(errors,"replace") == 0) {
				718	if (dest != NULL) {
				719	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				720	(*dest)++;
				721	}
				722	return 0;
				723	}
				724	else {
				725	PyErr_Format(PyExc_ValueError,
				726	"UTF-7 decoding error; unknown error handling code: %.400s",
				727	errors);
				728	return -1;
				729	}
				730	}
				731
				732	PyObject PyUnicode_DecodeUTF7(const char s,
				733	int size,
				734	const char *errors)
				735	{
				736	const char *e;
				737	PyUnicodeObject *unicode;
				738	Py_UNICODE *p;
				739	const char *errmsg = "";
				740	int inShift = 0;
				741	unsigned int bitsleft = 0;
				742	unsigned long charsleft = 0;
				743	int surrogate = 0;
				744
				745	unicode = _PyUnicode_New(size);
				746	if (!unicode)
				747	return NULL;
				748	if (size == 0)
				749	return (PyObject *)unicode;
				750
				751	p = unicode->str;
				752	e = s + size;
				753
				754	while (s < e) {
				755	Py_UNICODE ch = *s;
				756
				757	if (inShift) {
				758	if ((ch == '-') \|\| !B64CHAR(ch)) {
				759	inShift = 0;
				760	s++;
				761
				762	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				763	if (bitsleft >= 6) {
				764	/* The shift sequence has a partial character in it. If
				765	bitsleft < 6 then we could just classify it as padding
				766	but that is not the case here */
				767
				768	errmsg = "partial character in shift sequence";
				769	goto utf7Error;
				770	}
				771	/* According to RFC2152 the remaining bits should be zero. We
				772	choose to signal an error/insert a replacement character
				773	here so indicate the potential of a misencoded character. */
				774
				775	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				776	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				777	errmsg = "non-zero padding bits in shift sequence";
				778	goto utf7Error;
				779	}
				780
				781	if (ch == '-') {
				782	if ((s < e) && (*(s) == '-')) {
				783	*p++ = '-';
				784	inShift = 1;
				785	}
				786	} else if (SPECIAL(ch,0,0)) {
				787	errmsg = "unexpected special character";
				788	goto utf7Error;
				789	} else {
				790	*p++ = ch;
				791	}
				792	} else {
				793	charsleft = (charsleft << 6) \| UB64(ch);
				794	bitsleft += 6;
				795	s++;
				796	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				797	}
				798	}
				799	else if ( ch == '+' ) {
				800	s++;
				801	if (s < e && *s == '-') {
				802	s++;
				803	*p++ = '+';
				804	} else
				805	{
				806	inShift = 1;
				807	bitsleft = 0;
				808	}
				809	}
				810	else if (SPECIAL(ch,0,0)) {
				811	errmsg = "unexpected special character";
				812	s++;
				813	goto utf7Error;
				814	}
				815	else {
				816	*p++ = ch;
				817	s++;
				818	}
				819	continue;
				820	utf7Error:
				821	if (utf7_decoding_error(&p, errors, errmsg))
				822	goto onError;
				823	}
				824
				825	if (inShift) {
				826	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				827	goto onError;
				828	}
				829
				830	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				831	goto onError;
				832
				833	return (PyObject *)unicode;
				834
				835	onError:
				836	Py_DECREF(unicode);
				837	return NULL;
				838	}
				839
				840
				841	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				842	int size,
				843	int encodeSetO,
				844	int encodeWhiteSpace,
				845	const char *errors)
				846	{
				847	PyObject *v;
				848	/* It might be possible to tighten this worst case */
				849	unsigned int cbAllocated = 5 * size;
				850	int inShift = 0;
				851	int i = 0;
				852	unsigned int bitsleft = 0;
				853	unsigned long charsleft = 0;
				854	char * out;
				855	char * start;
				856
				857	if (size == 0)
				858	return PyString_FromStringAndSize(NULL, 0);
				859
				860	v = PyString_FromStringAndSize(NULL, cbAllocated);
				861	if (v == NULL)
				862	return NULL;
				863
				864	start = out = PyString_AS_STRING(v);
				865	for (;i < size; ++i) {
				866	Py_UNICODE ch = s[i];
				867
				868	if (!inShift) {
				869	if (ch == '+') {
				870	*out++ = '+';
				871	*out++ = '-';
				872	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				873	charsleft = ch;
				874	bitsleft = 16;
				875	*out++ = '+';
				876	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				877	inShift = bitsleft > 0;
				878	} else {
				879	*out++ = (char) ch;
				880	}
				881	} else {
				882	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				883	*out++ = B64(charsleft << (6-bitsleft));
				884	charsleft = 0;
				885	bitsleft = 0;
				886	/* Characters not in the BASE64 set implicitly unshift the sequence
				887	so no '-' is required, except if the character is itself a '-' */
				888	if (B64CHAR(ch) \|\| ch == '-') {
				889	*out++ = '-';
				890	}
				891	inShift = 0;
				892	*out++ = (char) ch;
				893	} else {
				894	bitsleft += 16;
				895	charsleft = (charsleft << 16) \| ch;
				896	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				897
				898	/* If the next character is special then we dont' need to terminate
				899	the shift sequence. If the next character is not a BASE64 character
				900	or '-' then the shift sequence will be terminated implicitly and we
				901	don't have to insert a '-'. */
				902
				903	if (bitsleft == 0) {
				904	if (i + 1 < size) {
				905	Py_UNICODE ch2 = s[i+1];
				906
				907	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				908
				909	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				910	*out++ = '-';
				911	inShift = 0;
				912	} else {
				913	inShift = 0;
				914	}
				915
				916	}
				917	else {
				918	*out++ = '-';
				919	inShift = 0;
				920	}
				921	}
				922	}
				923	}
				924	}
				925	if (bitsleft) {
				926	*out++= B64(charsleft << (6-bitsleft) );
				927	*out++ = '-';
				928	}
				929
				930	if (_PyString_Resize(&v, out - start)) {
				931	Py_DECREF(v);
				932	return NULL;
				933	}
				934	return v;
				935	}
				936
				937	#undef SPECIAL
				938	#undef B64
				939	#undef B64CHAR
				940	#undef UB64
				941	#undef ENCODE
				942	#undef DECODE
				943
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	944	/* --- UTF-8 Codec -------------------------------------------------------- */
				945
				946	static
				947	char utf8_code_length[256] = {
				948	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				949	illegal prefix. see RFC 2279 for details */
				950	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				951	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				952	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				953	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				954	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				955	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				956	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				957	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				958	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				959	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				960	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				961	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				962	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				963	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				964	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				965	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				966	};
				967
				968	static
				969	int utf8_decoding_error(const char **source,
				970	Py_UNICODE **dest,
				971	const char *errors,
				972	const char *details)
				973	{
				974	if ((errors == NULL) \|\|
				975	(strcmp(errors,"strict") == 0)) {
				976	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	977	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	978	details);
				979	return -1;
				980	}
				981	else if (strcmp(errors,"ignore") == 0) {
				982	(*source)++;
				983	return 0;
				984	}
				985	else if (strcmp(errors,"replace") == 0) {
				986	(*source)++;
				987	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				988	(*dest)++;
				989	return 0;
				990	}
				991	else {
				992	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	993	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	994	errors);
				995	return -1;
				996	}
				997	}
				998
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	PyObject PyUnicode_DecodeUTF8(const char s,
				1000	int size,
				1001	const char *errors)
				1002	{
				1003	int n;
				1004	const char *e;
				1005	PyUnicodeObject *unicode;
				1006	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1007	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008
				1009	/* Note: size will always be longer than the resulting Unicode
				1010	character count */
				1011	unicode = _PyUnicode_New(size);
				1012	if (!unicode)
				1013	return NULL;
				1014	if (size == 0)
				1015	return (PyObject *)unicode;
				1016
				1017	/* Unpack UTF-8 encoded data */
				1018	p = unicode->str;
				1019	e = s + size;
				1020
				1021	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1022	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023
				1024	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1025	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1026	s++;
				1027	continue;
				1028	}
				1029
				1030	n = utf8_code_length[ch];
				1031
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1032	if (s + n > e) {
				1033	errmsg = "unexpected end of data";
				1034	goto utf8Error;
				1035	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1036
				1037	switch (n) {
				1038
				1039	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1040	errmsg = "unexpected code byte";
				1041	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1042
				1043	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1044	errmsg = "internal error";
				1045	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1046
				1047	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1048	if ((s[1] & 0xc0) != 0x80) {
				1049	errmsg = "invalid data";
				1050	goto utf8Error;
				1051	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1052	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1053	if (ch < 0x80) {
				1054	errmsg = "illegal encoding";
				1055	goto utf8Error;
				1056	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1058	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1059	break;
				1060
				1061	case 3:
				1062	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1063	(s[2] & 0xc0) != 0x80) {
				1064	errmsg = "invalid data";
				1065	goto utf8Error;
				1066	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1067	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame^]	1068	if (ch < 0x0800) {
				1069	/* Note: UTF-8 encodings of surrogates are considered
				1070	legal UTF-8 sequences;
				1071
				1072	XXX For wide builds (UCS-4) we should probably try
				1073	to recombine the surrogates into a single code
				1074	unit.
				1075	*/
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1076	errmsg = "illegal encoding";
				1077	goto utf8Error;
				1078	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	else
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame^]	1080	*p++ = (Py_UNICODE)ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1081	break;
				1082
				1083	case 4:
				1084	if ((s[1] & 0xc0) != 0x80 \|\|
				1085	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1086	(s[3] & 0xc0) != 0x80) {
				1087	errmsg = "invalid data";
				1088	goto utf8Error;
				1089	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1090	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1091	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1092	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1093	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame^]	1094	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1095	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame^]	1096	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1097	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1098	errmsg = "illegal encoding";
				1099	goto utf8Error;
				1100	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1101	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1102	*p++ = (Py_UNICODE)ch;
				1103	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1104	/* compute and append the two surrogates: */
				1105
				1106	/* translate from 10000..10FFFF to 0..FFFF */
				1107	ch -= 0x10000;
				1108
				1109	/* high surrogate = top 10 bits added to D800 */
				1110	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1111
				1112	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1113	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1114	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	break;
				1116
				1117	default:
				1118	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1119	errmsg = "unsupported Unicode code range";
				1120	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1121	}
				1122	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1123	continue;
				1124
				1125	utf8Error:
				1126	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1127	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	}
				1129
				1130	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1131	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	goto onError;
				1133
				1134	return (PyObject *)unicode;
				1135
				1136	onError:
				1137	Py_DECREF(unicode);
				1138	return NULL;
				1139	}
				1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  The helper is kept compiled out.

   When enabled it applies the error policy named by `errors`:
   NULL or "strict" raises UnicodeError with `details`, "ignore" does
   nothing, "replace" stores '?' at **dest and advances *dest, and any
   other value raises ValueError.  It returns 0 to continue and -1 when an
   exception was set. */
#if 0
static int
utf8_encoding_error(const Py_UNICODE **source,
                    char **dest,
                    const char *errors,
                    const char *details)
{
    const int strict_mode =
        (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict_mode) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        **dest = '?';
        ++*dest;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1174
				1175	PyObject PyUnicode_EncodeUTF8(const Py_UNICODE s,
				1176	int size,
				1177	const char *errors)
				1178	{
				1179	PyObject *v;
				1180	char *p;
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1181	unsigned int cbAllocated = 2 * size;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1182	unsigned int cbWritten = 0;
				1183	int i = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1184
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame^]	1185	/* Short-cut for emtpy strings */
				1186	if (size == 0)
				1187	return PyString_FromStringAndSize(NULL, 0);
				1188
				1189	/* We allocate 4 more bytes to have room for at least one full
				1190	UTF-8 sequence; saves a few cycles in the loop below */
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1191	v = PyString_FromStringAndSize(NULL, cbAllocated + 4);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1192	if (v == NULL)
				1193	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1194
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1195	p = PyString_AS_STRING(v);
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1196	while (i < size) {
				1197	Py_UCS4 ch = s[i++];
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1198
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1199	if (ch < 0x80) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1200	*p++ = (char) ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1201	cbWritten++;
				1202	}
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1203
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1204	else if (ch < 0x0800) {
Marc-André Lemburg	dc724d6	2002-02-06 18:20:19 +0000	[diff] [blame]	1205	*p++ = (char)(0xc0 \| (ch >> 6));
				1206	*p++ = (char)(0x80 \| (ch & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1207	cbWritten += 2;
				1208	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1209
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1210	else {
				1211
				1212	/* Assure that we have enough room for high order Unicode
				1213	ordinals */
				1214	if (cbWritten >= cbAllocated) {
				1215	cbAllocated += 4 * 10;
				1216	if (_PyString_Resize(&v, cbAllocated + 4))
Marc-André Lemburg	e7c6ee4	2002-02-06 18:18:03 +0000	[diff] [blame]	1217	goto onError;
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1218	p = PyString_AS_STRING(v) + cbWritten;
Marc-André Lemburg	e7c6ee4	2002-02-06 18:18:03 +0000	[diff] [blame]	1219	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1220
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1221	if (ch < 0x10000) {
				1222	/* Check for high surrogate */
				1223	if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
				1224	Py_UCS4 ch2 = s[i];
				1225	/* Check for low surrogate */
				1226	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1227	ch = ((ch - 0xD800)<<10 \| (ch2-0xDC00))+0x10000;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1228	*p++ = (char)((ch >> 18) \| 0xf0);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1229	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1230	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1231	*p++ = (char)(0x80 \| (ch & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1232	i++;
				1233	cbWritten += 4;
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1234	continue;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1235	}
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1236	/* Fall through: handles isolated high surrogates */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1237	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1238	*p++ = (char)(0xe0 \| (ch >> 12));
Marc-André Lemburg	e7c6ee4	2002-02-06 18:18:03 +0000	[diff] [blame]	1239	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1240	*p++ = (char)(0x80 \| (ch & 0x3f));
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1241	cbWritten += 3;
				1242
Marc-André Lemburg	e7c6ee4	2002-02-06 18:18:03 +0000	[diff] [blame]	1243	} else {
Marc-André Lemburg	dc724d6	2002-02-06 18:20:19 +0000	[diff] [blame]	1244	*p++ = (char)(0xf0 \| (ch>>18));
				1245	*p++ = (char)(0x80 \| ((ch>>12) & 0x3f));
				1246	*p++ = (char)(0x80 \| ((ch>>6) & 0x3f));
				1247	*p++ = (char)(0x80 \| (ch & 0x3f));
Marc-André Lemburg	e7c6ee4	2002-02-06 18:18:03 +0000	[diff] [blame]	1248	cbWritten += 4;
				1249	}
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1250	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1251	}
				1252	*p = '\0';
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1253	if (_PyString_Resize(&v, cbWritten))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1254	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1255	return v;
				1256
				1257	onError:
				1258	Py_DECREF(v);
				1259	return NULL;
				1260	}
				1261
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1262	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1263	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1264	if (!PyUnicode_Check(unicode)) {
				1265	PyErr_BadArgument();
				1266	return NULL;
				1267	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1268	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1269	PyUnicode_GET_SIZE(unicode),
				1270	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1271	}
				1272
				1273	/* --- UTF-16 Codec ------------------------------------------------------- */
				1274
				1275	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1276	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1277	const char *errors,
				1278	const char *details)
				1279	{
				1280	if ((errors == NULL) \|\|
				1281	(strcmp(errors,"strict") == 0)) {
				1282	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1283	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1284	details);
				1285	return -1;
				1286	}
				1287	else if (strcmp(errors,"ignore") == 0) {
				1288	return 0;
				1289	}
				1290	else if (strcmp(errors,"replace") == 0) {
				1291	if (dest) {
				1292	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1293	(*dest)++;
				1294	}
				1295	return 0;
				1296	}
				1297	else {
				1298	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1299	"UTF-16 decoding error; "
				1300	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1301	errors);
				1302	return -1;
				1303	}
				1304	}
				1305
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1306	PyObject *
				1307	PyUnicode_DecodeUTF16(const char *s,
				1308	int size,
				1309	const char *errors,
				1310	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1311	{
				1312	PyUnicodeObject *unicode;
				1313	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1314	const unsigned char q, e;
				1315	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1316	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1317	/* Offsets from q for retrieving byte pairs in the right order. */
				1318	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1319	int ihi = 1, ilo = 0;
				1320	#else
				1321	int ihi = 0, ilo = 1;
				1322	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1323
				1324	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1325	if (size & 1) {
				1326	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1327	return NULL;
				1328	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1329	}
				1330
				1331	/* Note: size will always be longer than the resulting Unicode
				1332	character count */
				1333	unicode = _PyUnicode_New(size);
				1334	if (!unicode)
				1335	return NULL;
				1336	if (size == 0)
				1337	return (PyObject *)unicode;
				1338
				1339	/* Unpack UTF-16 encoded data */
				1340	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1341	q = (unsigned char *)s;
				1342	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1343
				1344	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1345	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1346
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1347	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1348	byte order setting accordingly. In native mode, the leading BOM
				1349	mark is skipped, in all other modes, it is copied to the output
				1350	stream as-is (giving a ZWNBSP character). */
				1351	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1352	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1353	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1354	if (bom == 0xFEFF) {
				1355	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1356	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1357	}
				1358	else if (bom == 0xFFFE) {
				1359	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1360	bo = 1;
				1361	}
				1362	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1363	if (bom == 0xFEFF) {
				1364	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1365	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1366	}
				1367	else if (bom == 0xFFFE) {
				1368	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1369	bo = -1;
				1370	}
				1371	#endif
				1372	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1373
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1374	if (bo == -1) {
				1375	/* force LE */
				1376	ihi = 1;
				1377	ilo = 0;
				1378	}
				1379	else if (bo == 1) {
				1380	/* force BE */
				1381	ihi = 0;
				1382	ilo = 1;
				1383	}
				1384
				1385	while (q < e) {
				1386	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1387	q += 2;
				1388
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1389	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1390	*p++ = ch;
				1391	continue;
				1392	}
				1393
				1394	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1395	if (q >= e) {
				1396	errmsg = "unexpected end of data";
				1397	goto utf16Error;
				1398	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1399	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1400	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1401	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1402	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1403	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1404	*p++ = ch;
				1405	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1406	#else
				1407	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1408	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1409	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1410	}
				1411	else {
				1412	errmsg = "illegal UTF-16 surrogate";
				1413	goto utf16Error;
				1414	}
				1415
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1416	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1417	errmsg = "illegal encoding";
				1418	/* Fall through to report the error */
				1419
				1420	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1421	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1422	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1423	}
				1424
				1425	if (byteorder)
				1426	*byteorder = bo;
				1427
				1428	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1429	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1430	goto onError;
				1431
				1432	return (PyObject *)unicode;
				1433
				1434	onError:
				1435	Py_DECREF(unicode);
				1436	return NULL;
				1437	}
				1438
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1439	PyObject *
				1440	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1441	int size,
				1442	const char *errors,
				1443	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1444	{
				1445	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1446	unsigned char *p;
				1447	int i, pairs;
				1448	/* Offsets from p for storing byte pairs in the right order. */
				1449	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1450	int ihi = 1, ilo = 0;
				1451	#else
				1452	int ihi = 0, ilo = 1;
				1453	#endif
				1454
				1455	#define STORECHAR(CH) \
				1456	do { \
				1457	p[ihi] = ((CH) >> 8) & 0xff; \
				1458	p[ilo] = (CH) & 0xff; \
				1459	p += 2; \
				1460	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1461
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1462	for (i = pairs = 0; i < size; i++)
				1463	if (s[i] >= 0x10000)
				1464	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1465	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1466	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1467	if (v == NULL)
				1468	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1469
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1470	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1471	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1472	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1473	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1474	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1475
				1476	if (byteorder == -1) {
				1477	/* force LE */
				1478	ihi = 1;
				1479	ilo = 0;
				1480	}
				1481	else if (byteorder == 1) {
				1482	/* force BE */
				1483	ihi = 0;
				1484	ilo = 1;
				1485	}
				1486
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1487	while (size-- > 0) {
				1488	Py_UNICODE ch = *s++;
				1489	Py_UNICODE ch2 = 0;
				1490	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1491	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1492	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1493	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1494	STORECHAR(ch);
				1495	if (ch2)
				1496	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1497	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1498	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1499	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1500	}
				1501
				1502	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1503	{
				1504	if (!PyUnicode_Check(unicode)) {
				1505	PyErr_BadArgument();
				1506	return NULL;
				1507	}
				1508	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1509	PyUnicode_GET_SIZE(unicode),
				1510	NULL,
				1511	0);
				1512	}
				1513
				1514	/* --- Unicode Escape Codec ----------------------------------------------- */
				1515
				1516	static
				1517	int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1518	Py_UNICODE *x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1519	const char *errors,
				1520	const char *details)
				1521	{
				1522	if ((errors == NULL) \|\|
				1523	(strcmp(errors,"strict") == 0)) {
				1524	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1525	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1526	details);
				1527	return -1;
				1528	}
				1529	else if (strcmp(errors,"ignore") == 0) {
				1530	return 0;
				1531	}
				1532	else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1533	*x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1534	return 0;
				1535	}
				1536	else {
				1537	PyErr_Format(PyExc_ValueError,
				1538	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1539	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1540	errors);
				1541	return -1;
				1542	}
				1543	}
				1544
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1545	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1546
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1547	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1548	int size,
				1549	const char *errors)
				1550	{
				1551	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1552	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1553	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1554	char* message;
				1555	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1556
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1557	/* Escaped strings will always be longer than the resulting
				1558	Unicode string, so we start with size here and then reduce the
				1559	length after conversion to the true value. */
				1560	v = _PyUnicode_New(size);
				1561	if (v == NULL)
				1562	goto onError;
				1563	if (size == 0)
				1564	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1565
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1566	p = buf = PyUnicode_AS_UNICODE(v);
				1567	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1568
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1569	while (s < end) {
				1570	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1571	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1572	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1573
				1574	/* Non-escape characters are interpreted as Unicode ordinals */
				1575	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1576	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1577	continue;
				1578	}
				1579
				1580	/* \ - Escapes */
				1581	s++;
				1582	switch (*s++) {
				1583
				1584	/* \x escapes */
				1585	case '\n': break;
				1586	case '\\': *p++ = '\\'; break;
				1587	case '\'': *p++ = '\''; break;
				1588	case '\"': *p++ = '\"'; break;
				1589	case 'b': *p++ = '\b'; break;
				1590	case 'f': p++ = '\014'; break; / FF */
				1591	case 't': *p++ = '\t'; break;
				1592	case 'n': *p++ = '\n'; break;
				1593	case 'r': *p++ = '\r'; break;
				1594	case 'v': p++ = '\013'; break; / VT */
				1595	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1596
				1597	/* \OOO (octal) escapes */
				1598	case '0': case '1': case '2': case '3':
				1599	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1600	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1601	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1602	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1603	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1604	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1605	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1606	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1607	break;
				1608
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1609	/* hex escapes */
				1610	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1611	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1612	digits = 2;
				1613	message = "truncated \\xXX escape";
				1614	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1615
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1616	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1617	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1618	digits = 4;
				1619	message = "truncated \\uXXXX escape";
				1620	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1621
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1622	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1623	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1624	digits = 8;
				1625	message = "truncated \\UXXXXXXXX escape";
				1626	hexescape:
				1627	chr = 0;
				1628	for (i = 0; i < digits; i++) {
				1629	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1630	if (!isxdigit(c)) {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1631	if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1632	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1633	chr = x;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1634	i++;
				1635	break;
				1636	}
				1637	chr = (chr<<4) & ~0xF;
				1638	if (c >= '0' && c <= '9')
				1639	chr += c - '0';
				1640	else if (c >= 'a' && c <= 'f')
				1641	chr += 10 + c - 'a';
				1642	else
				1643	chr += 10 + c - 'A';
				1644	}
				1645	s += i;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1646	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1647	/* when we get here, chr is a 32-bit unicode character */
				1648	if (chr <= 0xffff)
				1649	/* UCS-2 character */
				1650	*p++ = (Py_UNICODE) chr;
				1651	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1652	/* UCS-4 character. Either store directly, or as
				1653	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1654	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1655	*p++ = chr;
				1656	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1657	chr -= 0x10000L;
				1658	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1659	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1660	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1661	} else {
				1662	if (unicodeescape_decoding_error(
				1663	&s, &x, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1664	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1665	)
				1666	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1667	p++ = x; / store replacement character */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1668	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1669	break;
				1670
				1671	/* \N{name} */
				1672	case 'N':
				1673	message = "malformed \\N character escape";
				1674	if (ucnhash_CAPI == NULL) {
				1675	/* load the unicode data module */
				1676	PyObject m, v;
				1677	m = PyImport_ImportModule("unicodedata");
				1678	if (m == NULL)
				1679	goto ucnhashError;
				1680	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1681	Py_DECREF(m);
				1682	if (v == NULL)
				1683	goto ucnhashError;
				1684	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1685	Py_DECREF(v);
				1686	if (ucnhash_CAPI == NULL)
				1687	goto ucnhashError;
				1688	}
				1689	if (*s == '{') {
				1690	const char *start = s+1;
				1691	/* look for the closing brace */
				1692	while (*s != '}' && s < end)
				1693	s++;
				1694	if (s > start && s < end && *s == '}') {
				1695	/* found a name. look it up in the unicode database */
				1696	message = "unknown Unicode character name";
				1697	s++;
				1698	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1699	goto store;
				1700	}
				1701	}
				1702	if (unicodeescape_decoding_error(&s, &x, errors, message))
				1703	goto onError;
				1704	*p++ = x;
				1705	break;
				1706
				1707	default:
				1708	*p++ = '\\';
				1709	*p++ = (unsigned char)s[-1];
				1710	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1711	}
				1712	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1713	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	1714	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1715	return (PyObject *)v;
				1716
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1717	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1718	PyErr_SetString(
				1719	PyExc_UnicodeError,
				1720	"\\N escapes not supported (can't load unicodedata module)"
				1721	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1722	return NULL;
				1723
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1724	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1725	Py_XDECREF(v);
				1726	return NULL;
				1727	}
				1728
				1729	/* Return a Unicode-Escape string version of the Unicode object.
				1730
				1731	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1732	appropriate.
				1733
				1734	*/
				1735
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1736	static const Py_UNICODE findchar(const Py_UNICODE s,
				1737	int size,
				1738	Py_UNICODE ch);
				1739
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1740	static
				1741	PyObject unicodeescape_string(const Py_UNICODE s,
				1742	int size,
				1743	int quotes)
				1744	{
				1745	PyObject *repr;
				1746	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1747
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1748	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1749
				1750	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1751	if (repr == NULL)
				1752	return NULL;
				1753
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1754	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1755
				1756	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1757	*p++ = 'u';
				1758	*p++ = (findchar(s, size, '\'') &&
				1759	!findchar(s, size, '"')) ? '"' : '\'';
				1760	}
				1761	while (size-- > 0) {
				1762	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1763
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1764	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1765	if (quotes &&
				1766	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1767	*p++ = '\\';
				1768	*p++ = (char) ch;
Guido van Rossum	ad9744a	2001-09-21 15:38:17 +0000	[diff] [blame]	1769	continue;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1770	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1771
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1772	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1773	/* Map 21-bit characters to '\U00xxxxxx' */
				1774	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1775	int offset = p - PyString_AS_STRING(repr);
				1776
				1777	/* Resize the string if necessary */
				1778	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1779	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1780	goto onError;
				1781	p = PyString_AS_STRING(repr) + offset;
				1782	}
				1783
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1784	*p++ = '\\';
				1785	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1786	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1787	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1788	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1789	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1790	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1791	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1792	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1793	*p++ = hexdigit[ch & 0x0000000F];
				1794	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1795	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1796	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1797	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1798	else if (ch >= 0xD800 && ch < 0xDC00) {
				1799	Py_UNICODE ch2;
				1800	Py_UCS4 ucs;
				1801
				1802	ch2 = *s++;
				1803	size--;
				1804	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1805	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1806	*p++ = '\\';
				1807	*p++ = 'U';
				1808	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1809	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1810	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1811	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1812	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1813	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1814	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1815	*p++ = hexdigit[ucs & 0x0000000F];
				1816	continue;
				1817	}
				1818	/* Fall through: isolated surrogates are copied as-is */
				1819	s--;
				1820	size++;
				1821	}
				1822
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1823	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1824	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1825	*p++ = '\\';
				1826	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1827	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1828	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1829	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1830	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1831	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1832
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1833	/* Map special whitespace to '\t', \n', '\r' */
				1834	else if (ch == '\t') {
				1835	*p++ = '\\';
				1836	*p++ = 't';
				1837	}
				1838	else if (ch == '\n') {
				1839	*p++ = '\\';
				1840	*p++ = 'n';
				1841	}
				1842	else if (ch == '\r') {
				1843	*p++ = '\\';
				1844	*p++ = 'r';
				1845	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1846
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1847	/* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg	11326de	2001-11-28 12:56:20 +0000	[diff] [blame]	1848	else if (ch < ' ' \|\| ch >= 0x7F) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1849	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1850	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1851	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1852	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1853	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1854
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1855	/* Copy everything else as-is */
				1856	else
				1857	*p++ = (char) ch;
				1858	}
				1859	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1860	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1861
				1862	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1863	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1864	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1865
				1866	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1867
				1868	onError:
				1869	Py_DECREF(repr);
				1870	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1871	}
				1872
				1873	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1874	int size)
				1875	{
				1876	return unicodeescape_string(s, size, 0);
				1877	}
				1878
				1879	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1880	{
				1881	if (!PyUnicode_Check(unicode)) {
				1882	PyErr_BadArgument();
				1883	return NULL;
				1884	}
				1885	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1886	PyUnicode_GET_SIZE(unicode));
				1887	}
				1888
				1889	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1890
				1891	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1892	int size,
				1893	const char *errors)
				1894	{
				1895	PyUnicodeObject *v;
				1896	Py_UNICODE p, buf;
				1897	const char *end;
				1898	const char *bs;
				1899
				1900	/* Escaped strings will always be longer than the resulting
				1901	Unicode string, so we start with size here and then reduce the
				1902	length after conversion to the true value. */
				1903	v = _PyUnicode_New(size);
				1904	if (v == NULL)
				1905	goto onError;
				1906	if (size == 0)
				1907	return (PyObject *)v;
				1908	p = buf = PyUnicode_AS_UNICODE(v);
				1909	end = s + size;
				1910	while (s < end) {
				1911	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1912	Py_UNICODE x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1913	int i;
				1914
				1915	/* Non-escape characters are interpreted as Unicode ordinals */
				1916	if (*s != '\\') {
				1917	p++ = (unsigned char)s++;
				1918	continue;
				1919	}
				1920
				1921	/* \u-escapes are only interpreted iff the number of leading
				1922	backslashes if odd */
				1923	bs = s;
				1924	for (;s < end;) {
				1925	if (*s != '\\')
				1926	break;
				1927	p++ = (unsigned char)s++;
				1928	}
				1929	if (((s - bs) & 1) == 0 \|\|
				1930	s >= end \|\|
				1931	*s != 'u') {
				1932	continue;
				1933	}
				1934	p--;
				1935	s++;
				1936
				1937	/* \uXXXX with 4 hex digits */
				1938	for (x = 0, i = 0; i < 4; i++) {
				1939	c = (unsigned char)s[i];
				1940	if (!isxdigit(c)) {
				1941	if (unicodeescape_decoding_error(&s, &x, errors,
				1942	"truncated \\uXXXX"))
				1943	goto onError;
				1944	i++;
				1945	break;
				1946	}
				1947	x = (x<<4) & ~0xF;
				1948	if (c >= '0' && c <= '9')
				1949	x += c - '0';
				1950	else if (c >= 'a' && c <= 'f')
				1951	x += 10 + c - 'a';
				1952	else
				1953	x += 10 + c - 'A';
				1954	}
				1955	s += i;
				1956	*p++ = x;
				1957	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1958	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1959	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1960	return (PyObject *)v;
				1961
				1962	onError:
				1963	Py_XDECREF(v);
				1964	return NULL;
				1965	}
				1966
				1967	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1968	int size)
				1969	{
				1970	PyObject *repr;
				1971	char *p;
				1972	char *q;
				1973
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1974	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1975
				1976	repr = PyString_FromStringAndSize(NULL, 6 * size);
				1977	if (repr == NULL)
				1978	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1979	if (size == 0)
				1980	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1981
				1982	p = q = PyString_AS_STRING(repr);
				1983	while (size-- > 0) {
				1984	Py_UNICODE ch = *s++;
				1985	/* Map 16-bit characters to '\uxxxx' */
				1986	if (ch >= 256) {
				1987	*p++ = '\\';
				1988	*p++ = 'u';
				1989	*p++ = hexdigit[(ch >> 12) & 0xf];
				1990	*p++ = hexdigit[(ch >> 8) & 0xf];
				1991	*p++ = hexdigit[(ch >> 4) & 0xf];
				1992	*p++ = hexdigit[ch & 15];
				1993	}
				1994	/* Copy everything else as-is */
				1995	else
				1996	*p++ = (char) ch;
				1997	}
				1998	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1999	if (_PyString_Resize(&repr, p - q))
				2000	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2001
				2002	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2003
				2004	onError:
				2005	Py_DECREF(repr);
				2006	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2007	}
				2008
				2009	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				2010	{
				2011	if (!PyUnicode_Check(unicode)) {
				2012	PyErr_BadArgument();
				2013	return NULL;
				2014	}
				2015	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				2016	PyUnicode_GET_SIZE(unicode));
				2017	}
				2018
				2019	/* --- Latin-1 Codec ------------------------------------------------------ */
				2020
				2021	PyObject PyUnicode_DecodeLatin1(const char s,
				2022	int size,
				2023	const char *errors)
				2024	{
				2025	PyUnicodeObject *v;
				2026	Py_UNICODE *p;
				2027
				2028	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2029	if (size == 1 && (unsigned char)s < 256) {
				2030	Py_UNICODE r = (unsigned char)s;
				2031	return PyUnicode_FromUnicode(&r, 1);
				2032	}
				2033
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2034	v = _PyUnicode_New(size);
				2035	if (v == NULL)
				2036	goto onError;
				2037	if (size == 0)
				2038	return (PyObject *)v;
				2039	p = PyUnicode_AS_UNICODE(v);
				2040	while (size-- > 0)
				2041	p++ = (unsigned char)s++;
				2042	return (PyObject *)v;
				2043
				2044	onError:
				2045	Py_XDECREF(v);
				2046	return NULL;
				2047	}
				2048
				2049	static
				2050	int latin1_encoding_error(const Py_UNICODE **source,
				2051	char **dest,
				2052	const char *errors,
				2053	const char *details)
				2054	{
				2055	if ((errors == NULL) \|\|
				2056	(strcmp(errors,"strict") == 0)) {
				2057	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2058	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2059	details);
				2060	return -1;
				2061	}
				2062	else if (strcmp(errors,"ignore") == 0) {
				2063	return 0;
				2064	}
				2065	else if (strcmp(errors,"replace") == 0) {
				2066	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2067	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2068	return 0;
				2069	}
				2070	else {
				2071	PyErr_Format(PyExc_ValueError,
				2072	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2073	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2074	errors);
				2075	return -1;
				2076	}
				2077	}
				2078
				2079	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2080	int size,
				2081	const char *errors)
				2082	{
				2083	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2084	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2085
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2086	repr = PyString_FromStringAndSize(NULL, size);
				2087	if (repr == NULL)
				2088	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2089	if (size == 0)
				2090	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2091
				2092	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2093	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2094	while (size-- > 0) {
				2095	Py_UNICODE ch = *p++;
				2096	if (ch >= 256) {
				2097	if (latin1_encoding_error(&p, &s, errors,
				2098	"ordinal not in range(256)"))
				2099	goto onError;
				2100	}
				2101	else
				2102	*s++ = (char)ch;
				2103	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2104	/* Resize if error handling skipped some characters */
				2105	if (s - start < PyString_GET_SIZE(repr))
				2106	if (_PyString_Resize(&repr, s - start))
				2107	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2108	return repr;
				2109
				2110	onError:
				2111	Py_DECREF(repr);
				2112	return NULL;
				2113	}
				2114
				2115	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2116	{
				2117	if (!PyUnicode_Check(unicode)) {
				2118	PyErr_BadArgument();
				2119	return NULL;
				2120	}
				2121	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2122	PyUnicode_GET_SIZE(unicode),
				2123	NULL);
				2124	}
				2125
				2126	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2127
				2128	static
				2129	int ascii_decoding_error(const char **source,
				2130	Py_UNICODE **dest,
				2131	const char *errors,
				2132	const char *details)
				2133	{
				2134	if ((errors == NULL) \|\|
				2135	(strcmp(errors,"strict") == 0)) {
				2136	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2137	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2138	details);
				2139	return -1;
				2140	}
				2141	else if (strcmp(errors,"ignore") == 0) {
				2142	return 0;
				2143	}
				2144	else if (strcmp(errors,"replace") == 0) {
				2145	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2146	(*dest)++;
				2147	return 0;
				2148	}
				2149	else {
				2150	PyErr_Format(PyExc_ValueError,
				2151	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2152	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2153	errors);
				2154	return -1;
				2155	}
				2156	}
				2157
				2158	PyObject PyUnicode_DecodeASCII(const char s,
				2159	int size,
				2160	const char *errors)
				2161	{
				2162	PyUnicodeObject *v;
				2163	Py_UNICODE *p;
				2164
				2165	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2166	if (size == 1 && (unsigned char)s < 128) {
				2167	Py_UNICODE r = (unsigned char)s;
				2168	return PyUnicode_FromUnicode(&r, 1);
				2169	}
				2170
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2171	v = _PyUnicode_New(size);
				2172	if (v == NULL)
				2173	goto onError;
				2174	if (size == 0)
				2175	return (PyObject *)v;
				2176	p = PyUnicode_AS_UNICODE(v);
				2177	while (size-- > 0) {
				2178	register unsigned char c;
				2179
				2180	c = (unsigned char)*s++;
				2181	if (c < 128)
				2182	*p++ = c;
				2183	else if (ascii_decoding_error(&s, &p, errors,
				2184	"ordinal not in range(128)"))
				2185	goto onError;
				2186	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2187	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2188	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2189	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2190	return (PyObject *)v;
				2191
				2192	onError:
				2193	Py_XDECREF(v);
				2194	return NULL;
				2195	}
				2196
				2197	static
				2198	int ascii_encoding_error(const Py_UNICODE **source,
				2199	char **dest,
				2200	const char *errors,
				2201	const char *details)
				2202	{
				2203	if ((errors == NULL) \|\|
				2204	(strcmp(errors,"strict") == 0)) {
				2205	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2206	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2207	details);
				2208	return -1;
				2209	}
				2210	else if (strcmp(errors,"ignore") == 0) {
				2211	return 0;
				2212	}
				2213	else if (strcmp(errors,"replace") == 0) {
				2214	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2215	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2216	return 0;
				2217	}
				2218	else {
				2219	PyErr_Format(PyExc_ValueError,
				2220	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2221	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2222	errors);
				2223	return -1;
				2224	}
				2225	}
				2226
				2227	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2228	int size,
				2229	const char *errors)
				2230	{
				2231	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2232	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2233
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2234	repr = PyString_FromStringAndSize(NULL, size);
				2235	if (repr == NULL)
				2236	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2237	if (size == 0)
				2238	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2239
				2240	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2241	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2242	while (size-- > 0) {
				2243	Py_UNICODE ch = *p++;
				2244	if (ch >= 128) {
				2245	if (ascii_encoding_error(&p, &s, errors,
				2246	"ordinal not in range(128)"))
				2247	goto onError;
				2248	}
				2249	else
				2250	*s++ = (char)ch;
				2251	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2252	/* Resize if error handling skipped some characters */
				2253	if (s - start < PyString_GET_SIZE(repr))
				2254	if (_PyString_Resize(&repr, s - start))
				2255	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2256	return repr;
				2257
				2258	onError:
				2259	Py_DECREF(repr);
				2260	return NULL;
				2261	}
				2262
				2263	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2264	{
				2265	if (!PyUnicode_Check(unicode)) {
				2266	PyErr_BadArgument();
				2267	return NULL;
				2268	}
				2269	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2270	PyUnicode_GET_SIZE(unicode),
				2271	NULL);
				2272	}
				2273
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2274	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2275
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2276	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2277
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2278	PyObject PyUnicode_DecodeMBCS(const char s,
				2279	int size,
				2280	const char *errors)
				2281	{
				2282	PyUnicodeObject *v;
				2283	Py_UNICODE *p;
				2284
				2285	/* First get the size of the result */
				2286	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2287	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2288	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2289
				2290	v = _PyUnicode_New(usize);
				2291	if (v == NULL)
				2292	return NULL;
				2293	if (usize == 0)
				2294	return (PyObject *)v;
				2295	p = PyUnicode_AS_UNICODE(v);
				2296	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2297	Py_DECREF(v);
				2298	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2299	}
				2300
				2301	return (PyObject *)v;
				2302	}
				2303
				2304	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2305	int size,
				2306	const char *errors)
				2307	{
				2308	PyObject *repr;
				2309	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2310	DWORD mbcssize;
				2311
				2312	/* If there are no characters, bail now! */
				2313	if (size==0)
				2314	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2315
				2316	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2317	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2318	if (mbcssize==0)
				2319	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2320
				2321	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2322	if (repr == NULL)
				2323	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2324	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2325	return repr;
				2326
				2327	/* Do the conversion */
				2328	s = PyString_AS_STRING(repr);
				2329	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2330	Py_DECREF(repr);
				2331	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2332	}
				2333	return repr;
				2334	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2335
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2336	#endif /* MS_WIN32 */
				2337
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2338	/* --- Character Mapping Codec -------------------------------------------- */
				2339
				2340	static
				2341	int charmap_decoding_error(const char **source,
				2342	Py_UNICODE **dest,
				2343	const char *errors,
				2344	const char *details)
				2345	{
				2346	if ((errors == NULL) \|\|
				2347	(strcmp(errors,"strict") == 0)) {
				2348	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2349	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2350	details);
				2351	return -1;
				2352	}
				2353	else if (strcmp(errors,"ignore") == 0) {
				2354	return 0;
				2355	}
				2356	else if (strcmp(errors,"replace") == 0) {
				2357	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2358	(*dest)++;
				2359	return 0;
				2360	}
				2361	else {
				2362	PyErr_Format(PyExc_ValueError,
				2363	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2364	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2365	errors);
				2366	return -1;
				2367	}
				2368	}
				2369
				2370	PyObject PyUnicode_DecodeCharmap(const char s,
				2371	int size,
				2372	PyObject *mapping,
				2373	const char *errors)
				2374	{
				2375	PyUnicodeObject *v;
				2376	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2377	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2378
				2379	/* Default to Latin-1 */
				2380	if (mapping == NULL)
				2381	return PyUnicode_DecodeLatin1(s, size, errors);
				2382
				2383	v = _PyUnicode_New(size);
				2384	if (v == NULL)
				2385	goto onError;
				2386	if (size == 0)
				2387	return (PyObject *)v;
				2388	p = PyUnicode_AS_UNICODE(v);
				2389	while (size-- > 0) {
				2390	unsigned char ch = *s++;
				2391	PyObject w, x;
				2392
				2393	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2394	w = PyInt_FromLong((long)ch);
				2395	if (w == NULL)
				2396	goto onError;
				2397	x = PyObject_GetItem(mapping, w);
				2398	Py_DECREF(w);
				2399	if (x == NULL) {
				2400	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2401	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2402	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2403	x = Py_None;
				2404	Py_INCREF(x);
				2405	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2406	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2407	}
				2408
				2409	/* Apply mapping */
				2410	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2411	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2412	if (value < 0 \|\| value > 65535) {
				2413	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2414	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2415	Py_DECREF(x);
				2416	goto onError;
				2417	}
				2418	*p++ = (Py_UNICODE)value;
				2419	}
				2420	else if (x == Py_None) {
				2421	/* undefined mapping */
				2422	if (charmap_decoding_error(&s, &p, errors,
				2423	"character maps to <undefined>")) {
				2424	Py_DECREF(x);
				2425	goto onError;
				2426	}
				2427	}
				2428	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2429	int targetsize = PyUnicode_GET_SIZE(x);
				2430
				2431	if (targetsize == 1)
				2432	/* 1-1 mapping */
				2433	p++ = PyUnicode_AS_UNICODE(x);
				2434
				2435	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2436	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2437	if (targetsize > extrachars) {
				2438	/* resize first */
				2439	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2440	int needed = (targetsize - extrachars) + \
				2441	(targetsize << 2);
				2442	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2443	if (_PyUnicode_Resize(&v,
				2444	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2445	Py_DECREF(x);
				2446	goto onError;
				2447	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2448	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2449	}
				2450	Py_UNICODE_COPY(p,
				2451	PyUnicode_AS_UNICODE(x),
				2452	targetsize);
				2453	p += targetsize;
				2454	extrachars -= targetsize;
				2455	}
				2456	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2457	}
				2458	else {
				2459	/* wrong return value */
				2460	PyErr_SetString(PyExc_TypeError,
				2461	"character mapping must return integer, None or unicode");
				2462	Py_DECREF(x);
				2463	goto onError;
				2464	}
				2465	Py_DECREF(x);
				2466	}
				2467	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2468	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2469	goto onError;
				2470	return (PyObject *)v;
				2471
				2472	onError:
				2473	Py_XDECREF(v);
				2474	return NULL;
				2475	}
				2476
				2477	static
				2478	int charmap_encoding_error(const Py_UNICODE **source,
				2479	char **dest,
				2480	const char *errors,
				2481	const char *details)
				2482	{
				2483	if ((errors == NULL) \|\|
				2484	(strcmp(errors,"strict") == 0)) {
				2485	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2486	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2487	details);
				2488	return -1;
				2489	}
				2490	else if (strcmp(errors,"ignore") == 0) {
				2491	return 0;
				2492	}
				2493	else if (strcmp(errors,"replace") == 0) {
				2494	**dest = '?';
				2495	(*dest)++;
				2496	return 0;
				2497	}
				2498	else {
				2499	PyErr_Format(PyExc_ValueError,
				2500	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2501	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2502	errors);
				2503	return -1;
				2504	}
				2505	}
				2506
				2507	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2508	int size,
				2509	PyObject *mapping,
				2510	const char *errors)
				2511	{
				2512	PyObject *v;
				2513	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2514	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2515
				2516	/* Default to Latin-1 */
				2517	if (mapping == NULL)
				2518	return PyUnicode_EncodeLatin1(p, size, errors);
				2519
				2520	v = PyString_FromStringAndSize(NULL, size);
				2521	if (v == NULL)
				2522	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2523	if (size == 0)
				2524	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2525	s = PyString_AS_STRING(v);
				2526	while (size-- > 0) {
				2527	Py_UNICODE ch = *p++;
				2528	PyObject w, x;
				2529
				2530	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2531	w = PyInt_FromLong((long)ch);
				2532	if (w == NULL)
				2533	goto onError;
				2534	x = PyObject_GetItem(mapping, w);
				2535	Py_DECREF(w);
				2536	if (x == NULL) {
				2537	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2538	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2539	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2540	x = Py_None;
				2541	Py_INCREF(x);
				2542	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2543	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2544	}
				2545
				2546	/* Apply mapping */
				2547	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2548	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2549	if (value < 0 \|\| value > 255) {
				2550	PyErr_SetString(PyExc_TypeError,
				2551	"character mapping must be in range(256)");
				2552	Py_DECREF(x);
				2553	goto onError;
				2554	}
				2555	*s++ = (char)value;
				2556	}
				2557	else if (x == Py_None) {
				2558	/* undefined mapping */
				2559	if (charmap_encoding_error(&p, &s, errors,
				2560	"character maps to <undefined>")) {
				2561	Py_DECREF(x);
				2562	goto onError;
				2563	}
				2564	}
				2565	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2566	int targetsize = PyString_GET_SIZE(x);
				2567
				2568	if (targetsize == 1)
				2569	/* 1-1 mapping */
				2570	s++ = PyString_AS_STRING(x);
				2571
				2572	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2573	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2574	if (targetsize > extrachars) {
				2575	/* resize first */
				2576	int oldpos = (int)(s - PyString_AS_STRING(v));
				2577	int needed = (targetsize - extrachars) + \
				2578	(targetsize << 2);
				2579	extrachars += needed;
				2580	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2581	Py_DECREF(x);
				2582	goto onError;
				2583	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2584	s = PyString_AS_STRING(v) + oldpos;
				2585	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2586	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2587	s += targetsize;
				2588	extrachars -= targetsize;
				2589	}
				2590	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2591	}
				2592	else {
				2593	/* wrong return value */
				2594	PyErr_SetString(PyExc_TypeError,
				2595	"character mapping must return integer, None or unicode");
				2596	Py_DECREF(x);
				2597	goto onError;
				2598	}
				2599	Py_DECREF(x);
				2600	}
				2601	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2602	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2603	goto onError;
				2604	return v;
				2605
				2606	onError:
				2607	Py_DECREF(v);
				2608	return NULL;
				2609	}
				2610
				2611	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2612	PyObject *mapping)
				2613	{
				2614	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2615	PyErr_BadArgument();
				2616	return NULL;
				2617	}
				2618	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2619	PyUnicode_GET_SIZE(unicode),
				2620	mapping,
				2621	NULL);
				2622	}
				2623
				2624	static
				2625	int translate_error(const Py_UNICODE **source,
				2626	Py_UNICODE **dest,
				2627	const char *errors,
				2628	const char *details)
				2629	{
				2630	if ((errors == NULL) \|\|
				2631	(strcmp(errors,"strict") == 0)) {
				2632	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2633	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2634	details);
				2635	return -1;
				2636	}
				2637	else if (strcmp(errors,"ignore") == 0) {
				2638	return 0;
				2639	}
				2640	else if (strcmp(errors,"replace") == 0) {
				2641	**dest = '?';
				2642	(*dest)++;
				2643	return 0;
				2644	}
				2645	else {
				2646	PyErr_Format(PyExc_ValueError,
				2647	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2648	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2649	errors);
				2650	return -1;
				2651	}
				2652	}
				2653
				2654	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2655	int size,
				2656	PyObject *mapping,
				2657	const char *errors)
				2658	{
				2659	PyUnicodeObject *v;
				2660	Py_UNICODE *p;
				2661
				2662	if (mapping == NULL) {
				2663	PyErr_BadArgument();
				2664	return NULL;
				2665	}
				2666
				2667	/* Output will never be longer than input */
				2668	v = _PyUnicode_New(size);
				2669	if (v == NULL)
				2670	goto onError;
				2671	if (size == 0)
				2672	goto done;
				2673	p = PyUnicode_AS_UNICODE(v);
				2674	while (size-- > 0) {
				2675	Py_UNICODE ch = *s++;
				2676	PyObject w, x;
				2677
				2678	/* Get mapping */
				2679	w = PyInt_FromLong(ch);
				2680	if (w == NULL)
				2681	goto onError;
				2682	x = PyObject_GetItem(mapping, w);
				2683	Py_DECREF(w);
				2684	if (x == NULL) {
				2685	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2686	/* No mapping found: default to 1-1 mapping */
				2687	PyErr_Clear();
				2688	*p++ = ch;
				2689	continue;
				2690	}
				2691	goto onError;
				2692	}
				2693
				2694	/* Apply mapping */
				2695	if (PyInt_Check(x))
				2696	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2697	else if (x == Py_None) {
				2698	/* undefined mapping */
				2699	if (translate_error(&s, &p, errors,
				2700	"character maps to <undefined>")) {
				2701	Py_DECREF(x);
				2702	goto onError;
				2703	}
				2704	}
				2705	else if (PyUnicode_Check(x)) {
				2706	if (PyUnicode_GET_SIZE(x) != 1) {
				2707	/* 1-n mapping */
				2708	PyErr_SetString(PyExc_NotImplementedError,
				2709	"1-n mappings are currently not implemented");
				2710	Py_DECREF(x);
				2711	goto onError;
				2712	}
				2713	p++ = PyUnicode_AS_UNICODE(x);
				2714	}
				2715	else {
				2716	/* wrong return value */
				2717	PyErr_SetString(PyExc_TypeError,
				2718	"translate mapping must return integer, None or unicode");
				2719	Py_DECREF(x);
				2720	goto onError;
				2721	}
				2722	Py_DECREF(x);
				2723	}
				2724	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2725	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2726	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2727
				2728	done:
				2729	return (PyObject *)v;
				2730
				2731	onError:
				2732	Py_XDECREF(v);
				2733	return NULL;
				2734	}
				2735
				2736	PyObject PyUnicode_Translate(PyObject str,
				2737	PyObject *mapping,
				2738	const char *errors)
				2739	{
				2740	PyObject *result;
				2741
				2742	str = PyUnicode_FromObject(str);
				2743	if (str == NULL)
				2744	goto onError;
				2745	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2746	PyUnicode_GET_SIZE(str),
				2747	mapping,
				2748	errors);
				2749	Py_DECREF(str);
				2750	return result;
				2751
				2752	onError:
				2753	Py_XDECREF(str);
				2754	return NULL;
				2755	}
				2756
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2757	/* --- Decimal Encoder ---------------------------------------------------- */
				2758
				2759	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2760	int length,
				2761	char *output,
				2762	const char *errors)
				2763	{
				2764	Py_UNICODE p, end;
				2765
				2766	if (output == NULL) {
				2767	PyErr_BadArgument();
				2768	return -1;
				2769	}
				2770
				2771	p = s;
				2772	end = s + length;
				2773	while (p < end) {
				2774	register Py_UNICODE ch = *p++;
				2775	int decimal;
				2776
				2777	if (Py_UNICODE_ISSPACE(ch)) {
				2778	*output++ = ' ';
				2779	continue;
				2780	}
				2781	decimal = Py_UNICODE_TODECIMAL(ch);
				2782	if (decimal >= 0) {
				2783	*output++ = '0' + decimal;
				2784	continue;
				2785	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2786	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2787	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2788	continue;
				2789	}
				2790	/* All other characters are considered invalid */
				2791	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2792	PyErr_SetString(PyExc_ValueError,
				2793	"invalid decimal Unicode string");
				2794	goto onError;
				2795	}
				2796	else if (strcmp(errors, "ignore") == 0)
				2797	continue;
				2798	else if (strcmp(errors, "replace") == 0) {
				2799	*output++ = '?';
				2800	continue;
				2801	}
				2802	}
				2803	/* 0-terminate the output string */
				2804	*output++ = '\0';
				2805	return 0;
				2806
				2807	onError:
				2808	return -1;
				2809	}
				2810
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2811	/* --- Helpers ------------------------------------------------------------ */
				2812
				2813	static
				2814	int count(PyUnicodeObject *self,
				2815	int start,
				2816	int end,
				2817	PyUnicodeObject *substring)
				2818	{
				2819	int count = 0;
				2820
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2821	if (start < 0)
				2822	start += self->length;
				2823	if (start < 0)
				2824	start = 0;
				2825	if (end > self->length)
				2826	end = self->length;
				2827	if (end < 0)
				2828	end += self->length;
				2829	if (end < 0)
				2830	end = 0;
				2831
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2832	if (substring->length == 0)
				2833	return (end - start + 1);
				2834
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2835	end -= substring->length;
				2836
				2837	while (start <= end)
				2838	if (Py_UNICODE_MATCH(self, start, substring)) {
				2839	count++;
				2840	start += substring->length;
				2841	} else
				2842	start++;
				2843
				2844	return count;
				2845	}
				2846
				2847	int PyUnicode_Count(PyObject *str,
				2848	PyObject *substr,
				2849	int start,
				2850	int end)
				2851	{
				2852	int result;
				2853
				2854	str = PyUnicode_FromObject(str);
				2855	if (str == NULL)
				2856	return -1;
				2857	substr = PyUnicode_FromObject(substr);
				2858	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2859	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2860	return -1;
				2861	}
				2862
				2863	result = count((PyUnicodeObject *)str,
				2864	start, end,
				2865	(PyUnicodeObject *)substr);
				2866
				2867	Py_DECREF(str);
				2868	Py_DECREF(substr);
				2869	return result;
				2870	}
				2871
				2872	static
				2873	int findstring(PyUnicodeObject *self,
				2874	PyUnicodeObject *substring,
				2875	int start,
				2876	int end,
				2877	int direction)
				2878	{
				2879	if (start < 0)
				2880	start += self->length;
				2881	if (start < 0)
				2882	start = 0;
				2883
				2884	if (substring->length == 0)
				2885	return start;
				2886
				2887	if (end > self->length)
				2888	end = self->length;
				2889	if (end < 0)
				2890	end += self->length;
				2891	if (end < 0)
				2892	end = 0;
				2893
				2894	end -= substring->length;
				2895
				2896	if (direction < 0) {
				2897	for (; end >= start; end--)
				2898	if (Py_UNICODE_MATCH(self, end, substring))
				2899	return end;
				2900	} else {
				2901	for (; start <= end; start++)
				2902	if (Py_UNICODE_MATCH(self, start, substring))
				2903	return start;
				2904	}
				2905
				2906	return -1;
				2907	}
				2908
				2909	int PyUnicode_Find(PyObject *str,
				2910	PyObject *substr,
				2911	int start,
				2912	int end,
				2913	int direction)
				2914	{
				2915	int result;
				2916
				2917	str = PyUnicode_FromObject(str);
				2918	if (str == NULL)
				2919	return -1;
				2920	substr = PyUnicode_FromObject(substr);
				2921	if (substr == NULL) {
				2922	Py_DECREF(substr);
				2923	return -1;
				2924	}
				2925
				2926	result = findstring((PyUnicodeObject *)str,
				2927	(PyUnicodeObject *)substr,
				2928	start, end, direction);
				2929	Py_DECREF(str);
				2930	Py_DECREF(substr);
				2931	return result;
				2932	}
				2933
				2934	static
				2935	int tailmatch(PyUnicodeObject *self,
				2936	PyUnicodeObject *substring,
				2937	int start,
				2938	int end,
				2939	int direction)
				2940	{
				2941	if (start < 0)
				2942	start += self->length;
				2943	if (start < 0)
				2944	start = 0;
				2945
				2946	if (substring->length == 0)
				2947	return 1;
				2948
				2949	if (end > self->length)
				2950	end = self->length;
				2951	if (end < 0)
				2952	end += self->length;
				2953	if (end < 0)
				2954	end = 0;
				2955
				2956	end -= substring->length;
				2957	if (end < start)
				2958	return 0;
				2959
				2960	if (direction > 0) {
				2961	if (Py_UNICODE_MATCH(self, end, substring))
				2962	return 1;
				2963	} else {
				2964	if (Py_UNICODE_MATCH(self, start, substring))
				2965	return 1;
				2966	}
				2967
				2968	return 0;
				2969	}
				2970
				2971	int PyUnicode_Tailmatch(PyObject *str,
				2972	PyObject *substr,
				2973	int start,
				2974	int end,
				2975	int direction)
				2976	{
				2977	int result;
				2978
				2979	str = PyUnicode_FromObject(str);
				2980	if (str == NULL)
				2981	return -1;
				2982	substr = PyUnicode_FromObject(substr);
				2983	if (substr == NULL) {
				2984	Py_DECREF(substr);
				2985	return -1;
				2986	}
				2987
				2988	result = tailmatch((PyUnicodeObject *)str,
				2989	(PyUnicodeObject *)substr,
				2990	start, end, direction);
				2991	Py_DECREF(str);
				2992	Py_DECREF(substr);
				2993	return result;
				2994	}
				2995
				2996	static
				2997	const Py_UNICODE findchar(const Py_UNICODE s,
				2998	int size,
				2999	Py_UNICODE ch)
				3000	{
				3001	/* like wcschr, but doesn't stop at NULL characters */
				3002
				3003	while (size-- > 0) {
				3004	if (*s == ch)
				3005	return s;
				3006	s++;
				3007	}
				3008
				3009	return NULL;
				3010	}
				3011
				3012	/* Apply fixfct filter to the Unicode object self and return a
				3013	reference to the modified object */
				3014
				3015	static
				3016	PyObject fixup(PyUnicodeObject self,
				3017	int (fixfct)(PyUnicodeObject s))
				3018	{
				3019
				3020	PyUnicodeObject *u;
				3021
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3022	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3023	if (u == NULL)
				3024	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3025
				3026	Py_UNICODE_COPY(u->str, self->str, self->length);
				3027
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3028	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3029	/* fixfct should return TRUE if it modified the buffer. If
				3030	FALSE, return a reference to the original buffer instead
				3031	(to save space, not time) */
				3032	Py_INCREF(self);
				3033	Py_DECREF(u);
				3034	return (PyObject*) self;
				3035	}
				3036	return (PyObject*) u;
				3037	}
				3038
				3039	static
				3040	int fixupper(PyUnicodeObject *self)
				3041	{
				3042	int len = self->length;
				3043	Py_UNICODE *s = self->str;
				3044	int status = 0;
				3045
				3046	while (len-- > 0) {
				3047	register Py_UNICODE ch;
				3048
				3049	ch = Py_UNICODE_TOUPPER(*s);
				3050	if (ch != *s) {
				3051	status = 1;
				3052	*s = ch;
				3053	}
				3054	s++;
				3055	}
				3056
				3057	return status;
				3058	}
				3059
				3060	static
				3061	int fixlower(PyUnicodeObject *self)
				3062	{
				3063	int len = self->length;
				3064	Py_UNICODE *s = self->str;
				3065	int status = 0;
				3066
				3067	while (len-- > 0) {
				3068	register Py_UNICODE ch;
				3069
				3070	ch = Py_UNICODE_TOLOWER(*s);
				3071	if (ch != *s) {
				3072	status = 1;
				3073	*s = ch;
				3074	}
				3075	s++;
				3076	}
				3077
				3078	return status;
				3079	}
				3080
				3081	static
				3082	int fixswapcase(PyUnicodeObject *self)
				3083	{
				3084	int len = self->length;
				3085	Py_UNICODE *s = self->str;
				3086	int status = 0;
				3087
				3088	while (len-- > 0) {
				3089	if (Py_UNICODE_ISUPPER(*s)) {
				3090	s = Py_UNICODE_TOLOWER(s);
				3091	status = 1;
				3092	} else if (Py_UNICODE_ISLOWER(*s)) {
				3093	s = Py_UNICODE_TOUPPER(s);
				3094	status = 1;
				3095	}
				3096	s++;
				3097	}
				3098
				3099	return status;
				3100	}
				3101
				3102	static
				3103	int fixcapitalize(PyUnicodeObject *self)
				3104	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3105	int len = self->length;
				3106	Py_UNICODE *s = self->str;
				3107	int status = 0;
				3108
				3109	if (len == 0)
				3110	return 0;
				3111	if (Py_UNICODE_ISLOWER(*s)) {
				3112	s = Py_UNICODE_TOUPPER(s);
				3113	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3114	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3115	s++;
				3116	while (--len > 0) {
				3117	if (Py_UNICODE_ISUPPER(*s)) {
				3118	s = Py_UNICODE_TOLOWER(s);
				3119	status = 1;
				3120	}
				3121	s++;
				3122	}
				3123	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3124	}
				3125
				3126	static
				3127	int fixtitle(PyUnicodeObject *self)
				3128	{
				3129	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3130	register Py_UNICODE *e;
				3131	int previous_is_cased;
				3132
				3133	/* Shortcut for single character strings */
				3134	if (PyUnicode_GET_SIZE(self) == 1) {
				3135	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3136	if (*p != ch) {
				3137	*p = ch;
				3138	return 1;
				3139	}
				3140	else
				3141	return 0;
				3142	}
				3143
				3144	e = p + PyUnicode_GET_SIZE(self);
				3145	previous_is_cased = 0;
				3146	for (; p < e; p++) {
				3147	register const Py_UNICODE ch = *p;
				3148
				3149	if (previous_is_cased)
				3150	*p = Py_UNICODE_TOLOWER(ch);
				3151	else
				3152	*p = Py_UNICODE_TOTITLE(ch);
				3153
				3154	if (Py_UNICODE_ISLOWER(ch) \|\|
				3155	Py_UNICODE_ISUPPER(ch) \|\|
				3156	Py_UNICODE_ISTITLE(ch))
				3157	previous_is_cased = 1;
				3158	else
				3159	previous_is_cased = 0;
				3160	}
				3161	return 1;
				3162	}
				3163
				3164	PyObject PyUnicode_Join(PyObject separator,
				3165	PyObject *seq)
				3166	{
				3167	Py_UNICODE *sep;
				3168	int seplen;
				3169	PyUnicodeObject *res = NULL;
				3170	int reslen = 0;
				3171	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3172	int sz = 100;
				3173	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3174	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3175
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3176	it = PyObject_GetIter(seq);
				3177	if (it == NULL)
				3178	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3179
				3180	if (separator == NULL) {
				3181	Py_UNICODE blank = ' ';
				3182	sep = &blank;
				3183	seplen = 1;
				3184	}
				3185	else {
				3186	separator = PyUnicode_FromObject(separator);
				3187	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3188	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3189	sep = PyUnicode_AS_UNICODE(separator);
				3190	seplen = PyUnicode_GET_SIZE(separator);
				3191	}
				3192
				3193	res = _PyUnicode_New(sz);
				3194	if (res == NULL)
				3195	goto onError;
				3196	p = PyUnicode_AS_UNICODE(res);
				3197	reslen = 0;
				3198
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3199	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3200	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3201	PyObject *item = PyIter_Next(it);
				3202	if (item == NULL) {
				3203	if (PyErr_Occurred())
				3204	goto onError;
				3205	break;
				3206	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3207	if (!PyUnicode_Check(item)) {
				3208	PyObject *v;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3209	if (!PyString_Check(item)) {
				3210	PyErr_Format(PyExc_TypeError,
				3211	"sequence item %i: expected string or Unicode,"
				3212	" %.80s found",
				3213	i, item->ob_type->tp_name);
				3214	Py_DECREF(item);
				3215	goto onError;
				3216	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3217	v = PyUnicode_FromObject(item);
				3218	Py_DECREF(item);
				3219	item = v;
				3220	if (item == NULL)
				3221	goto onError;
				3222	}
				3223	itemlen = PyUnicode_GET_SIZE(item);
				3224	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3225	if (_PyUnicode_Resize(&res, sz*2)) {
				3226	Py_DECREF(item);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3227	goto onError;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3228	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3229	sz *= 2;
				3230	p = PyUnicode_AS_UNICODE(res) + reslen;
				3231	}
				3232	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3233	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3234	p += seplen;
				3235	reslen += seplen;
				3236	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3237	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3238	p += itemlen;
				3239	reslen += itemlen;
				3240	Py_DECREF(item);
				3241	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3242	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3243	goto onError;
				3244
				3245	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3246	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3247	return (PyObject *)res;
				3248
				3249	onError:
				3250	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3251	Py_XDECREF(res);
				3252	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3253	return NULL;
				3254	}
				3255
				3256	static
				3257	PyUnicodeObject pad(PyUnicodeObject self,
				3258	int left,
				3259	int right,
				3260	Py_UNICODE fill)
				3261	{
				3262	PyUnicodeObject *u;
				3263
				3264	if (left < 0)
				3265	left = 0;
				3266	if (right < 0)
				3267	right = 0;
				3268
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3269	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3270	Py_INCREF(self);
				3271	return self;
				3272	}
				3273
				3274	u = _PyUnicode_New(left + self->length + right);
				3275	if (u) {
				3276	if (left)
				3277	Py_UNICODE_FILL(u->str, fill, left);
				3278	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3279	if (right)
				3280	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3281	}
				3282
				3283	return u;
				3284	}
				3285
				3286	#define SPLIT_APPEND(data, left, right) \
				3287	str = PyUnicode_FromUnicode(data + left, right - left); \
				3288	if (!str) \
				3289	goto onError; \
				3290	if (PyList_Append(list, str)) { \
				3291	Py_DECREF(str); \
				3292	goto onError; \
				3293	} \
				3294	else \
				3295	Py_DECREF(str);
				3296
				3297	static
				3298	PyObject split_whitespace(PyUnicodeObject self,
				3299	PyObject *list,
				3300	int maxcount)
				3301	{
				3302	register int i;
				3303	register int j;
				3304	int len = self->length;
				3305	PyObject *str;
				3306
				3307	for (i = j = 0; i < len; ) {
				3308	/* find a token */
				3309	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3310	i++;
				3311	j = i;
				3312	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3313	i++;
				3314	if (j < i) {
				3315	if (maxcount-- <= 0)
				3316	break;
				3317	SPLIT_APPEND(self->str, j, i);
				3318	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3319	i++;
				3320	j = i;
				3321	}
				3322	}
				3323	if (j < len) {
				3324	SPLIT_APPEND(self->str, j, len);
				3325	}
				3326	return list;
				3327
				3328	onError:
				3329	Py_DECREF(list);
				3330	return NULL;
				3331	}
				3332
				3333	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3334	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3335	{
				3336	register int i;
				3337	register int j;
				3338	int len;
				3339	PyObject *list;
				3340	PyObject *str;
				3341	Py_UNICODE *data;
				3342
				3343	string = PyUnicode_FromObject(string);
				3344	if (string == NULL)
				3345	return NULL;
				3346	data = PyUnicode_AS_UNICODE(string);
				3347	len = PyUnicode_GET_SIZE(string);
				3348
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3349	list = PyList_New(0);
				3350	if (!list)
				3351	goto onError;
				3352
				3353	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3354	int eol;
				3355
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3356	/* Find a line and append it */
				3357	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3358	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3359
				3360	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3361	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3362	if (i < len) {
				3363	if (data[i] == '\r' && i + 1 < len &&
				3364	data[i+1] == '\n')
				3365	i += 2;
				3366	else
				3367	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3368	if (keepends)
				3369	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3370	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3371	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3372	j = i;
				3373	}
				3374	if (j < len) {
				3375	SPLIT_APPEND(data, j, len);
				3376	}
				3377
				3378	Py_DECREF(string);
				3379	return list;
				3380
				3381	onError:
				3382	Py_DECREF(list);
				3383	Py_DECREF(string);
				3384	return NULL;
				3385	}
				3386
				3387	static
				3388	PyObject split_char(PyUnicodeObject self,
				3389	PyObject *list,
				3390	Py_UNICODE ch,
				3391	int maxcount)
				3392	{
				3393	register int i;
				3394	register int j;
				3395	int len = self->length;
				3396	PyObject *str;
				3397
				3398	for (i = j = 0; i < len; ) {
				3399	if (self->str[i] == ch) {
				3400	if (maxcount-- <= 0)
				3401	break;
				3402	SPLIT_APPEND(self->str, j, i);
				3403	i = j = i + 1;
				3404	} else
				3405	i++;
				3406	}
				3407	if (j <= len) {
				3408	SPLIT_APPEND(self->str, j, len);
				3409	}
				3410	return list;
				3411
				3412	onError:
				3413	Py_DECREF(list);
				3414	return NULL;
				3415	}
				3416
				3417	static
				3418	PyObject split_substring(PyUnicodeObject self,
				3419	PyObject *list,
				3420	PyUnicodeObject *substring,
				3421	int maxcount)
				3422	{
				3423	register int i;
				3424	register int j;
				3425	int len = self->length;
				3426	int sublen = substring->length;
				3427	PyObject *str;
				3428
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3429	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3430	if (Py_UNICODE_MATCH(self, i, substring)) {
				3431	if (maxcount-- <= 0)
				3432	break;
				3433	SPLIT_APPEND(self->str, j, i);
				3434	i = j = i + sublen;
				3435	} else
				3436	i++;
				3437	}
				3438	if (j <= len) {
				3439	SPLIT_APPEND(self->str, j, len);
				3440	}
				3441	return list;
				3442
				3443	onError:
				3444	Py_DECREF(list);
				3445	return NULL;
				3446	}
				3447
				3448	#undef SPLIT_APPEND
				3449
				3450	static
				3451	PyObject split(PyUnicodeObject self,
				3452	PyUnicodeObject *substring,
				3453	int maxcount)
				3454	{
				3455	PyObject *list;
				3456
				3457	if (maxcount < 0)
				3458	maxcount = INT_MAX;
				3459
				3460	list = PyList_New(0);
				3461	if (!list)
				3462	return NULL;
				3463
				3464	if (substring == NULL)
				3465	return split_whitespace(self,list,maxcount);
				3466
				3467	else if (substring->length == 1)
				3468	return split_char(self,list,substring->str[0],maxcount);
				3469
				3470	else if (substring->length == 0) {
				3471	Py_DECREF(list);
				3472	PyErr_SetString(PyExc_ValueError, "empty separator");
				3473	return NULL;
				3474	}
				3475	else
				3476	return split_substring(self,list,substring,maxcount);
				3477	}
				3478
				3479	static
				3480	PyObject strip(PyUnicodeObject self,
				3481	int left,
				3482	int right)
				3483	{
				3484	Py_UNICODE *p = self->str;
				3485	int start = 0;
				3486	int end = self->length;
				3487
				3488	if (left)
				3489	while (start < end && Py_UNICODE_ISSPACE(p[start]))
				3490	start++;
				3491
				3492	if (right)
				3493	while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
				3494	end--;
				3495
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3496	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3497	/* couldn't strip anything off, return original string */
				3498	Py_INCREF(self);
				3499	return (PyObject*) self;
				3500	}
				3501
				3502	return (PyObject*) PyUnicode_FromUnicode(
				3503	self->str + start,
				3504	end - start
				3505	);
				3506	}
				3507
				3508	static
				3509	PyObject replace(PyUnicodeObject self,
				3510	PyUnicodeObject *str1,
				3511	PyUnicodeObject *str2,
				3512	int maxcount)
				3513	{
				3514	PyUnicodeObject *u;
				3515
				3516	if (maxcount < 0)
				3517	maxcount = INT_MAX;
				3518
				3519	if (str1->length == 1 && str2->length == 1) {
				3520	int i;
				3521
				3522	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3523	if (!findchar(self->str, self->length, str1->str[0]) &&
				3524	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3525	/* nothing to replace, return original string */
				3526	Py_INCREF(self);
				3527	u = self;
				3528	} else {
				3529	Py_UNICODE u1 = str1->str[0];
				3530	Py_UNICODE u2 = str2->str[0];
				3531
				3532	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3533	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3534	self->length
				3535	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3536	if (u != NULL) {
				3537	Py_UNICODE_COPY(u->str, self->str,
				3538	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3539	for (i = 0; i < u->length; i++)
				3540	if (u->str[i] == u1) {
				3541	if (--maxcount < 0)
				3542	break;
				3543	u->str[i] = u2;
				3544	}
				3545	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3546	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3547
				3548	} else {
				3549	int n, i;
				3550	Py_UNICODE *p;
				3551
				3552	/* replace strings */
				3553	n = count(self, 0, self->length, str1);
				3554	if (n > maxcount)
				3555	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3556	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3557	/* nothing to replace, return original string */
				3558	Py_INCREF(self);
				3559	u = self;
				3560	} else {
				3561	u = _PyUnicode_New(
				3562	self->length + n * (str2->length - str1->length));
				3563	if (u) {
				3564	i = 0;
				3565	p = u->str;
				3566	while (i <= self->length - str1->length)
				3567	if (Py_UNICODE_MATCH(self, i, str1)) {
				3568	/* replace string segment */
				3569	Py_UNICODE_COPY(p, str2->str, str2->length);
				3570	p += str2->length;
				3571	i += str1->length;
				3572	if (--n <= 0) {
				3573	/* copy remaining part */
				3574	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3575	break;
				3576	}
				3577	} else
				3578	*p++ = self->str[i++];
				3579	}
				3580	}
				3581	}
				3582
				3583	return (PyObject *) u;
				3584	}
				3585
				3586	/* --- Unicode Object Methods --------------------------------------------- */
				3587
				3588	static char title__doc__[] =
				3589	"S.title() -> unicode\n\
				3590	\n\
				3591	Return a titlecased version of S, i.e. words start with title case\n\
				3592	characters, all remaining cased characters have lower case.";
				3593
				3594	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3595	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3596	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3597	return fixup(self, fixtitle);
				3598	}
				3599
				3600	static char capitalize__doc__[] =
				3601	"S.capitalize() -> unicode\n\
				3602	\n\
				3603	Return a capitalized version of S, i.e. make the first character\n\
				3604	have upper case.";
				3605
				3606	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3607	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3608	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3609	return fixup(self, fixcapitalize);
				3610	}
				3611
				3612	#if 0
				3613	static char capwords__doc__[] =
				3614	"S.capwords() -> unicode\n\
				3615	\n\
				3616	Apply .capitalize() to all words in S and return the result with\n\
				3617	normalized whitespace (all whitespace strings are replaced by ' ').";
				3618
				3619	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3620	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3621	{
				3622	PyObject *list;
				3623	PyObject *item;
				3624	int i;
				3625
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3626	/* Split into words */
				3627	list = split(self, NULL, -1);
				3628	if (!list)
				3629	return NULL;
				3630
				3631	/* Capitalize each word */
				3632	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3633	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3634	fixcapitalize);
				3635	if (item == NULL)
				3636	goto onError;
				3637	Py_DECREF(PyList_GET_ITEM(list, i));
				3638	PyList_SET_ITEM(list, i, item);
				3639	}
				3640
				3641	/* Join the words to form a new string */
				3642	item = PyUnicode_Join(NULL, list);
				3643
				3644	onError:
				3645	Py_DECREF(list);
				3646	return (PyObject *)item;
				3647	}
				3648	#endif
				3649
				3650	static char center__doc__[] =
				3651	"S.center(width) -> unicode\n\
				3652	\n\
				3653	Return S centered in a Unicode string of length width. Padding is done\n\
				3654	using spaces.";
				3655
				3656	static PyObject *
				3657	unicode_center(PyUnicodeObject self, PyObject args)
				3658	{
				3659	int marg, left;
				3660	int width;
				3661
				3662	if (!PyArg_ParseTuple(args, "i:center", &width))
				3663	return NULL;
				3664
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3665	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3666	Py_INCREF(self);
				3667	return (PyObject*) self;
				3668	}
				3669
				3670	marg = width - self->length;
				3671	left = marg / 2 + (marg & width & 1);
				3672
				3673	return (PyObject*) pad(self, left, marg - left, ' ');
				3674	}
				3675
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3676	#if 0
				3677
				3678	/* This code should go into some future Unicode collation support
				3679	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3680	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3681
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3682	/* speedy UTF-16 code point order comparison */
				3683	/* gleaned from: */
				3684	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3685
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3686	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3687	{
				3688	0, 0, 0, 0, 0, 0, 0, 0,
				3689	0, 0, 0, 0, 0, 0, 0, 0,
				3690	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3691	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3692	};
				3693
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3694	static int
				3695	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3696	{
				3697	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3698
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3699	Py_UNICODE *s1 = str1->str;
				3700	Py_UNICODE *s2 = str2->str;
				3701
				3702	len1 = str1->length;
				3703	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3704
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3705	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3706	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3707
				3708	c1 = *s1++;
				3709	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3710
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3711	if (c1 > (1<<11) * 26)
				3712	c1 += utf16Fixup[c1>>11];
				3713	if (c2 > (1<<11) * 26)
				3714	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3715	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3716
				3717	if (c1 != c2)
				3718	return (c1 < c2) ? -1 : 1;
				3719
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3720	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3721	}
				3722
				3723	return (len1 < len2) ? -1 : (len1 != len2);
				3724	}
				3725
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3726	#else
				3727
				3728	static int
				3729	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3730	{
				3731	register int len1, len2;
				3732
				3733	Py_UNICODE *s1 = str1->str;
				3734	Py_UNICODE *s2 = str2->str;
				3735
				3736	len1 = str1->length;
				3737	len2 = str2->length;
				3738
				3739	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3740	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3741
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3742	c1 = *s1++;
				3743	c2 = *s2++;
				3744
				3745	if (c1 != c2)
				3746	return (c1 < c2) ? -1 : 1;
				3747
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3748	len1--; len2--;
				3749	}
				3750
				3751	return (len1 < len2) ? -1 : (len1 != len2);
				3752	}
				3753
				3754	#endif
				3755
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3756	int PyUnicode_Compare(PyObject *left,
				3757	PyObject *right)
				3758	{
				3759	PyUnicodeObject u = NULL, v = NULL;
				3760	int result;
				3761
				3762	/* Coerce the two arguments */
				3763	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3764	if (u == NULL)
				3765	goto onError;
				3766	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3767	if (v == NULL)
				3768	goto onError;
				3769
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3770	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3771	if (v == u) {
				3772	Py_DECREF(u);
				3773	Py_DECREF(v);
				3774	return 0;
				3775	}
				3776
				3777	result = unicode_compare(u, v);
				3778
				3779	Py_DECREF(u);
				3780	Py_DECREF(v);
				3781	return result;
				3782
				3783	onError:
				3784	Py_XDECREF(u);
				3785	Py_XDECREF(v);
				3786	return -1;
				3787	}
				3788
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3789	int PyUnicode_Contains(PyObject *container,
				3790	PyObject *element)
				3791	{
				3792	PyUnicodeObject u = NULL, v = NULL;
				3793	int result;
				3794	register const Py_UNICODE p, e;
				3795	register Py_UNICODE ch;
				3796
				3797	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3798	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3799	if (v == NULL) {
				3800	PyErr_SetString(PyExc_TypeError,
				3801	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3802	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3803	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3804	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3805	if (u == NULL) {
				3806	Py_DECREF(v);
				3807	goto onError;
				3808	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3809
				3810	/* Check v in u */
				3811	if (PyUnicode_GET_SIZE(v) != 1) {
				3812	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3813	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3814	goto onError;
				3815	}
				3816	ch = *PyUnicode_AS_UNICODE(v);
				3817	p = PyUnicode_AS_UNICODE(u);
				3818	e = p + PyUnicode_GET_SIZE(u);
				3819	result = 0;
				3820	while (p < e) {
				3821	if (*p++ == ch) {
				3822	result = 1;
				3823	break;
				3824	}
				3825	}
				3826
				3827	Py_DECREF(u);
				3828	Py_DECREF(v);
				3829	return result;
				3830
				3831	onError:
				3832	Py_XDECREF(u);
				3833	Py_XDECREF(v);
				3834	return -1;
				3835	}
				3836
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3837	/* Concat to string or Unicode object giving a new Unicode object. */
				3838
				3839	PyObject PyUnicode_Concat(PyObject left,
				3840	PyObject *right)
				3841	{
				3842	PyUnicodeObject u = NULL, v = NULL, *w;
				3843
				3844	/* Coerce the two arguments */
				3845	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3846	if (u == NULL)
				3847	goto onError;
				3848	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3849	if (v == NULL)
				3850	goto onError;
				3851
				3852	/* Shortcuts */
				3853	if (v == unicode_empty) {
				3854	Py_DECREF(v);
				3855	return (PyObject *)u;
				3856	}
				3857	if (u == unicode_empty) {
				3858	Py_DECREF(u);
				3859	return (PyObject *)v;
				3860	}
				3861
				3862	/* Concat the two Unicode strings */
				3863	w = _PyUnicode_New(u->length + v->length);
				3864	if (w == NULL)
				3865	goto onError;
				3866	Py_UNICODE_COPY(w->str, u->str, u->length);
				3867	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3868
				3869	Py_DECREF(u);
				3870	Py_DECREF(v);
				3871	return (PyObject *)w;
				3872
				3873	onError:
				3874	Py_XDECREF(u);
				3875	Py_XDECREF(v);
				3876	return NULL;
				3877	}
				3878
				3879	static char count__doc__[] =
				3880	"S.count(sub[, start[, end]]) -> int\n\
				3881	\n\
				3882	Return the number of occurrences of substring sub in Unicode string\n\
				3883	S[start:end]. Optional arguments start and end are\n\
				3884	interpreted as in slice notation.";
				3885
				3886	static PyObject *
				3887	unicode_count(PyUnicodeObject self, PyObject args)
				3888	{
				3889	PyUnicodeObject *substring;
				3890	int start = 0;
				3891	int end = INT_MAX;
				3892	PyObject *result;
				3893
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3894	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3895	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3896	return NULL;
				3897
				3898	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3899	(PyObject *)substring);
				3900	if (substring == NULL)
				3901	return NULL;
				3902
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3903	if (start < 0)
				3904	start += self->length;
				3905	if (start < 0)
				3906	start = 0;
				3907	if (end > self->length)
				3908	end = self->length;
				3909	if (end < 0)
				3910	end += self->length;
				3911	if (end < 0)
				3912	end = 0;
				3913
				3914	result = PyInt_FromLong((long) count(self, start, end, substring));
				3915
				3916	Py_DECREF(substring);
				3917	return result;
				3918	}
				3919
				3920	static char encode__doc__[] =
				3921	"S.encode([encoding[,errors]]) -> string\n\
				3922	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3923	Return an encoded string version of S. Default encoding is the current\n\
				3924	default string encoding. errors may be given to set a different error\n\
				3925	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3926	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3927
				3928	static PyObject *
				3929	unicode_encode(PyUnicodeObject self, PyObject args)
				3930	{
				3931	char *encoding = NULL;
				3932	char *errors = NULL;
				3933	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3934	return NULL;
				3935	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3936	}
				3937
				3938	static char expandtabs__doc__[] =
				3939	"S.expandtabs([tabsize]) -> unicode\n\
				3940	\n\
				3941	Return a copy of S where all tab characters are expanded using spaces.\n\
				3942	If tabsize is not given, a tab size of 8 characters is assumed.";
				3943
				3944	static PyObject*
				3945	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3946	{
				3947	Py_UNICODE *e;
				3948	Py_UNICODE *p;
				3949	Py_UNICODE *q;
				3950	int i, j;
				3951	PyUnicodeObject *u;
				3952	int tabsize = 8;
				3953
				3954	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3955	return NULL;
				3956
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3957	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3958	i = j = 0;
				3959	e = self->str + self->length;
				3960	for (p = self->str; p < e; p++)
				3961	if (*p == '\t') {
				3962	if (tabsize > 0)
				3963	j += tabsize - (j % tabsize);
				3964	}
				3965	else {
				3966	j++;
				3967	if (p == '\n' \|\| p == '\r') {
				3968	i += j;
				3969	j = 0;
				3970	}
				3971	}
				3972
				3973	/* Second pass: create output string and fill it */
				3974	u = _PyUnicode_New(i + j);
				3975	if (!u)
				3976	return NULL;
				3977
				3978	j = 0;
				3979	q = u->str;
				3980
				3981	for (p = self->str; p < e; p++)
				3982	if (*p == '\t') {
				3983	if (tabsize > 0) {
				3984	i = tabsize - (j % tabsize);
				3985	j += i;
				3986	while (i--)
				3987	*q++ = ' ';
				3988	}
				3989	}
				3990	else {
				3991	j++;
				3992	q++ = p;
				3993	if (p == '\n' \|\| p == '\r')
				3994	j = 0;
				3995	}
				3996
				3997	return (PyObject*) u;
				3998	}
				3999
				4000	static char find__doc__[] =
				4001	"S.find(sub [,start [,end]]) -> int\n\
				4002	\n\
				4003	Return the lowest index in S where substring sub is found,\n\
				4004	such that sub is contained within s[start,end]. Optional\n\
				4005	arguments start and end are interpreted as in slice notation.\n\
				4006	\n\
				4007	Return -1 on failure.";
				4008
				4009	static PyObject *
				4010	unicode_find(PyUnicodeObject self, PyObject args)
				4011	{
				4012	PyUnicodeObject *substring;
				4013	int start = 0;
				4014	int end = INT_MAX;
				4015	PyObject *result;
				4016
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4017	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				4018	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4019	return NULL;
				4020	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4021	(PyObject *)substring);
				4022	if (substring == NULL)
				4023	return NULL;
				4024
				4025	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				4026
				4027	Py_DECREF(substring);
				4028	return result;
				4029	}
				4030
				4031	static PyObject *
				4032	unicode_getitem(PyUnicodeObject *self, int index)
				4033	{
				4034	if (index < 0 \|\| index >= self->length) {
				4035	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4036	return NULL;
				4037	}
				4038
				4039	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4040	}
				4041
				4042	static long
				4043	unicode_hash(PyUnicodeObject *self)
				4044	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4045	/* Since Unicode objects compare equal to their ASCII string
				4046	counterparts, they should use the individual character values
				4047	as basis for their hash value. This is needed to assure that
				4048	strings and Unicode objects behave in the same way as
				4049	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4050
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4051	register int len;
				4052	register Py_UNICODE *p;
				4053	register long x;
				4054
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4055	if (self->hash != -1)
				4056	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4057	len = PyUnicode_GET_SIZE(self);
				4058	p = PyUnicode_AS_UNICODE(self);
				4059	x = *p << 7;
				4060	while (--len >= 0)
				4061	x = (1000003x) ^ p++;
				4062	x ^= PyUnicode_GET_SIZE(self);
				4063	if (x == -1)
				4064	x = -2;
				4065	self->hash = x;
				4066	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4067	}
				4068
				4069	static char index__doc__[] =
				4070	"S.index(sub [,start [,end]]) -> int\n\
				4071	\n\
				4072	Like S.find() but raise ValueError when the substring is not found.";
				4073
				4074	static PyObject *
				4075	unicode_index(PyUnicodeObject self, PyObject args)
				4076	{
				4077	int result;
				4078	PyUnicodeObject *substring;
				4079	int start = 0;
				4080	int end = INT_MAX;
				4081
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4082	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				4083	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4084	return NULL;
				4085
				4086	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4087	(PyObject *)substring);
				4088	if (substring == NULL)
				4089	return NULL;
				4090
				4091	result = findstring(self, substring, start, end, 1);
				4092
				4093	Py_DECREF(substring);
				4094	if (result < 0) {
				4095	PyErr_SetString(PyExc_ValueError, "substring not found");
				4096	return NULL;
				4097	}
				4098	return PyInt_FromLong(result);
				4099	}
				4100
				4101	static char islower__doc__[] =
				4102	"S.islower() -> int\n\
				4103	\n\
				4104	Return 1 if all cased characters in S are lowercase and there is\n\
				4105	at least one cased character in S, 0 otherwise.";
				4106
				4107	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4108	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4109	{
				4110	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4111	register const Py_UNICODE *e;
				4112	int cased;
				4113
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4114	/* Shortcut for single character strings */
				4115	if (PyUnicode_GET_SIZE(self) == 1)
				4116	return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
				4117
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4118	/* Special case for empty strings */
				4119	if (PyString_GET_SIZE(self) == 0)
				4120	return PyInt_FromLong(0);
				4121
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4122	e = p + PyUnicode_GET_SIZE(self);
				4123	cased = 0;
				4124	for (; p < e; p++) {
				4125	register const Py_UNICODE ch = *p;
				4126
				4127	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4128	return PyInt_FromLong(0);
				4129	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4130	cased = 1;
				4131	}
				4132	return PyInt_FromLong(cased);
				4133	}
				4134
				4135	static char isupper__doc__[] =
				4136	"S.isupper() -> int\n\
				4137	\n\
				4138	Return 1 if all cased characters in S are uppercase and there is\n\
				4139	at least one cased character in S, 0 otherwise.";
				4140
				4141	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4142	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4143	{
				4144	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4145	register const Py_UNICODE *e;
				4146	int cased;
				4147
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4148	/* Shortcut for single character strings */
				4149	if (PyUnicode_GET_SIZE(self) == 1)
				4150	return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
				4151
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4152	/* Special case for empty strings */
				4153	if (PyString_GET_SIZE(self) == 0)
				4154	return PyInt_FromLong(0);
				4155
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4156	e = p + PyUnicode_GET_SIZE(self);
				4157	cased = 0;
				4158	for (; p < e; p++) {
				4159	register const Py_UNICODE ch = *p;
				4160
				4161	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
				4162	return PyInt_FromLong(0);
				4163	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4164	cased = 1;
				4165	}
				4166	return PyInt_FromLong(cased);
				4167	}
				4168
				4169	static char istitle__doc__[] =
				4170	"S.istitle() -> int\n\
				4171	\n\
				4172	Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
				4173	may only follow uncased characters and lowercase characters only cased\n\
				4174	ones. Return 0 otherwise.";
				4175
				4176	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4177	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4178	{
				4179	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4180	register const Py_UNICODE *e;
				4181	int cased, previous_is_cased;
				4182
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4183	/* Shortcut for single character strings */
				4184	if (PyUnicode_GET_SIZE(self) == 1)
				4185	return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4186	(Py_UNICODE_ISUPPER(*p) != 0));
				4187
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4188	/* Special case for empty strings */
				4189	if (PyString_GET_SIZE(self) == 0)
				4190	return PyInt_FromLong(0);
				4191
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4192	e = p + PyUnicode_GET_SIZE(self);
				4193	cased = 0;
				4194	previous_is_cased = 0;
				4195	for (; p < e; p++) {
				4196	register const Py_UNICODE ch = *p;
				4197
				4198	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4199	if (previous_is_cased)
				4200	return PyInt_FromLong(0);
				4201	previous_is_cased = 1;
				4202	cased = 1;
				4203	}
				4204	else if (Py_UNICODE_ISLOWER(ch)) {
				4205	if (!previous_is_cased)
				4206	return PyInt_FromLong(0);
				4207	previous_is_cased = 1;
				4208	cased = 1;
				4209	}
				4210	else
				4211	previous_is_cased = 0;
				4212	}
				4213	return PyInt_FromLong(cased);
				4214	}
				4215
				4216	static char isspace__doc__[] =
				4217	"S.isspace() -> int\n\
				4218	\n\
				4219	Return 1 if there are only whitespace characters in S,\n\
				4220	0 otherwise.";
				4221
				4222	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4223	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4224	{
				4225	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4226	register const Py_UNICODE *e;
				4227
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4228	/* Shortcut for single character strings */
				4229	if (PyUnicode_GET_SIZE(self) == 1 &&
				4230	Py_UNICODE_ISSPACE(*p))
				4231	return PyInt_FromLong(1);
				4232
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4233	/* Special case for empty strings */
				4234	if (PyString_GET_SIZE(self) == 0)
				4235	return PyInt_FromLong(0);
				4236
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4237	e = p + PyUnicode_GET_SIZE(self);
				4238	for (; p < e; p++) {
				4239	if (!Py_UNICODE_ISSPACE(*p))
				4240	return PyInt_FromLong(0);
				4241	}
				4242	return PyInt_FromLong(1);
				4243	}
				4244
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4245	static char isalpha__doc__[] =
				4246	"S.isalpha() -> int\n\
				4247	\n\
				4248	Return 1 if all characters in S are alphabetic\n\
				4249	and there is at least one character in S, 0 otherwise.";
				4250
				4251	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4252	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4253	{
				4254	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4255	register const Py_UNICODE *e;
				4256
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4257	/* Shortcut for single character strings */
				4258	if (PyUnicode_GET_SIZE(self) == 1 &&
				4259	Py_UNICODE_ISALPHA(*p))
				4260	return PyInt_FromLong(1);
				4261
				4262	/* Special case for empty strings */
				4263	if (PyString_GET_SIZE(self) == 0)
				4264	return PyInt_FromLong(0);
				4265
				4266	e = p + PyUnicode_GET_SIZE(self);
				4267	for (; p < e; p++) {
				4268	if (!Py_UNICODE_ISALPHA(*p))
				4269	return PyInt_FromLong(0);
				4270	}
				4271	return PyInt_FromLong(1);
				4272	}
				4273
				4274	static char isalnum__doc__[] =
				4275	"S.isalnum() -> int\n\
				4276	\n\
				4277	Return 1 if all characters in S are alphanumeric\n\
				4278	and there is at least one character in S, 0 otherwise.";
				4279
				4280	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4281	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4282	{
				4283	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4284	register const Py_UNICODE *e;
				4285
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4286	/* Shortcut for single character strings */
				4287	if (PyUnicode_GET_SIZE(self) == 1 &&
				4288	Py_UNICODE_ISALNUM(*p))
				4289	return PyInt_FromLong(1);
				4290
				4291	/* Special case for empty strings */
				4292	if (PyString_GET_SIZE(self) == 0)
				4293	return PyInt_FromLong(0);
				4294
				4295	e = p + PyUnicode_GET_SIZE(self);
				4296	for (; p < e; p++) {
				4297	if (!Py_UNICODE_ISALNUM(*p))
				4298	return PyInt_FromLong(0);
				4299	}
				4300	return PyInt_FromLong(1);
				4301	}
				4302
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4303	static char isdecimal__doc__[] =
				4304	"S.isdecimal() -> int\n\
				4305	\n\
				4306	Return 1 if there are only decimal characters in S,\n\
				4307	0 otherwise.";
				4308
				4309	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4310	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4311	{
				4312	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4313	register const Py_UNICODE *e;
				4314
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4315	/* Shortcut for single character strings */
				4316	if (PyUnicode_GET_SIZE(self) == 1 &&
				4317	Py_UNICODE_ISDECIMAL(*p))
				4318	return PyInt_FromLong(1);
				4319
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4320	/* Special case for empty strings */
				4321	if (PyString_GET_SIZE(self) == 0)
				4322	return PyInt_FromLong(0);
				4323
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4324	e = p + PyUnicode_GET_SIZE(self);
				4325	for (; p < e; p++) {
				4326	if (!Py_UNICODE_ISDECIMAL(*p))
				4327	return PyInt_FromLong(0);
				4328	}
				4329	return PyInt_FromLong(1);
				4330	}
				4331
				4332	static char isdigit__doc__[] =
				4333	"S.isdigit() -> int\n\
				4334	\n\
				4335	Return 1 if there are only digit characters in S,\n\
				4336	0 otherwise.";
				4337
				4338	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4339	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4340	{
				4341	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4342	register const Py_UNICODE *e;
				4343
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4344	/* Shortcut for single character strings */
				4345	if (PyUnicode_GET_SIZE(self) == 1 &&
				4346	Py_UNICODE_ISDIGIT(*p))
				4347	return PyInt_FromLong(1);
				4348
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4349	/* Special case for empty strings */
				4350	if (PyString_GET_SIZE(self) == 0)
				4351	return PyInt_FromLong(0);
				4352
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4353	e = p + PyUnicode_GET_SIZE(self);
				4354	for (; p < e; p++) {
				4355	if (!Py_UNICODE_ISDIGIT(*p))
				4356	return PyInt_FromLong(0);
				4357	}
				4358	return PyInt_FromLong(1);
				4359	}
				4360
				4361	static char isnumeric__doc__[] =
				4362	"S.isnumeric() -> int\n\
				4363	\n\
				4364	Return 1 if there are only numeric characters in S,\n\
				4365	0 otherwise.";
				4366
				4367	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4368	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4369	{
				4370	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4371	register const Py_UNICODE *e;
				4372
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4373	/* Shortcut for single character strings */
				4374	if (PyUnicode_GET_SIZE(self) == 1 &&
				4375	Py_UNICODE_ISNUMERIC(*p))
				4376	return PyInt_FromLong(1);
				4377
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4378	/* Special case for empty strings */
				4379	if (PyString_GET_SIZE(self) == 0)
				4380	return PyInt_FromLong(0);
				4381
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4382	e = p + PyUnicode_GET_SIZE(self);
				4383	for (; p < e; p++) {
				4384	if (!Py_UNICODE_ISNUMERIC(*p))
				4385	return PyInt_FromLong(0);
				4386	}
				4387	return PyInt_FromLong(1);
				4388	}
				4389
				4390	static char join__doc__[] =
				4391	"S.join(sequence) -> unicode\n\
				4392	\n\
				4393	Return a string which is the concatenation of the strings in the\n\
				4394	sequence. The separator between elements is S.";
				4395
				4396	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4397	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4398	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4399	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4400	}
				4401
				4402	static int
				4403	unicode_length(PyUnicodeObject *self)
				4404	{
				4405	return self->length;
				4406	}
				4407
				4408	static char ljust__doc__[] =
				4409	"S.ljust(width) -> unicode\n\
				4410	\n\
				4411	Return S left justified in a Unicode string of length width. Padding is\n\
				4412	done using spaces.";
				4413
				4414	static PyObject *
				4415	unicode_ljust(PyUnicodeObject self, PyObject args)
				4416	{
				4417	int width;
				4418	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4419	return NULL;
				4420
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4421	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4422	Py_INCREF(self);
				4423	return (PyObject*) self;
				4424	}
				4425
				4426	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4427	}
				4428
				4429	static char lower__doc__[] =
				4430	"S.lower() -> unicode\n\
				4431	\n\
				4432	Return a copy of the string S converted to lowercase.";
				4433
				4434	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4435	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4436	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4437	return fixup(self, fixlower);
				4438	}
				4439
				4440	static char lstrip__doc__[] =
				4441	"S.lstrip() -> unicode\n\
				4442	\n\
				4443	Return a copy of the string S with leading whitespace removed.";
				4444
				4445	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4446	unicode_lstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4447	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4448	return strip(self, 1, 0);
				4449	}
				4450
				4451	static PyObject*
				4452	unicode_repeat(PyUnicodeObject *str, int len)
				4453	{
				4454	PyUnicodeObject *u;
				4455	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4456	int nchars;
				4457	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4458
				4459	if (len < 0)
				4460	len = 0;
				4461
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4462	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4463	/* no repeat, return original string */
				4464	Py_INCREF(str);
				4465	return (PyObject*) str;
				4466	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4467
				4468	/* ensure # of chars needed doesn't overflow int and # of bytes
				4469	* needed doesn't overflow size_t
				4470	*/
				4471	nchars = len * str->length;
				4472	if (len && nchars / len != str->length) {
				4473	PyErr_SetString(PyExc_OverflowError,
				4474	"repeated string is too long");
				4475	return NULL;
				4476	}
				4477	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4478	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4479	PyErr_SetString(PyExc_OverflowError,
				4480	"repeated string is too long");
				4481	return NULL;
				4482	}
				4483	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4484	if (!u)
				4485	return NULL;
				4486
				4487	p = u->str;
				4488
				4489	while (len-- > 0) {
				4490	Py_UNICODE_COPY(p, str->str, str->length);
				4491	p += str->length;
				4492	}
				4493
				4494	return (PyObject*) u;
				4495	}
				4496
				4497	PyObject PyUnicode_Replace(PyObject obj,
				4498	PyObject *subobj,
				4499	PyObject *replobj,
				4500	int maxcount)
				4501	{
				4502	PyObject *self;
				4503	PyObject *str1;
				4504	PyObject *str2;
				4505	PyObject *result;
				4506
				4507	self = PyUnicode_FromObject(obj);
				4508	if (self == NULL)
				4509	return NULL;
				4510	str1 = PyUnicode_FromObject(subobj);
				4511	if (str1 == NULL) {
				4512	Py_DECREF(self);
				4513	return NULL;
				4514	}
				4515	str2 = PyUnicode_FromObject(replobj);
				4516	if (str2 == NULL) {
				4517	Py_DECREF(self);
				4518	Py_DECREF(str1);
				4519	return NULL;
				4520	}
				4521	result = replace((PyUnicodeObject *)self,
				4522	(PyUnicodeObject *)str1,
				4523	(PyUnicodeObject *)str2,
				4524	maxcount);
				4525	Py_DECREF(self);
				4526	Py_DECREF(str1);
				4527	Py_DECREF(str2);
				4528	return result;
				4529	}
				4530
				4531	static char replace__doc__[] =
				4532	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4533	\n\
				4534	Return a copy of S with all occurrences of substring\n\
				4535	old replaced by new. If the optional argument maxsplit is\n\
				4536	given, only the first maxsplit occurrences are replaced.";
				4537
				4538	static PyObject*
				4539	unicode_replace(PyUnicodeObject self, PyObject args)
				4540	{
				4541	PyUnicodeObject *str1;
				4542	PyUnicodeObject *str2;
				4543	int maxcount = -1;
				4544	PyObject *result;
				4545
				4546	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4547	return NULL;
				4548	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4549	if (str1 == NULL)
				4550	return NULL;
				4551	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4552	if (str2 == NULL)
				4553	return NULL;
				4554
				4555	result = replace(self, str1, str2, maxcount);
				4556
				4557	Py_DECREF(str1);
				4558	Py_DECREF(str2);
				4559	return result;
				4560	}
				4561
				4562	static
				4563	PyObject unicode_repr(PyObject unicode)
				4564	{
				4565	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4566	PyUnicode_GET_SIZE(unicode),
				4567	1);
				4568	}
				4569
				4570	static char rfind__doc__[] =
				4571	"S.rfind(sub [,start [,end]]) -> int\n\
				4572	\n\
				4573	Return the highest index in S where substring sub is found,\n\
				4574	such that sub is contained within s[start,end]. Optional\n\
				4575	arguments start and end are interpreted as in slice notation.\n\
				4576	\n\
				4577	Return -1 on failure.";
				4578
				4579	static PyObject *
				4580	unicode_rfind(PyUnicodeObject self, PyObject args)
				4581	{
				4582	PyUnicodeObject *substring;
				4583	int start = 0;
				4584	int end = INT_MAX;
				4585	PyObject *result;
				4586
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4587	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4588	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4589	return NULL;
				4590	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4591	(PyObject *)substring);
				4592	if (substring == NULL)
				4593	return NULL;
				4594
				4595	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4596
				4597	Py_DECREF(substring);
				4598	return result;
				4599	}
				4600
				4601	static char rindex__doc__[] =
				4602	"S.rindex(sub [,start [,end]]) -> int\n\
				4603	\n\
				4604	Like S.rfind() but raise ValueError when the substring is not found.";
				4605
				4606	static PyObject *
				4607	unicode_rindex(PyUnicodeObject self, PyObject args)
				4608	{
				4609	int result;
				4610	PyUnicodeObject *substring;
				4611	int start = 0;
				4612	int end = INT_MAX;
				4613
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4614	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4615	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4616	return NULL;
				4617	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4618	(PyObject *)substring);
				4619	if (substring == NULL)
				4620	return NULL;
				4621
				4622	result = findstring(self, substring, start, end, -1);
				4623
				4624	Py_DECREF(substring);
				4625	if (result < 0) {
				4626	PyErr_SetString(PyExc_ValueError, "substring not found");
				4627	return NULL;
				4628	}
				4629	return PyInt_FromLong(result);
				4630	}
				4631
				4632	static char rjust__doc__[] =
				4633	"S.rjust(width) -> unicode\n\
				4634	\n\
				4635	Return S right justified in a Unicode string of length width. Padding is\n\
				4636	done using spaces.";
				4637
				4638	static PyObject *
				4639	unicode_rjust(PyUnicodeObject self, PyObject args)
				4640	{
				4641	int width;
				4642	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4643	return NULL;
				4644
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4645	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4646	Py_INCREF(self);
				4647	return (PyObject*) self;
				4648	}
				4649
				4650	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4651	}
				4652
				4653	static char rstrip__doc__[] =
				4654	"S.rstrip() -> unicode\n\
				4655	\n\
				4656	Return a copy of the string S with trailing whitespace removed.";
				4657
				4658	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4659	unicode_rstrip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4660	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4661	return strip(self, 0, 1);
				4662	}
				4663
				4664	static PyObject*
				4665	unicode_slice(PyUnicodeObject *self, int start, int end)
				4666	{
				4667	/* standard clamping */
				4668	if (start < 0)
				4669	start = 0;
				4670	if (end < 0)
				4671	end = 0;
				4672	if (end > self->length)
				4673	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4674	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4675	/* full slice, return original string */
				4676	Py_INCREF(self);
				4677	return (PyObject*) self;
				4678	}
				4679	if (start > end)
				4680	start = end;
				4681	/* copy slice */
				4682	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4683	end - start);
				4684	}
				4685
				4686	PyObject PyUnicode_Split(PyObject s,
				4687	PyObject *sep,
				4688	int maxsplit)
				4689	{
				4690	PyObject *result;
				4691
				4692	s = PyUnicode_FromObject(s);
				4693	if (s == NULL)
				4694	return NULL;
				4695	if (sep != NULL) {
				4696	sep = PyUnicode_FromObject(sep);
				4697	if (sep == NULL) {
				4698	Py_DECREF(s);
				4699	return NULL;
				4700	}
				4701	}
				4702
				4703	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4704
				4705	Py_DECREF(s);
				4706	Py_XDECREF(sep);
				4707	return result;
				4708	}
				4709
				4710	static char split__doc__[] =
				4711	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4712	\n\
				4713	Return a list of the words in S, using sep as the\n\
				4714	delimiter string. If maxsplit is given, at most maxsplit\n\
				4715	splits are done. If sep is not specified, any whitespace string\n\
				4716	is a separator.";
				4717
				4718	static PyObject*
				4719	unicode_split(PyUnicodeObject self, PyObject args)
				4720	{
				4721	PyObject *substring = Py_None;
				4722	int maxcount = -1;
				4723
				4724	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4725	return NULL;
				4726
				4727	if (substring == Py_None)
				4728	return split(self, NULL, maxcount);
				4729	else if (PyUnicode_Check(substring))
				4730	return split(self, (PyUnicodeObject *)substring, maxcount);
				4731	else
				4732	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4733	}
				4734
				4735	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4736	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4737	\n\
				4738	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4739	Line breaks are not included in the resulting list unless keepends\n\
				4740	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4741
				4742	static PyObject*
				4743	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4744	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4745	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4746
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4747	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4748	return NULL;
				4749
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4750	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4751	}
				4752
				4753	static
				4754	PyObject unicode_str(PyUnicodeObject self)
				4755	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4756	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4757	}
				4758
				4759	static char strip__doc__[] =
				4760	"S.strip() -> unicode\n\
				4761	\n\
				4762	Return a copy of S with leading and trailing whitespace removed.";
				4763
				4764	static PyObject *
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4765	unicode_strip(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4766	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4767	return strip(self, 1, 1);
				4768	}
				4769
				4770	static char swapcase__doc__[] =
				4771	"S.swapcase() -> unicode\n\
				4772	\n\
				4773	Return a copy of S with uppercase characters converted to lowercase\n\
				4774	and vice versa.";
				4775
				4776	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4777	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4778	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4779	return fixup(self, fixswapcase);
				4780	}
				4781
				4782	static char translate__doc__[] =
				4783	"S.translate(table) -> unicode\n\
				4784	\n\
				4785	Return a copy of the string S, where all characters have been mapped\n\
				4786	through the given translation table, which must be a mapping of\n\
				4787	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4788	are left untouched. Characters mapped to None are deleted.";
				4789
				4790	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4791	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4792	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4793	return PyUnicode_TranslateCharmap(self->str,
				4794	self->length,
				4795	table,
				4796	"ignore");
				4797	}
				4798
				4799	static char upper__doc__[] =
				4800	"S.upper() -> unicode\n\
				4801	\n\
				4802	Return a copy of S converted to uppercase.";
				4803
				4804	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4805	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4806	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4807	return fixup(self, fixupper);
				4808	}
				4809
				4810	#if 0
				4811	static char zfill__doc__[] =
				4812	"S.zfill(width) -> unicode\n\
				4813	\n\
				4814	Pad a numeric string x with zeros on the left, to fill a field\n\
				4815	of the specified width. The string x is never truncated.";
				4816
				4817	static PyObject *
				4818	unicode_zfill(PyUnicodeObject self, PyObject args)
				4819	{
				4820	int fill;
				4821	PyUnicodeObject *u;
				4822
				4823	int width;
				4824	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4825	return NULL;
				4826
				4827	if (self->length >= width) {
				4828	Py_INCREF(self);
				4829	return (PyObject*) self;
				4830	}
				4831
				4832	fill = width - self->length;
				4833
				4834	u = pad(self, fill, 0, '0');
				4835
				4836	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4837	/* move sign to beginning of string */
				4838	u->str[0] = u->str[fill];
				4839	u->str[fill] = '0';
				4840	}
				4841
				4842	return (PyObject*) u;
				4843	}
				4844	#endif
				4845
				4846	#if 0
				4847	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4848	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4849	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4850	return PyInt_FromLong(unicode_freelist_size);
				4851	}
				4852	#endif
				4853
				4854	static char startswith__doc__[] =
				4855	"S.startswith(prefix[, start[, end]]) -> int\n\
				4856	\n\
				4857	Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
				4858	optional start, test S beginning at that position. With optional end, stop\n\
				4859	comparing S at that position.";
				4860
				4861	static PyObject *
				4862	unicode_startswith(PyUnicodeObject *self,
				4863	PyObject *args)
				4864	{
				4865	PyUnicodeObject *substring;
				4866	int start = 0;
				4867	int end = INT_MAX;
				4868	PyObject *result;
				4869
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4870	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				4871	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4872	return NULL;
				4873	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4874	(PyObject *)substring);
				4875	if (substring == NULL)
				4876	return NULL;
				4877
				4878	result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
				4879
				4880	Py_DECREF(substring);
				4881	return result;
				4882	}
				4883
				4884
				4885	static char endswith__doc__[] =
				4886	"S.endswith(suffix[, start[, end]]) -> int\n\
				4887	\n\
				4888	Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
				4889	optional start, test S beginning at that position. With optional end, stop\n\
				4890	comparing S at that position.";
				4891
				4892	static PyObject *
				4893	unicode_endswith(PyUnicodeObject *self,
				4894	PyObject *args)
				4895	{
				4896	PyUnicodeObject *substring;
				4897	int start = 0;
				4898	int end = INT_MAX;
				4899	PyObject *result;
				4900
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4901	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				4902	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4903	return NULL;
				4904	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4905	(PyObject *)substring);
				4906	if (substring == NULL)
				4907	return NULL;
				4908
				4909	result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
				4910
				4911	Py_DECREF(substring);
				4912	return result;
				4913	}
				4914
				4915
				4916	static PyMethodDef unicode_methods[] = {
				4917
				4918	/* Order is according to common usage: often used methods should
				4919	appear first, since lookup is done sequentially. */
				4920
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4921	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				4922	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				4923	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				4924	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				4925	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				4926	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				4927	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				4928	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				4929	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				4930	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				4931	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				4932	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				4933	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
				4934	{"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
				4935	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				4936	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				4937	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				4938	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
				4939	{"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
				4940	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
				4941	{"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
				4942	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				4943	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				4944	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				4945	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				4946	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				4947	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				4948	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				4949	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				4950	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				4951	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				4952	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				4953	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				4954	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				4955	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4956	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4957	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
				4958	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4959	#endif
				4960
				4961	#if 0
				4962	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4963	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4964	#endif
				4965
				4966	{NULL, NULL}
				4967	};
				4968
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4969	static PySequenceMethods unicode_as_sequence = {
				4970	(inquiry) unicode_length, /* sq_length */
				4971	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				4972	(intargfunc) unicode_repeat, /* sq_repeat */
				4973	(intargfunc) unicode_getitem, /* sq_item */
				4974	(intintargfunc) unicode_slice, /* sq_slice */
				4975	0, /* sq_ass_item */
				4976	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	4977	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4978	};
				4979
				4980	static int
				4981	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				4982	int index,
				4983	const void **ptr)
				4984	{
				4985	if (index != 0) {
				4986	PyErr_SetString(PyExc_SystemError,
				4987	"accessing non-existent unicode segment");
				4988	return -1;
				4989	}
				4990	ptr = (void ) self->str;
				4991	return PyUnicode_GET_DATA_SIZE(self);
				4992	}
				4993
				4994	static int
				4995	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				4996	const void **ptr)
				4997	{
				4998	PyErr_SetString(PyExc_TypeError,
				4999	"cannot use unicode as modifyable buffer");
				5000	return -1;
				5001	}
				5002
				5003	static int
				5004	unicode_buffer_getsegcount(PyUnicodeObject *self,
				5005	int *lenp)
				5006	{
				5007	if (lenp)
				5008	*lenp = PyUnicode_GET_DATA_SIZE(self);
				5009	return 1;
				5010	}
				5011
				5012	static int
				5013	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				5014	int index,
				5015	const void **ptr)
				5016	{
				5017	PyObject *str;
				5018
				5019	if (index != 0) {
				5020	PyErr_SetString(PyExc_SystemError,
				5021	"accessing non-existent unicode segment");
				5022	return -1;
				5023	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5024	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5025	if (str == NULL)
				5026	return -1;
				5027	ptr = (void ) PyString_AS_STRING(str);
				5028	return PyString_GET_SIZE(str);
				5029	}
				5030
				5031	/* Helpers for PyUnicode_Format() */
				5032
				5033	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5034	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5035	{
				5036	int argidx = *p_argidx;
				5037	if (argidx < arglen) {
				5038	(*p_argidx)++;
				5039	if (arglen < 0)
				5040	return args;
				5041	else
				5042	return PyTuple_GetItem(args, argidx);
				5043	}
				5044	PyErr_SetString(PyExc_TypeError,
				5045	"not enough arguments for format string");
				5046	return NULL;
				5047	}
				5048
				5049	#define F_LJUST (1<<0)
				5050	#define F_SIGN (1<<1)
				5051	#define F_BLANK (1<<2)
				5052	#define F_ALT (1<<3)
				5053	#define F_ZERO (1<<4)
				5054
				5055	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5056	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5057	{
				5058	register int i;
				5059	int len;
				5060	va_list va;
				5061	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5062	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5063
				5064	/* First, format the string as char array, then expand to Py_UNICODE
				5065	array. */
				5066	charbuffer = (char *)buffer;
				5067	len = vsprintf(charbuffer, format, va);
				5068	for (i = len - 1; i >= 0; i--)
				5069	buffer[i] = (Py_UNICODE) charbuffer[i];
				5070
				5071	va_end(va);
				5072	return len;
				5073	}
				5074
				5075	static int
				5076	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5077	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5078	int flags,
				5079	int prec,
				5080	int type,
				5081	PyObject *v)
				5082	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5083	/* fmt = '%#.' + `prec` + `type`
				5084	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5085	char fmt[20];
				5086	double x;
				5087
				5088	x = PyFloat_AsDouble(v);
				5089	if (x == -1.0 && PyErr_Occurred())
				5090	return -1;
				5091	if (prec < 0)
				5092	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5093	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5094	type = 'g';
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5095	PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
				5096	(flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5097	/* worst case length calc to ensure no buffer overrun:
				5098	fmt = %#.<prec>g
				5099	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5100	for any double rep.)
				5101	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5102	If prec=0 the effective precision is 1 (the leading digit is
				5103	always given), therefore increase by one to 10+prec. */
				5104	if (buflen <= (size_t)10 + (size_t)prec) {
				5105	PyErr_SetString(PyExc_OverflowError,
				5106	"formatted float is too long (precision too long?)");
				5107	return -1;
				5108	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5109	return usprintf(buf, fmt, x);
				5110	}
				5111
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5112	static PyObject*
				5113	formatlong(PyObject *val, int flags, int prec, int type)
				5114	{
				5115	char *buf;
				5116	int i, len;
				5117	PyObject str; / temporary string object. */
				5118	PyUnicodeObject *result;
				5119
				5120	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5121	if (!str)
				5122	return NULL;
				5123	result = _PyUnicode_New(len);
				5124	for (i = 0; i < len; i++)
				5125	result->str[i] = buf[i];
				5126	result->str[len] = 0;
				5127	Py_DECREF(str);
				5128	return (PyObject*)result;
				5129	}
				5130
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5131	static int
				5132	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5133	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5134	int flags,
				5135	int prec,
				5136	int type,
				5137	PyObject *v)
				5138	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5139	/* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5140	worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5141	+ 1 + 1 = 24*/
				5142	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5143	long x;
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5144	int use_native_c_format = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5145
				5146	x = PyInt_AsLong(v);
				5147	if (x == -1 && PyErr_Occurred())
				5148	return -1;
				5149	if (prec < 0)
				5150	prec = 1;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5151	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
				5152	worst case buf = '0x' + [0-9]prec, where prec >= 11 /
				5153	if (buflen <= 13 \|\| buflen <= (size_t)2+(size_t)prec) {
				5154	PyErr_SetString(PyExc_OverflowError,
				5155	"formatted integer is too long (precision too long?)");
				5156	return -1;
				5157	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5158	/* When converting 0 under %#x or %#X, C leaves off the base marker,
				5159	* but we want it (for consistency with other %#x conversions, and
				5160	* for consistency with Python's hex() function).
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5161	* BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
				5162	* Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
				5163	* So add it only if the platform doesn't already.
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5164	*/
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5165	if (x == 0 && (flags & F_ALT) && (type == 'x' \|\| type == 'X')) {
				5166	/* Only way to know what the platform does is to try it. */
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5167	PyOS_snprintf(fmt, sizeof(fmt), type == 'x' ? "%#x" : "%#X", 0);
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5168	if (fmt[1] != (char)type) {
				5169	/* Supply our own leading 0x/0X -- needed under std C */
				5170	use_native_c_format = 0;
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5171	PyOS_snprintf(fmt, sizeof(fmt), "0%c%%#.%dl%c", type, prec, type);
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5172	}
				5173	}
				5174	if (use_native_c_format)
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5175	PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
				5176	(flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5177	return usprintf(buf, fmt, x);
				5178	}
				5179
				5180	static int
				5181	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5182	size_t buflen,
				5183	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5184	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5185	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5186	if (PyUnicode_Check(v)) {
				5187	if (PyUnicode_GET_SIZE(v) != 1)
				5188	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5189	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5190	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5191
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5192	else if (PyString_Check(v)) {
				5193	if (PyString_GET_SIZE(v) != 1)
				5194	goto onError;
				5195	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5196	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5197
				5198	else {
				5199	/* Integer input truncated to a character */
				5200	long x;
				5201	x = PyInt_AsLong(v);
				5202	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5203	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5204	buf[0] = (char) x;
				5205	}
				5206	buf[1] = '\0';
				5207	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5208
				5209	onError:
				5210	PyErr_SetString(PyExc_TypeError,
				5211	"%c requires int or char");
				5212	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5213	}
				5214
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5215	/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
				5216
				5217	FORMATBUFLEN is the length of the buffer in which the floats, ints, &
				5218	chars are formatted. XXX This is a magic number. Each formatting
				5219	routine does bounds checking to ensure no overflow, but a better
				5220	solution may be to malloc a buffer of appropriate size for each
				5221	format. For now, the current solution is sufficient.
				5222	*/
				5223	#define FORMATBUFLEN (size_t)120
				5224
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5225	PyObject PyUnicode_Format(PyObject format,
				5226	PyObject *args)
				5227	{
				5228	Py_UNICODE fmt, res;
				5229	int fmtcnt, rescnt, reslen, arglen, argidx;
				5230	int args_owned = 0;
				5231	PyUnicodeObject *result = NULL;
				5232	PyObject *dict = NULL;
				5233	PyObject *uformat;
				5234
				5235	if (format == NULL \|\| args == NULL) {
				5236	PyErr_BadInternalCall();
				5237	return NULL;
				5238	}
				5239	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5240	if (uformat == NULL)
				5241	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5242	fmt = PyUnicode_AS_UNICODE(uformat);
				5243	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5244
				5245	reslen = rescnt = fmtcnt + 100;
				5246	result = _PyUnicode_New(reslen);
				5247	if (result == NULL)
				5248	goto onError;
				5249	res = PyUnicode_AS_UNICODE(result);
				5250
				5251	if (PyTuple_Check(args)) {
				5252	arglen = PyTuple_Size(args);
				5253	argidx = 0;
				5254	}
				5255	else {
				5256	arglen = -1;
				5257	argidx = -2;
				5258	}
				5259	if (args->ob_type->tp_as_mapping)
				5260	dict = args;
				5261
				5262	while (--fmtcnt >= 0) {
				5263	if (*fmt != '%') {
				5264	if (--rescnt < 0) {
				5265	rescnt = fmtcnt + 100;
				5266	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5267	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5268	return NULL;
				5269	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5270	--rescnt;
				5271	}
				5272	res++ = fmt++;
				5273	}
				5274	else {
				5275	/* Got a format specifier */
				5276	int flags = 0;
				5277	int width = -1;
				5278	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5279	Py_UNICODE c = '\0';
				5280	Py_UNICODE fill;
				5281	PyObject *v = NULL;
				5282	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5283	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5284	Py_UNICODE sign;
				5285	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5286	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5287
				5288	fmt++;
				5289	if (*fmt == '(') {
				5290	Py_UNICODE *keystart;
				5291	int keylen;
				5292	PyObject *key;
				5293	int pcount = 1;
				5294
				5295	if (dict == NULL) {
				5296	PyErr_SetString(PyExc_TypeError,
				5297	"format requires a mapping");
				5298	goto onError;
				5299	}
				5300	++fmt;
				5301	--fmtcnt;
				5302	keystart = fmt;
				5303	/* Skip over balanced parentheses */
				5304	while (pcount > 0 && --fmtcnt >= 0) {
				5305	if (*fmt == ')')
				5306	--pcount;
				5307	else if (*fmt == '(')
				5308	++pcount;
				5309	fmt++;
				5310	}
				5311	keylen = fmt - keystart - 1;
				5312	if (fmtcnt < 0 \|\| pcount > 0) {
				5313	PyErr_SetString(PyExc_ValueError,
				5314	"incomplete format key");
				5315	goto onError;
				5316	}
Marc-André Lemburg	72f8213	2001-11-20 15:18:49 +0000	[diff] [blame]	5317	#if 0
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5318	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5319	then looked up since Python uses strings to hold
				5320	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5321	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5322	key = PyUnicode_EncodeUTF8(keystart,
				5323	keylen,
				5324	NULL);
Marc-André Lemburg	72f8213	2001-11-20 15:18:49 +0000	[diff] [blame]	5325	#else
				5326	key = PyUnicode_FromUnicode(keystart, keylen);
				5327	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5328	if (key == NULL)
				5329	goto onError;
				5330	if (args_owned) {
				5331	Py_DECREF(args);
				5332	args_owned = 0;
				5333	}
				5334	args = PyObject_GetItem(dict, key);
				5335	Py_DECREF(key);
				5336	if (args == NULL) {
				5337	goto onError;
				5338	}
				5339	args_owned = 1;
				5340	arglen = -1;
				5341	argidx = -2;
				5342	}
				5343	while (--fmtcnt >= 0) {
				5344	switch (c = *fmt++) {
				5345	case '-': flags \|= F_LJUST; continue;
				5346	case '+': flags \|= F_SIGN; continue;
				5347	case ' ': flags \|= F_BLANK; continue;
				5348	case '#': flags \|= F_ALT; continue;
				5349	case '0': flags \|= F_ZERO; continue;
				5350	}
				5351	break;
				5352	}
				5353	if (c == '*') {
				5354	v = getnextarg(args, arglen, &argidx);
				5355	if (v == NULL)
				5356	goto onError;
				5357	if (!PyInt_Check(v)) {
				5358	PyErr_SetString(PyExc_TypeError,
				5359	"* wants int");
				5360	goto onError;
				5361	}
				5362	width = PyInt_AsLong(v);
				5363	if (width < 0) {
				5364	flags \|= F_LJUST;
				5365	width = -width;
				5366	}
				5367	if (--fmtcnt >= 0)
				5368	c = *fmt++;
				5369	}
				5370	else if (c >= '0' && c <= '9') {
				5371	width = c - '0';
				5372	while (--fmtcnt >= 0) {
				5373	c = *fmt++;
				5374	if (c < '0' \|\| c > '9')
				5375	break;
				5376	if ((width*10) / 10 != width) {
				5377	PyErr_SetString(PyExc_ValueError,
				5378	"width too big");
				5379	goto onError;
				5380	}
				5381	width = width*10 + (c - '0');
				5382	}
				5383	}
				5384	if (c == '.') {
				5385	prec = 0;
				5386	if (--fmtcnt >= 0)
				5387	c = *fmt++;
				5388	if (c == '*') {
				5389	v = getnextarg(args, arglen, &argidx);
				5390	if (v == NULL)
				5391	goto onError;
				5392	if (!PyInt_Check(v)) {
				5393	PyErr_SetString(PyExc_TypeError,
				5394	"* wants int");
				5395	goto onError;
				5396	}
				5397	prec = PyInt_AsLong(v);
				5398	if (prec < 0)
				5399	prec = 0;
				5400	if (--fmtcnt >= 0)
				5401	c = *fmt++;
				5402	}
				5403	else if (c >= '0' && c <= '9') {
				5404	prec = c - '0';
				5405	while (--fmtcnt >= 0) {
				5406	c = Py_CHARMASK(*fmt++);
				5407	if (c < '0' \|\| c > '9')
				5408	break;
				5409	if ((prec*10) / 10 != prec) {
				5410	PyErr_SetString(PyExc_ValueError,
				5411	"prec too big");
				5412	goto onError;
				5413	}
				5414	prec = prec*10 + (c - '0');
				5415	}
				5416	}
				5417	} /* prec */
				5418	if (fmtcnt >= 0) {
				5419	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5420	if (--fmtcnt >= 0)
				5421	c = *fmt++;
				5422	}
				5423	}
				5424	if (fmtcnt < 0) {
				5425	PyErr_SetString(PyExc_ValueError,
				5426	"incomplete format");
				5427	goto onError;
				5428	}
				5429	if (c != '%') {
				5430	v = getnextarg(args, arglen, &argidx);
				5431	if (v == NULL)
				5432	goto onError;
				5433	}
				5434	sign = 0;
				5435	fill = ' ';
				5436	switch (c) {
				5437
				5438	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5439	pbuf = formatbuf;
				5440	/* presume that buffer length is at least 1 */
				5441	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5442	len = 1;
				5443	break;
				5444
				5445	case 's':
				5446	case 'r':
				5447	if (PyUnicode_Check(v) && c == 's') {
				5448	temp = v;
				5449	Py_INCREF(temp);
				5450	}
				5451	else {
				5452	PyObject *unicode;
				5453	if (c == 's')
				5454	temp = PyObject_Str(v);
				5455	else
				5456	temp = PyObject_Repr(v);
				5457	if (temp == NULL)
				5458	goto onError;
				5459	if (!PyString_Check(temp)) {
				5460	/* XXX Note: this should never happen, since
				5461	PyObject_Repr() and PyObject_Str() assure
				5462	this */
				5463	Py_DECREF(temp);
				5464	PyErr_SetString(PyExc_TypeError,
				5465	"%s argument has non-string str()");
				5466	goto onError;
				5467	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5468	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5469	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5470	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5471	"strict");
				5472	Py_DECREF(temp);
				5473	temp = unicode;
				5474	if (temp == NULL)
				5475	goto onError;
				5476	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5477	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5478	len = PyUnicode_GET_SIZE(temp);
				5479	if (prec >= 0 && len > prec)
				5480	len = prec;
				5481	break;
				5482
				5483	case 'i':
				5484	case 'd':
				5485	case 'u':
				5486	case 'o':
				5487	case 'x':
				5488	case 'X':
				5489	if (c == 'i')
				5490	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5491	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5492	temp = formatlong(v, flags, prec, c);
				5493	if (!temp)
				5494	goto onError;
				5495	pbuf = PyUnicode_AS_UNICODE(temp);
				5496	len = PyUnicode_GET_SIZE(temp);
				5497	/* unbounded ints can always produce
				5498	a sign character! */
				5499	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5500	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5501	else {
				5502	pbuf = formatbuf;
				5503	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5504	flags, prec, c, v);
				5505	if (len < 0)
				5506	goto onError;
				5507	/* only d conversion is signed */
				5508	sign = c == 'd';
				5509	}
				5510	if (flags & F_ZERO)
				5511	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5512	break;
				5513
				5514	case 'e':
				5515	case 'E':
				5516	case 'f':
				5517	case 'g':
				5518	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5519	pbuf = formatbuf;
				5520	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5521	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5522	if (len < 0)
				5523	goto onError;
				5524	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5525	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5526	fill = '0';
				5527	break;
				5528
				5529	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5530	pbuf = formatbuf;
				5531	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5532	if (len < 0)
				5533	goto onError;
				5534	break;
				5535
				5536	default:
				5537	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5538	"unsupported format character '%c' (0x%x) "
				5539	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5540	(31<=c && c<=126) ? c : '?',
				5541	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5542	goto onError;
				5543	}
				5544	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5545	if (pbuf == '-' \|\| pbuf == '+') {
				5546	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5547	len--;
				5548	}
				5549	else if (flags & F_SIGN)
				5550	sign = '+';
				5551	else if (flags & F_BLANK)
				5552	sign = ' ';
				5553	else
				5554	sign = 0;
				5555	}
				5556	if (width < len)
				5557	width = len;
				5558	if (rescnt < width + (sign != 0)) {
				5559	reslen -= rescnt;
				5560	rescnt = width + fmtcnt + 100;
				5561	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5562	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5563	return NULL;
				5564	res = PyUnicode_AS_UNICODE(result)
				5565	+ reslen - rescnt;
				5566	}
				5567	if (sign) {
				5568	if (fill != ' ')
				5569	*res++ = sign;
				5570	rescnt--;
				5571	if (width > len)
				5572	width--;
				5573	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5574	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5575	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5576	assert(pbuf[1] == c);
				5577	if (fill != ' ') {
				5578	res++ = pbuf++;
				5579	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5580	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5581	rescnt -= 2;
				5582	width -= 2;
				5583	if (width < 0)
				5584	width = 0;
				5585	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5586	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5587	if (width > len && !(flags & F_LJUST)) {
				5588	do {
				5589	--rescnt;
				5590	*res++ = fill;
				5591	} while (--width > len);
				5592	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5593	if (fill == ' ') {
				5594	if (sign)
				5595	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5596	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5597	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5598	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5599	res++ = pbuf++;
				5600	res++ = pbuf++;
				5601	}
				5602	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5603	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5604	res += len;
				5605	rescnt -= len;
				5606	while (--width >= len) {
				5607	--rescnt;
				5608	*res++ = ' ';
				5609	}
				5610	if (dict && (argidx < arglen) && c != '%') {
				5611	PyErr_SetString(PyExc_TypeError,
				5612	"not all arguments converted");
				5613	goto onError;
				5614	}
				5615	Py_XDECREF(temp);
				5616	} /* '%' */
				5617	} /* until end */
				5618	if (argidx < arglen && !dict) {
				5619	PyErr_SetString(PyExc_TypeError,
				5620	"not all arguments converted");
				5621	goto onError;
				5622	}
				5623
				5624	if (args_owned) {
				5625	Py_DECREF(args);
				5626	}
				5627	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5628	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5629	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5630	return (PyObject *)result;
				5631
				5632	onError:
				5633	Py_XDECREF(result);
				5634	Py_DECREF(uformat);
				5635	if (args_owned) {
				5636	Py_DECREF(args);
				5637	}
				5638	return NULL;
				5639	}
				5640
				5641	static PyBufferProcs unicode_as_buffer = {
				5642	(getreadbufferproc) unicode_buffer_getreadbuf,
				5643	(getwritebufferproc) unicode_buffer_getwritebuf,
				5644	(getsegcountproc) unicode_buffer_getsegcount,
				5645	(getcharbufferproc) unicode_buffer_getcharbuf,
				5646	};
				5647
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5648	staticforward PyObject *
				5649	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5650
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5651	static PyObject *
				5652	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5653	{
				5654	PyObject *x = NULL;
				5655	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5656	char *encoding = NULL;
				5657	char *errors = NULL;
				5658
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5659	if (type != &PyUnicode_Type)
				5660	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5661	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5662	kwlist, &x, &encoding, &errors))
				5663	return NULL;
				5664	if (x == NULL)
				5665	return (PyObject *)_PyUnicode_New(0);
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	5666	if (encoding == NULL && errors == NULL)
				5667	return PyObject_Unicode(x);
				5668	else
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5669	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5670	}
				5671
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5672	static PyObject *
				5673	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5674	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5675	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5676	int n;
				5677
				5678	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5679	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5680	if (tmp == NULL)
				5681	return NULL;
				5682	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5683	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5684	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5685	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5686	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5687	if (pnew->str == NULL) {
				5688	_Py_ForgetReference((PyObject *)pnew);
				5689	PyObject_DEL(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5690	return NULL;
				5691	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5692	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5693	pnew->length = n;
				5694	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5695	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5696	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5697	}
				5698
/* Docstring installed as tp_doc of the unicode type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
				5705
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5706	PyTypeObject PyUnicode_Type = {
				5707	PyObject_HEAD_INIT(&PyType_Type)
				5708	0, /* ob_size */
				5709	"unicode", /* tp_name */
				5710	sizeof(PyUnicodeObject), /* tp_size */
				5711	0, /* tp_itemsize */
				5712	/* Slots */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5713	(destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5714	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5715	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5716	0, /* tp_setattr */
				5717	(cmpfunc) unicode_compare, /* tp_compare */
				5718	(reprfunc) unicode_repr, /* tp_repr */
				5719	0, /* tp_as_number */
				5720	&unicode_as_sequence, /* tp_as_sequence */
				5721	0, /* tp_as_mapping */
				5722	(hashfunc) unicode_hash, /* tp_hash*/
				5723	0, /* tp_call*/
				5724	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5725	PyObject_GenericGetAttr, /* tp_getattro */
				5726	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5727	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5728	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5729	unicode_doc, /* tp_doc */
				5730	0, /* tp_traverse */
				5731	0, /* tp_clear */
				5732	0, /* tp_richcompare */
				5733	0, /* tp_weaklistoffset */
				5734	0, /* tp_iter */
				5735	0, /* tp_iternext */
				5736	unicode_methods, /* tp_methods */
				5737	0, /* tp_members */
				5738	0, /* tp_getset */
				5739	0, /* tp_base */
				5740	0, /* tp_dict */
				5741	0, /* tp_descr_get */
				5742	0, /* tp_descr_set */
				5743	0, /* tp_dictoffset */
				5744	0, /* tp_init */
				5745	0, /* tp_alloc */
				5746	unicode_new, /* tp_new */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5747	_PyObject_Del, /* tp_free */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5748	};
				5749
				5750	/* Initialize the Unicode implementation */
				5751
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5752	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5753	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5754	int i;
				5755
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5756	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5757	unicode_freelist = NULL;
				5758	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5759	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5760	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5761	for (i = 0; i < 256; i++)
				5762	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5763	}
				5764
				5765	/* Finalize the Unicode implementation */
				5766
				5767	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5768	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5769	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5770	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5771	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5772
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5773	Py_XDECREF(unicode_empty);
				5774	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5775
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5776	for (i = 0; i < 256; i++) {
				5777	if (unicode_latin1[i]) {
				5778	Py_DECREF(unicode_latin1[i]);
				5779	unicode_latin1[i] = NULL;
				5780	}
				5781	}
				5782
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5783	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5784	PyUnicodeObject *v = u;
				5785	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5786	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5787	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5788	Py_XDECREF(v->defenc);
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5789	PyObject_DEL(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5790	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5791	unicode_freelist = NULL;
				5792	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5793	}