Blame - Objects/unicodeobject.c - platform/external/python/cpython3

blob: 2fe96681a0564dfb324496871211f8df99425c93 [file] [log] [blame]

Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1	/*
				2
				3	Unicode implementation based on original code by Fredrik Lundh,
Fred Drake	785d14f	2000-05-09 19:54:43 +0000	[diff] [blame]	4	modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5	Unicode Integration Proposal (see file Misc/unicode.txt).
				6
Guido van Rossum	16b1ad9	2000-08-03 16:24:25 +0000	[diff] [blame]	7	Copyright (c) Corporation for National Research Initiatives.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	8
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	9	--------------------------------------------------------------------
				10	The original string type implementation is:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	11
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	12	Copyright (c) 1999 by Secret Labs AB
				13	Copyright (c) 1999 by Fredrik Lundh
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	14
Fredrik Lundh	0fdb90c	2001-01-19 09:45:02 +0000	[diff] [blame]	15	By obtaining, using, and/or copying this software and/or its
				16	associated documentation, you agree that you have read, understood,
				17	and will comply with the following terms and conditions:
				18
				19	Permission to use, copy, modify, and distribute this software and its
				20	associated documentation for any purpose and without fee is hereby
				21	granted, provided that the above copyright notice appears in all
				22	copies, and that both that copyright notice and this permission notice
				23	appear in supporting documentation, and that the name of Secret Labs
				24	AB or the author not be used in advertising or publicity pertaining to
				25	distribution of the software without specific, written prior
				26	permission.
				27
				28	SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
				29	THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
				30	FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
				31	ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
				32	WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
				33	ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
				34	OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
				35	--------------------------------------------------------------------
				36
				37	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	38
				39	#include "Python.h"
				40
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	41	#include "unicodeobject.h"
Marc-André Lemburg	d49e5b4	2000-06-30 14:58:20 +0000	[diff] [blame]	42	#include "ucnhash.h"
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	43
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	44	#ifdef MS_WIN32
				45	#include <windows.h>
				46	#endif
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	47
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	48	/* Limit for the Unicode object free list */
				49
				50	#define MAX_UNICODE_FREELIST_SIZE 1024
				51
				52	/* Limit for the Unicode object free list stay alive optimization.
				53
				54	The implementation will keep allocated Unicode memory intact for
				55	all objects on the free list having a size less than this
				56	limit. This reduces malloc() overhead for small Unicode objects.
				57
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	58	At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	59	(sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	60	malloc()-overhead) bytes of unused garbage.
				61
				62	Setting the limit to 0 effectively turns the feature off.
				63
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	64	Note: This is an experimental feature ! If you get core dumps when
				65	using Unicode objects, turn this feature off.
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	66
				67	*/
				68
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	69	#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	70
				71	/* Endianness switches; defaults to little endian */
				72
				73	#ifdef WORDS_BIGENDIAN
				74	# define BYTEORDER_IS_BIG_ENDIAN
				75	#else
				76	# define BYTEORDER_IS_LITTLE_ENDIAN
				77	#endif
				78
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	79	/* --- Globals ------------------------------------------------------------
				80
				81	The globals are initialized by the _PyUnicode_Init() API and should
				82	not be used before calling that API.
				83
				84	*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	85
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	86	/* Free list for Unicode objects */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	87	static PyUnicodeObject *unicode_freelist;
				88	static int unicode_freelist_size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	89
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	90	/* The empty Unicode object is shared to improve performance. */
				91	static PyUnicodeObject *unicode_empty;
				92
				93	/* Single character Unicode strings in the Latin-1 range are being
				94	shared as well. */
				95	static PyUnicodeObject *unicode_latin1[256];
				96
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	97	/* Default encoding to use and assume when NULL is passed as encoding
				98	parameter; it is initialized by _PyUnicode_Init().
				99
				100	Always use the PyUnicode_SetDefaultEncoding() and
				101	PyUnicode_GetDefaultEncoding() APIs to access this global.
				102
				103	*/
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	104	static char unicode_default_encoding[100];
				105
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	106	Py_UNICODE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	107	PyUnicode_GetMax(void)
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	108	{
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	109	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	ce9b5a5	2001-06-27 06:28:56 +0000	[diff] [blame]	110	return 0x10FFFF;
				111	#else
				112	/* This is actually an illegal character, so it should
				113	not be passed to unichr. */
				114	return 0xFFFF;
				115	#endif
				116	}
				117
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	118	/* --- Unicode Object ----------------------------------------------------- */
				119
				120	static
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	121	int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	122	int length)
				123	{
				124	void *oldstr;
				125
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	126	/* Shortcut if there's nothing much to do. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	127	if (unicode->length == length)
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	128	goto reset;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	129
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	130	/* Resizing shared object (unicode_empty or single character
				131	objects) in-place is not allowed. Use PyUnicode_Resize()
				132	instead ! */
				133	if (unicode == unicode_empty ||
				134	(unicode->length == 1 &&
				135	unicode->str[0] < 256 &&
				136	unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	137	PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	138	"can't resize shared unicode objects");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	139	return -1;
				140	}
				141
				142	/* We allocate one more byte to make sure the string is
				143	Ux0000 terminated -- XXX is this needed ? */
				144	oldstr = unicode->str;
				145	PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
				146	if (!unicode->str) {
				147	unicode->str = oldstr;
				148	PyErr_NoMemory();
				149	return -1;
				150	}
				151	unicode->str[length] = 0;
				152	unicode->length = length;
				153
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	154	reset:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	155	/* Reset the object caches */
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	156	if (unicode->defenc) {
				157	Py_DECREF(unicode->defenc);
				158	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	159	}
				160	unicode->hash = -1;
				161
				162	return 0;
				163	}
				164
				165	/* We allocate one more byte to make sure the string is
				166	Ux0000 terminated -- XXX is this needed ?
				167
				168	XXX This allocator could further be enhanced by assuring that the
				169	free list never reduces its size below 1.
				170
				171	*/
				172
				173	static
				174	PyUnicodeObject *_PyUnicode_New(int length)
				175	{
				176	register PyUnicodeObject *unicode;
				177
				178	/* Optimization for empty strings */
				179	if (length == 0 && unicode_empty != NULL) {
				180	Py_INCREF(unicode_empty);
				181	return unicode_empty;
				182	}
				183
				184	/* Unicode freelist & memory allocation */
				185	if (unicode_freelist) {
				186	unicode = unicode_freelist;
Marc-André Lemburg	bea47e7	2000-06-17 20:31:17 +0000	[diff] [blame]	187	unicode_freelist = (PyUnicodeObject *)unicode;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	188	unicode_freelist_size--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	189	if (unicode->str) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	190	/* Keep-Alive optimization: we only upsize the buffer,
				191	never downsize it. */
				192	if ((unicode->length < length) &&
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	193	unicode_resize(unicode, length)) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	194	PyMem_DEL(unicode->str);
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	195	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	196	}
				197	}
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	198	else {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	199	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossum	ad98db1	2001-06-14 17:52:02 +0000	[diff] [blame]	200	}
				201	PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	202	}
				203	else {
Neil Schemenauer	58aa861	2002-04-12 03:07:20 +0000	[diff] [blame]	204	unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	205	if (unicode == NULL)
				206	return NULL;
				207	unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
				208	}
				209
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	210	if (!unicode->str) {
				211	PyErr_NoMemory();
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	212	goto onError;
Guido van Rossum	3c1bb80	2000-04-27 20:13:50 +0000	[diff] [blame]	213	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	214	unicode->str[length] = 0;
				215	unicode->length = length;
				216	unicode->hash = -1;
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	217	unicode->defenc = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	218	return unicode;
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	219
				220	onError:
				221	_Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer	58aa861	2002-04-12 03:07:20 +0000	[diff] [blame]	222	PyObject_Del(unicode);
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	223	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	224	}
				225
				226	static
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	227	void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	228	{
Guido van Rossum	604ddf8	2001-12-06 20:03:56 +0000	[diff] [blame]	229	if (PyUnicode_CheckExact(unicode) &&
				230	unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	231	/* Keep-Alive optimization */
				232	if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	233	PyMem_DEL(unicode->str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	234	unicode->str = NULL;
				235	unicode->length = 0;
				236	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	237	if (unicode->defenc) {
				238	Py_DECREF(unicode->defenc);
				239	unicode->defenc = NULL;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	240	}
				241	/* Add to free list */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	242	*(PyUnicodeObject **)unicode = unicode_freelist;
				243	unicode_freelist = unicode;
				244	unicode_freelist_size++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	245	}
				246	else {
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	247	PyMem_DEL(unicode->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	248	Py_XDECREF(unicode->defenc);
Guido van Rossum	604ddf8	2001-12-06 20:03:56 +0000	[diff] [blame]	249	unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	250	}
				251	}
				252
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	253	int PyUnicode_Resize(PyObject **unicode,
				254	int length)
				255	{
				256	register PyUnicodeObject *v;
				257
				258	/* Argument checks */
				259	if (unicode == NULL) {
				260	PyErr_BadInternalCall();
				261	return -1;
				262	}
				263	v = (PyUnicodeObject *)*unicode;
				264	if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
				265	PyErr_BadInternalCall();
				266	return -1;
				267	}
				268
				269	/* Resizing unicode_empty and single character objects is not
				270	possible since these are being shared. We simply return a fresh
				271	copy with the same Unicode content. */
				272	if (v->length != length &&
				273	(v == unicode_empty || v->length == 1)) {
				274	PyUnicodeObject *w = _PyUnicode_New(length);
				275	if (w == NULL)
				276	return -1;
				277	Py_UNICODE_COPY(w->str, v->str,
				278	length < v->length ? length : v->length);
				279	*unicode = (PyObject *)w;
				280	return 0;
				281	}
				282
				283	/* Note that we don't have to modify *unicode for unshared Unicode
				284	objects, since we can modify them in-place. */
				285	return unicode_resize(v, length);
				286	}
				287
				288	/* Internal API for use in unicodeobject.c only ! */
				289	#define _PyUnicode_Resize(unicodevar, length) \
				290	PyUnicode_Resize(((PyObject **)(unicodevar)), length)
				291
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	292	PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
				293	int size)
				294	{
				295	PyUnicodeObject *unicode;
				296
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	297	/* If the Unicode data is known at construction time, we can apply
				298	some optimizations which share commonly used objects. */
				299	if (u != NULL) {
				300
				301	/* Optimization for empty strings */
				302	if (size == 0 && unicode_empty != NULL) {
				303	Py_INCREF(unicode_empty);
				304	return (PyObject *)unicode_empty;
				305	}
				306
				307	/* Single character Unicode objects in the Latin-1 range are
				308	shared when using this constructor */
				309	if (size == 1 && *u < 256) {
				310	unicode = unicode_latin1[*u];
				311	if (!unicode) {
				312	unicode = _PyUnicode_New(1);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	313	if (!unicode)
				314	return NULL;
Marc-André Lemburg	8879a33	2001-06-07 12:26:56 +0000	[diff] [blame]	315	unicode->str[0] = *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	316	unicode_latin1[*u] = unicode;
				317	}
				318	Py_INCREF(unicode);
				319	return (PyObject *)unicode;
				320	}
				321	}
				322
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	323	unicode = _PyUnicode_New(size);
				324	if (!unicode)
				325	return NULL;
				326
				327	/* Copy the Unicode data into the new object */
				328	if (u != NULL)
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	329	Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	330
				331	return (PyObject *)unicode;
				332	}
				333
				334	#ifdef HAVE_WCHAR_H
				335
				336	PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
				337	int size)
				338	{
				339	PyUnicodeObject *unicode;
				340
				341	if (w == NULL) {
				342	PyErr_BadInternalCall();
				343	return NULL;
				344	}
				345
				346	unicode = _PyUnicode_New(size);
				347	if (!unicode)
				348	return NULL;
				349
				350	/* Copy the wchar_t data into the new object */
				351	#ifdef HAVE_USABLE_WCHAR_T
				352	memcpy(unicode->str, w, size * sizeof(wchar_t));
				353	#else
				354	{
				355	register Py_UNICODE *u;
				356	register int i;
				357	u = PyUnicode_AS_UNICODE(unicode);
				358	for (i = size; i >= 0; i--)
				359	*u++ = *w++;
				360	}
				361	#endif
				362
				363	return (PyObject *)unicode;
				364	}
				365
				366	int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
				367	register wchar_t *w,
				368	int size)
				369	{
				370	if (unicode == NULL) {
				371	PyErr_BadInternalCall();
				372	return -1;
				373	}
				374	if (size > PyUnicode_GET_SIZE(unicode))
				375	size = PyUnicode_GET_SIZE(unicode);
				376	#ifdef HAVE_USABLE_WCHAR_T
				377	memcpy(w, unicode->str, size * sizeof(wchar_t));
				378	#else
				379	{
				380	register Py_UNICODE *u;
				381	register int i;
				382	u = PyUnicode_AS_UNICODE(unicode);
				383	for (i = size; i >= 0; i--)
				384	*w++ = *u++;
				385	}
				386	#endif
				387
				388	return size;
				389	}
				390
				391	#endif
				392
				393	PyObject *PyUnicode_FromObject(register PyObject *obj)
				394	{
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	395	/* XXX Perhaps we should make this API an alias of
				396	PyObject_Unicode() instead ?! */
				397	if (PyUnicode_CheckExact(obj)) {
				398	Py_INCREF(obj);
				399	return obj;
				400	}
				401	if (PyUnicode_Check(obj)) {
				402	/* For a Unicode subtype that's not a Unicode object,
				403	return a true Unicode object with the same data. */
				404	return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
				405	PyUnicode_GET_SIZE(obj));
				406	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	407	return PyUnicode_FromEncodedObject(obj, NULL, "strict");
				408	}
				409
				410	PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
				411	const char *encoding,
				412	const char *errors)
				413	{
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	414	const char *s = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	415	int len;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	416	int owned = 0;
				417	PyObject *v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	418
				419	if (obj == NULL) {
				420	PyErr_BadInternalCall();
				421	return NULL;
				422	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	423
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	424	#if 0
				425	/* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburg	b5507ec	2001-10-19 12:02:29 +0000	[diff] [blame]	426	that no encodings is given and then redirect to
				427	PyObject_Unicode() which then applies the additional logic for
				428	Unicode subclasses.
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	429
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	430	NOTE: This API should really only be used for object which
				431	represent encoded Unicode !
				432
				433	*/
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	434	if (PyUnicode_Check(obj)) {
				435	if (encoding) {
				436	PyErr_SetString(PyExc_TypeError,
				437	"decoding Unicode is not supported");
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	438	return NULL;
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	439	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	440	return PyObject_Unicode(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	441	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	442	#else
				443	if (PyUnicode_Check(obj)) {
				444	PyErr_SetString(PyExc_TypeError,
				445	"decoding Unicode is not supported");
				446	return NULL;
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	447	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	448	#endif
				449
				450	/* Coerce object */
				451	if (PyString_Check(obj)) {
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	452	s = PyString_AS_STRING(obj);
				453	len = PyString_GET_SIZE(obj);
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	454	}
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	455	else if (PyObject_AsCharBuffer(obj, &s, &len)) {
				456	/* Overwrite the error message with something more useful in
				457	case of a TypeError. */
				458	if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	459	PyErr_Format(PyExc_TypeError,
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	460	"coercing to Unicode: need string or buffer, "
				461	"%.80s found",
Marc-André Lemburg	6871f6a	2001-09-20 12:53:16 +0000	[diff] [blame]	462	obj->ob_type->tp_name);
				463	goto onError;
				464	}
				465
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	466	/* Convert to Unicode */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	467	if (len == 0) {
				468	Py_INCREF(unicode_empty);
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	469	v = (PyObject *)unicode_empty;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	470	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	471	else
				472	v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburg	ad7c98e	2001-01-17 17:09:53 +0000	[diff] [blame]	473
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	474	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	475	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	476	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	477	return v;
				478
				479	onError:
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	480	if (owned) {
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	481	Py_DECREF(obj);
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	482	}
Marc-André Lemburg	5a5c81a	2000-07-07 13:46:42 +0000	[diff] [blame]	483	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	484	}
				485
				486	PyObject *PyUnicode_Decode(const char *s,
				487	int size,
				488	const char *encoding,
				489	const char *errors)
				490	{
				491	PyObject *buffer = NULL, *unicode;
				492
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	493	if (encoding == NULL)
				494	encoding = PyUnicode_GetDefaultEncoding();
				495
				496	/* Shortcuts for common default encodings */
				497	if (strcmp(encoding, "utf-8") == 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	498	return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	499	else if (strcmp(encoding, "latin-1") == 0)
				500	return PyUnicode_DecodeLatin1(s, size, errors);
				501	else if (strcmp(encoding, "ascii") == 0)
				502	return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	503
				504	/* Decode via the codec registry */
				505	buffer = PyBuffer_FromMemory((void *)s, size);
				506	if (buffer == NULL)
				507	goto onError;
				508	unicode = PyCodec_Decode(buffer, encoding, errors);
				509	if (unicode == NULL)
				510	goto onError;
				511	if (!PyUnicode_Check(unicode)) {
				512	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	513	"decoder did not return an unicode object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	514	unicode->ob_type->tp_name);
				515	Py_DECREF(unicode);
				516	goto onError;
				517	}
				518	Py_DECREF(buffer);
				519	return unicode;
				520
				521	onError:
				522	Py_XDECREF(buffer);
				523	return NULL;
				524	}
				525
				526	PyObject *PyUnicode_Encode(const Py_UNICODE *s,
				527	int size,
				528	const char *encoding,
				529	const char *errors)
				530	{
				531	PyObject *v, *unicode;
				532
				533	unicode = PyUnicode_FromUnicode(s, size);
				534	if (unicode == NULL)
				535	return NULL;
				536	v = PyUnicode_AsEncodedString(unicode, encoding, errors);
				537	Py_DECREF(unicode);
				538	return v;
				539	}
				540
				541	PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
				542	const char *encoding,
				543	const char *errors)
				544	{
				545	PyObject *v;
				546
				547	if (!PyUnicode_Check(unicode)) {
				548	PyErr_BadArgument();
				549	goto onError;
				550	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	551
				552	if (encoding == NULL)
				553	encoding = PyUnicode_GetDefaultEncoding();
				554
				555	/* Shortcuts for common default encodings */
				556	if (errors == NULL) {
				557	if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton	9cea41c	2001-05-29 17:13:15 +0000	[diff] [blame]	558	return PyUnicode_AsUTF8String(unicode);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	559	else if (strcmp(encoding, "latin-1") == 0)
				560	return PyUnicode_AsLatin1String(unicode);
				561	else if (strcmp(encoding, "ascii") == 0)
				562	return PyUnicode_AsASCIIString(unicode);
				563	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	564
				565	/* Encode via the codec registry */
				566	v = PyCodec_Encode(unicode, encoding, errors);
				567	if (v == NULL)
				568	goto onError;
				569	/* XXX Should we really enforce this ? */
				570	if (!PyString_Check(v)) {
				571	PyErr_Format(PyExc_TypeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	572	"encoder did not return a string object (type=%.400s)",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	573	v->ob_type->tp_name);
				574	Py_DECREF(v);
				575	goto onError;
				576	}
				577	return v;
				578
				579	onError:
				580	return NULL;
				581	}
				582
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	583	PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
				584	const char *errors)
				585	{
				586	PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
				587
				588	if (v)
				589	return v;
				590	v = PyUnicode_AsEncodedString(unicode, NULL, errors);
				591	if (v && errors == NULL)
				592	((PyUnicodeObject *)unicode)->defenc = v;
				593	return v;
				594	}
				595
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	596	Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
				597	{
				598	if (!PyUnicode_Check(unicode)) {
				599	PyErr_BadArgument();
				600	goto onError;
				601	}
				602	return PyUnicode_AS_UNICODE(unicode);
				603
				604	onError:
				605	return NULL;
				606	}
				607
				608	int PyUnicode_GetSize(PyObject *unicode)
				609	{
				610	if (!PyUnicode_Check(unicode)) {
				611	PyErr_BadArgument();
				612	goto onError;
				613	}
				614	return PyUnicode_GET_SIZE(unicode);
				615
				616	onError:
				617	return -1;
				618	}
				619
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	620	const char *PyUnicode_GetDefaultEncoding(void)
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	621	{
				622	return unicode_default_encoding;
				623	}
				624
				625	int PyUnicode_SetDefaultEncoding(const char *encoding)
				626	{
				627	PyObject *v;
				628
				629	/* Make sure the encoding is valid. As side effect, this also
				630	loads the encoding into the codec registry cache. */
				631	v = _PyCodec_Lookup(encoding);
				632	if (v == NULL)
				633	goto onError;
				634	Py_DECREF(v);
				635	strncpy(unicode_default_encoding,
				636	encoding,
				637	sizeof(unicode_default_encoding));
				638	return 0;
				639
				640	onError:
				641	return -1;
				642	}
				643
Marc-André Lemburg	c60e6f7	2001-09-20 10:35:46 +0000	[diff] [blame]	644	/* --- UTF-7 Codec -------------------------------------------------------- */
				645
				646	/* see RFC2152 for details */
				647
				648	static
				649	char utf7_special[128] = {
				650	/* indicate whether a UTF-7 character is special i.e. cannot be directly
				651	encoded:
				652	0 - not special
				653	1 - special
				654	2 - whitespace (optional)
				655	3 - RFC2152 Set O (optional) */
				656	1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
				657	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				658	2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
				659	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
				660	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				661	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
				662	3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				663	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
				664
				665	};
				666
				667	#define SPECIAL(c, encodeO, encodeWS) \
				668	(((c)>127 || utf7_special[(c)] == 1) || \
				669	(encodeWS && (utf7_special[(c)] == 2)) || \
				670	(encodeO && (utf7_special[(c)] == 3)))
				671
				672	#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
				673	#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
				674	#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
				675	(c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)
				676
				677	#define ENCODE(out, ch, bits) \
				678	while (bits >= 6) { \
				679	*out++ = B64(ch >> (bits-6)); \
				680	bits -= 6; \
				681	}
				682
				683	#define DECODE(out, ch, bits, surrogate) \
				684	while (bits >= 16) { \
				685	Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
				686	bits -= 16; \
				687	if (surrogate) { \
				688	/* We have already generated an error for the high surrogate
				689	so let's not bother seeing if the low surrogate is correct or not */\
				690	surrogate = 0; \
				691	} else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
				692	/* This is a surrogate pair. Unfortunately we can't represent \
				693	it in a 16-bit character */ \
				694	surrogate = 1; \
				695	errmsg = "code pairs are not supported"; \
				696	goto utf7Error; \
				697	} else { \
				698	*out++ = outCh; \
				699	} \
				700	} \
				701
				702	static
				703	int utf7_decoding_error(Py_UNICODE **dest,
				704	const char *errors,
				705	const char *details)
				706	{
				707	if ((errors == NULL) ||
				708	(strcmp(errors,"strict") == 0)) {
				709	PyErr_Format(PyExc_UnicodeError,
				710	"UTF-7 decoding error: %.400s",
				711	details);
				712	return -1;
				713	}
				714	else if (strcmp(errors,"ignore") == 0) {
				715	return 0;
				716	}
				717	else if (strcmp(errors,"replace") == 0) {
				718	if (dest != NULL) {
				719	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				720	(*dest)++;
				721	}
				722	return 0;
				723	}
				724	else {
				725	PyErr_Format(PyExc_ValueError,
				726	"UTF-7 decoding error; unknown error handling code: %.400s",
				727	errors);
				728	return -1;
				729	}
				730	}
				731
				732	PyObject PyUnicode_DecodeUTF7(const char s,
				733	int size,
				734	const char *errors)
				735	{
				736	const char *e;
				737	PyUnicodeObject *unicode;
				738	Py_UNICODE *p;
				739	const char *errmsg = "";
				740	int inShift = 0;
				741	unsigned int bitsleft = 0;
				742	unsigned long charsleft = 0;
				743	int surrogate = 0;
				744
				745	unicode = _PyUnicode_New(size);
				746	if (!unicode)
				747	return NULL;
				748	if (size == 0)
				749	return (PyObject *)unicode;
				750
				751	p = unicode->str;
				752	e = s + size;
				753
				754	while (s < e) {
				755	Py_UNICODE ch = *s;
				756
				757	if (inShift) {
				758	if ((ch == '-') \|\| !B64CHAR(ch)) {
				759	inShift = 0;
				760	s++;
				761
				762	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				763	if (bitsleft >= 6) {
				764	/* The shift sequence has a partial character in it. If
				765	bitsleft < 6 then we could just classify it as padding
				766	but that is not the case here */
				767
				768	errmsg = "partial character in shift sequence";
				769	goto utf7Error;
				770	}
				771	/* According to RFC2152 the remaining bits should be zero. We
				772	choose to signal an error/insert a replacement character
				773	here so indicate the potential of a misencoded character. */
				774
				775	/* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
				776	if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
				777	errmsg = "non-zero padding bits in shift sequence";
				778	goto utf7Error;
				779	}
				780
				781	if (ch == '-') {
				782	if ((s < e) && (*(s) == '-')) {
				783	*p++ = '-';
				784	inShift = 1;
				785	}
				786	} else if (SPECIAL(ch,0,0)) {
				787	errmsg = "unexpected special character";
				788	goto utf7Error;
				789	} else {
				790	*p++ = ch;
				791	}
				792	} else {
				793	charsleft = (charsleft << 6) \| UB64(ch);
				794	bitsleft += 6;
				795	s++;
				796	/* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
				797	}
				798	}
				799	else if ( ch == '+' ) {
				800	s++;
				801	if (s < e && *s == '-') {
				802	s++;
				803	*p++ = '+';
				804	} else
				805	{
				806	inShift = 1;
				807	bitsleft = 0;
				808	}
				809	}
				810	else if (SPECIAL(ch,0,0)) {
				811	errmsg = "unexpected special character";
				812	s++;
				813	goto utf7Error;
				814	}
				815	else {
				816	*p++ = ch;
				817	s++;
				818	}
				819	continue;
				820	utf7Error:
				821	if (utf7_decoding_error(&p, errors, errmsg))
				822	goto onError;
				823	}
				824
				825	if (inShift) {
				826	if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
				827	goto onError;
				828	}
				829
				830	if (_PyUnicode_Resize(&unicode, p - unicode->str))
				831	goto onError;
				832
				833	return (PyObject *)unicode;
				834
				835	onError:
				836	Py_DECREF(unicode);
				837	return NULL;
				838	}
				839
				840
				841	PyObject PyUnicode_EncodeUTF7(const Py_UNICODE s,
				842	int size,
				843	int encodeSetO,
				844	int encodeWhiteSpace,
				845	const char *errors)
				846	{
				847	PyObject *v;
				848	/* It might be possible to tighten this worst case */
				849	unsigned int cbAllocated = 5 * size;
				850	int inShift = 0;
				851	int i = 0;
				852	unsigned int bitsleft = 0;
				853	unsigned long charsleft = 0;
				854	char * out;
				855	char * start;
				856
				857	if (size == 0)
				858	return PyString_FromStringAndSize(NULL, 0);
				859
				860	v = PyString_FromStringAndSize(NULL, cbAllocated);
				861	if (v == NULL)
				862	return NULL;
				863
				864	start = out = PyString_AS_STRING(v);
				865	for (;i < size; ++i) {
				866	Py_UNICODE ch = s[i];
				867
				868	if (!inShift) {
				869	if (ch == '+') {
				870	*out++ = '+';
				871	*out++ = '-';
				872	} else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				873	charsleft = ch;
				874	bitsleft = 16;
				875	*out++ = '+';
				876	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				877	inShift = bitsleft > 0;
				878	} else {
				879	*out++ = (char) ch;
				880	}
				881	} else {
				882	if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
				883	*out++ = B64(charsleft << (6-bitsleft));
				884	charsleft = 0;
				885	bitsleft = 0;
				886	/* Characters not in the BASE64 set implicitly unshift the sequence
				887	so no '-' is required, except if the character is itself a '-' */
				888	if (B64CHAR(ch) \|\| ch == '-') {
				889	*out++ = '-';
				890	}
				891	inShift = 0;
				892	*out++ = (char) ch;
				893	} else {
				894	bitsleft += 16;
				895	charsleft = (charsleft << 16) \| ch;
				896	/* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
				897
				898	/* If the next character is special then we dont' need to terminate
				899	the shift sequence. If the next character is not a BASE64 character
				900	or '-' then the shift sequence will be terminated implicitly and we
				901	don't have to insert a '-'. */
				902
				903	if (bitsleft == 0) {
				904	if (i + 1 < size) {
				905	Py_UNICODE ch2 = s[i+1];
				906
				907	if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
				908
				909	} else if (B64CHAR(ch2) \|\| ch2 == '-') {
				910	*out++ = '-';
				911	inShift = 0;
				912	} else {
				913	inShift = 0;
				914	}
				915
				916	}
				917	else {
				918	*out++ = '-';
				919	inShift = 0;
				920	}
				921	}
				922	}
				923	}
				924	}
				925	if (bitsleft) {
				926	*out++= B64(charsleft << (6-bitsleft) );
				927	*out++ = '-';
				928	}
				929
				930	if (_PyString_Resize(&v, out - start)) {
				931	Py_DECREF(v);
				932	return NULL;
				933	}
				934	return v;
				935	}
				936
				937	#undef SPECIAL
				938	#undef B64
				939	#undef B64CHAR
				940	#undef UB64
				941	#undef ENCODE
				942	#undef DECODE
				943
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	944	/* --- UTF-8 Codec -------------------------------------------------------- */
				945
				946	static
				947	char utf8_code_length[256] = {
				948	/* Map UTF-8 encoded prefix byte to sequence length. zero means
				949	illegal prefix. see RFC 2279 for details */
				950	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				951	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				952	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				953	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				954	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				955	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				956	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				957	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
				958	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				959	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				960	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				961	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
				962	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				963	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
				964	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
				965	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
				966	};
				967
				968	static
				969	int utf8_decoding_error(const char **source,
				970	Py_UNICODE **dest,
				971	const char *errors,
				972	const char *details)
				973	{
				974	if ((errors == NULL) \|\|
				975	(strcmp(errors,"strict") == 0)) {
				976	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	977	"UTF-8 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	978	details);
				979	return -1;
				980	}
				981	else if (strcmp(errors,"ignore") == 0) {
				982	(*source)++;
				983	return 0;
				984	}
				985	else if (strcmp(errors,"replace") == 0) {
				986	(*source)++;
				987	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				988	(*dest)++;
				989	return 0;
				990	}
				991	else {
				992	PyErr_Format(PyExc_ValueError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	993	"UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	994	errors);
				995	return -1;
				996	}
				997	}
				998
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	999	PyObject PyUnicode_DecodeUTF8(const char s,
				1000	int size,
				1001	const char *errors)
				1002	{
				1003	int n;
				1004	const char *e;
				1005	PyUnicodeObject *unicode;
				1006	Py_UNICODE *p;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1007	const char *errmsg = "";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1008
				1009	/* Note: size will always be longer than the resulting Unicode
				1010	character count */
				1011	unicode = _PyUnicode_New(size);
				1012	if (!unicode)
				1013	return NULL;
				1014	if (size == 0)
				1015	return (PyObject *)unicode;
				1016
				1017	/* Unpack UTF-8 encoded data */
				1018	p = unicode->str;
				1019	e = s + size;
				1020
				1021	while (s < e) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1022	Py_UCS4 ch = (unsigned char)*s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1023
				1024	if (ch < 0x80) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1025	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1026	s++;
				1027	continue;
				1028	}
				1029
				1030	n = utf8_code_length[ch];
				1031
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1032	if (s + n > e) {
				1033	errmsg = "unexpected end of data";
				1034	goto utf8Error;
				1035	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1036
				1037	switch (n) {
				1038
				1039	case 0:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1040	errmsg = "unexpected code byte";
				1041	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1042
				1043	case 1:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1044	errmsg = "internal error";
				1045	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1046
				1047	case 2:
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1048	if ((s[1] & 0xc0) != 0x80) {
				1049	errmsg = "invalid data";
				1050	goto utf8Error;
				1051	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1052	ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1053	if (ch < 0x80) {
				1054	errmsg = "illegal encoding";
				1055	goto utf8Error;
				1056	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1057	else
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1058	*p++ = (Py_UNICODE)ch;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1059	break;
				1060
				1061	case 3:
				1062	if ((s[1] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1063	(s[2] & 0xc0) != 0x80) {
				1064	errmsg = "invalid data";
				1065	goto utf8Error;
				1066	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1067	ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame]	1068	if (ch < 0x0800) {
				1069	/* Note: UTF-8 encodings of surrogates are considered
				1070	legal UTF-8 sequences;
				1071
				1072	XXX For wide builds (UCS-4) we should probably try
				1073	to recombine the surrogates into a single code
				1074	unit.
				1075	*/
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1076	errmsg = "illegal encoding";
				1077	goto utf8Error;
				1078	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1079	else
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame]	1080	*p++ = (Py_UNICODE)ch;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1081	break;
				1082
				1083	case 4:
				1084	if ((s[1] & 0xc0) != 0x80 \|\|
				1085	(s[2] & 0xc0) != 0x80 \|\|
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1086	(s[3] & 0xc0) != 0x80) {
				1087	errmsg = "invalid data";
				1088	goto utf8Error;
				1089	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1090	ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
				1091	((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
				1092	/* validate and convert to UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1093	if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame]	1094	byte encoding */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1095	\|\| (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame]	1096	UTF-16 */
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1097	{
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1098	errmsg = "illegal encoding";
				1099	goto utf8Error;
				1100	}
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1101	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1102	*p++ = (Py_UNICODE)ch;
				1103	#else
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1104	/* compute and append the two surrogates: */
				1105
				1106	/* translate from 10000..10FFFF to 0..FFFF */
				1107	ch -= 0x10000;
				1108
				1109	/* high surrogate = top 10 bits added to D800 */
				1110	*p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
				1111
				1112	/* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1113	*p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1114	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1115	break;
				1116
				1117	default:
				1118	/* Other sizes are only needed for UCS-4 */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1119	errmsg = "unsupported Unicode code range";
				1120	goto utf8Error;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1121	}
				1122	s += n;
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1123	continue;
				1124
				1125	utf8Error:
				1126	if (utf8_decoding_error(&s, &p, errors, errmsg))
				1127	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1128	}
				1129
				1130	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1131	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1132	goto onError;
				1133
				1134	return (PyObject *)unicode;
				1135
				1136	onError:
				1137	Py_DECREF(unicode);
				1138	return NULL;
				1139	}
				1140
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1141	/* Not used anymore, now that the encoder supports UTF-16
				1142	surrogates. */
Greg Stein	af36a3a	2000-07-17 09:04:43 +0000	[diff] [blame]	1143	#if 0
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1144	static
				1145	int utf8_encoding_error(const Py_UNICODE **source,
				1146	char **dest,
				1147	const char *errors,
				1148	const char *details)
				1149	{
				1150	if ((errors == NULL) \|\|
				1151	(strcmp(errors,"strict") == 0)) {
				1152	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1153	"UTF-8 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1154	details);
				1155	return -1;
				1156	}
				1157	else if (strcmp(errors,"ignore") == 0) {
				1158	return 0;
				1159	}
				1160	else if (strcmp(errors,"replace") == 0) {
				1161	**dest = '?';
				1162	(*dest)++;
				1163	return 0;
				1164	}
				1165	else {
				1166	PyErr_Format(PyExc_ValueError,
				1167	"UTF-8 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1168	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1169	errors);
				1170	return -1;
				1171	}
				1172	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1173	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1174
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1175	/* Allocation strategy: we default to Latin-1, then do one resize
				1176	whenever we hit an order boundary. The assumption is that
				1177	characters from higher orders usually occur often enough to warrant
				1178	this.
				1179	*/
				1180
Tim Peters	7e3d961	2002-04-21 03:26:37 +0000	[diff] [blame]	1181	PyObject *
				1182	PyUnicode_EncodeUTF8(const Py_UNICODE *s,
				1183	int size,
				1184	const char *errors)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1185	{
				1186	PyObject *v;
				1187	char *p;
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1188	int len;
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1189	int i = 0;
				1190	long overalloc = 2;
				1191	int nallocated; /* overalloc * size; PyString_ adds one more for \0 */
				1192
				1193	/* Short-cut for empty strings */
Marc-André Lemburg	bd3be8f	2002-02-07 11:33:49 +0000	[diff] [blame]	1194	if (size == 0)
				1195	return PyString_FromStringAndSize(NULL, 0);
				1196
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1197	nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
				1198	v = PyString_FromStringAndSize(NULL, nallocated);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1199	if (v == NULL)
				1200	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1201
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1202	p = PyString_AS_STRING(v);
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1203
				1204	while (i < size) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1205	Py_UCS4 ch = s[i++];
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1206
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1207	if (ch < 0x80)
				1208	/* Encode ASCII */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1209	*p++ = (char) ch;
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1210
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1211	else if (ch < 0x0800) {
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1212	/* Encode Latin-1 */
Marc-André Lemburg	dc724d6	2002-02-06 18:20:19 +0000	[diff] [blame]	1213	*p++ = (char)(0xc0 \| (ch >> 6));
				1214	*p++ = (char)(0x80 \| (ch & 0x3f));
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1215	}
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1216
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1217	else {
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1218	/* Encode UCS2 Unicode ordinals */
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1219	if (ch < 0x10000) {
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1220
				1221	/* Special case: check for high surrogate */
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1222	if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
				1223	Py_UCS4 ch2 = s[i];
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1224	/* Check for low surrogate and combine the two to
				1225	form a UCS4 value */
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1226	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1227	ch = ((ch - 0xD800) << 10 \| (ch2 - 0xDC00)) + 0x10000;
				1228	i++;
				1229	goto encodeUCS4;
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1230	}
Marc-André Lemburg	3688a88	2002-02-06 18:09:02 +0000	[diff] [blame]	1231	/* Fall through: handles isolated high surrogates */
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1232	}
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1233
				1234	if (overalloc < 3) {
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1235	len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int);
				1236	assert(len <= nallocated);
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1237	overalloc = 3;
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1238	nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
				1239	if (_PyString_Resize(&v, nallocated))
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1240	goto onError;
				1241	p = PyString_AS_STRING(v) + len;
				1242	}
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	1243	*p++ = (char)(0xe0 \| (ch >> 12));
Marc-André Lemburg	e7c6ee4	2002-02-06 18:18:03 +0000	[diff] [blame]	1244	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1245	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1246	continue;
Marc-André Lemburg	e7c6ee4	2002-02-06 18:18:03 +0000	[diff] [blame]	1247	}
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1248
				1249	/* Encode UCS4 Unicode ordinals */
				1250	encodeUCS4:
				1251	if (overalloc < 4) {
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1252	len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
				1253	assert(len <= nallocated);
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1254	overalloc = 4;
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1255	nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
				1256	if (_PyString_Resize(&v, nallocated))
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1257	goto onError;
				1258	p = PyString_AS_STRING(v) + len;
				1259	}
				1260	*p++ = (char)(0xf0 \| (ch >> 18));
				1261	*p++ = (char)(0x80 \| ((ch >> 12) & 0x3f));
				1262	*p++ = (char)(0x80 \| ((ch >> 6) & 0x3f));
				1263	*p++ = (char)(0x80 \| (ch & 0x3f));
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1264	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1265	}
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1266
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1267	*p = '\0';
Tim Peters	0eca65c	2002-04-21 17:28:06 +0000	[diff] [blame]	1268	len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
				1269	assert(len <= nallocated);
				1270	if (_PyString_Resize(&v, len))
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1271	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1272	return v;
Martin v. Löwis	2a7ff35	2002-04-21 09:59:45 +0000	[diff] [blame]	1273
				1274	onError:
				1275	Py_DECREF(v);
				1276	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1277	}
				1278
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1279	PyObject PyUnicode_AsUTF8String(PyObject unicode)
				1280	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1281	if (!PyUnicode_Check(unicode)) {
				1282	PyErr_BadArgument();
				1283	return NULL;
				1284	}
Barry Warsaw	2dd4abf	2000-08-18 06:58:15 +0000	[diff] [blame]	1285	return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
				1286	PyUnicode_GET_SIZE(unicode),
				1287	NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1288	}
				1289
				1290	/* --- UTF-16 Codec ------------------------------------------------------- */
				1291
				1292	static
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1293	int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1294	const char *errors,
				1295	const char *details)
				1296	{
				1297	if ((errors == NULL) \|\|
				1298	(strcmp(errors,"strict") == 0)) {
				1299	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1300	"UTF-16 decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1301	details);
				1302	return -1;
				1303	}
				1304	else if (strcmp(errors,"ignore") == 0) {
				1305	return 0;
				1306	}
				1307	else if (strcmp(errors,"replace") == 0) {
				1308	if (dest) {
				1309	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				1310	(*dest)++;
				1311	}
				1312	return 0;
				1313	}
				1314	else {
				1315	PyErr_Format(PyExc_ValueError,
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	1316	"UTF-16 decoding error; "
				1317	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1318	errors);
				1319	return -1;
				1320	}
				1321	}
				1322
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1323	PyObject *
				1324	PyUnicode_DecodeUTF16(const char *s,
				1325	int size,
				1326	const char *errors,
				1327	int *byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1328	{
				1329	PyUnicodeObject *unicode;
				1330	Py_UNICODE *p;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1331	const unsigned char q, e;
				1332	int bo = 0; /* assume native ordering by default */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1333	const char *errmsg = "";
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1334	/* Offsets from q for retrieving byte pairs in the right order. */
				1335	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1336	int ihi = 1, ilo = 0;
				1337	#else
				1338	int ihi = 0, ilo = 1;
				1339	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1340
				1341	/* size should be an even number */
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1342	if (size & 1) {
				1343	if (utf16_decoding_error(NULL, errors, "truncated data"))
				1344	return NULL;
				1345	--size; /* else ignore the oddball byte */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1346	}
				1347
				1348	/* Note: size will always be longer than the resulting Unicode
				1349	character count */
				1350	unicode = _PyUnicode_New(size);
				1351	if (!unicode)
				1352	return NULL;
				1353	if (size == 0)
				1354	return (PyObject *)unicode;
				1355
				1356	/* Unpack UTF-16 encoded data */
				1357	p = unicode->str;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1358	q = (unsigned char *)s;
				1359	e = q + size;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1360
				1361	if (byteorder)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1362	bo = *byteorder;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1363
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1364	/* Check for BOM marks (U+FEFF) in the input and adjust current
				1365	byte order setting accordingly. In native mode, the leading BOM
				1366	mark is skipped, in all other modes, it is copied to the output
				1367	stream as-is (giving a ZWNBSP character). */
				1368	if (bo == 0) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1369	const Py_UNICODE bom = (q[ihi] << 8) \| q[ilo];
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1370	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1371	if (bom == 0xFEFF) {
				1372	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1373	bo = -1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1374	}
				1375	else if (bom == 0xFFFE) {
				1376	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1377	bo = 1;
				1378	}
				1379	#else
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1380	if (bom == 0xFEFF) {
				1381	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1382	bo = 1;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1383	}
				1384	else if (bom == 0xFFFE) {
				1385	q += 2;
Marc-André Lemburg	489b56e	2001-05-21 20:30:15 +0000	[diff] [blame]	1386	bo = -1;
				1387	}
				1388	#endif
				1389	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1390
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1391	if (bo == -1) {
				1392	/* force LE */
				1393	ihi = 1;
				1394	ilo = 0;
				1395	}
				1396	else if (bo == 1) {
				1397	/* force BE */
				1398	ihi = 0;
				1399	ilo = 1;
				1400	}
				1401
				1402	while (q < e) {
				1403	Py_UNICODE ch = (q[ihi] << 8) \| q[ilo];
				1404	q += 2;
				1405
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1406	if (ch < 0xD800 \|\| ch > 0xDFFF) {
				1407	*p++ = ch;
				1408	continue;
				1409	}
				1410
				1411	/* UTF-16 code pair: */
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1412	if (q >= e) {
				1413	errmsg = "unexpected end of data";
				1414	goto utf16Error;
				1415	}
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1416	if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1417	Py_UNICODE ch2 = (q[ihi] << 8) \| q[ilo];
				1418	q += 2;
Martin v. Löwis	ac93bc2	2001-06-26 22:43:40 +0000	[diff] [blame]	1419	if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1420	#ifndef Py_UNICODE_WIDE
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1421	*p++ = ch;
				1422	*p++ = ch2;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1423	#else
				1424	*p++ = (((ch & 0x3FF)<<10) \| (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1425	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1426	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1427	}
				1428	else {
				1429	errmsg = "illegal UTF-16 surrogate";
				1430	goto utf16Error;
				1431	}
				1432
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1433	}
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1434	errmsg = "illegal encoding";
				1435	/* Fall through to report the error */
				1436
				1437	utf16Error:
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1438	if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg	9542f48	2000-07-17 18:23:13 +0000	[diff] [blame]	1439	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1440	}
				1441
				1442	if (byteorder)
				1443	*byteorder = bo;
				1444
				1445	/* Adjust length */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1446	if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1447	goto onError;
				1448
				1449	return (PyObject *)unicode;
				1450
				1451	onError:
				1452	Py_DECREF(unicode);
				1453	return NULL;
				1454	}
				1455
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1456	PyObject *
				1457	PyUnicode_EncodeUTF16(const Py_UNICODE *s,
				1458	int size,
				1459	const char *errors,
				1460	int byteorder)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1461	{
				1462	PyObject *v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1463	unsigned char *p;
				1464	int i, pairs;
				1465	/* Offsets from p for storing byte pairs in the right order. */
				1466	#ifdef BYTEORDER_IS_LITTLE_ENDIAN
				1467	int ihi = 1, ilo = 0;
				1468	#else
				1469	int ihi = 0, ilo = 1;
				1470	#endif
				1471
				1472	#define STORECHAR(CH) \
				1473	do { \
				1474	p[ihi] = ((CH) >> 8) & 0xff; \
				1475	p[ilo] = (CH) & 0xff; \
				1476	p += 2; \
				1477	} while(0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1478
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1479	for (i = pairs = 0; i < size; i++)
				1480	if (s[i] >= 0x10000)
				1481	pairs++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1482	v = PyString_FromStringAndSize(NULL,
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1483	2 * (size + pairs + (byteorder == 0)));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1484	if (v == NULL)
				1485	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1486
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1487	p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1488	if (byteorder == 0)
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1489	STORECHAR(0xFEFF);
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1490	if (size == 0)
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	1491	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1492
				1493	if (byteorder == -1) {
				1494	/* force LE */
				1495	ihi = 1;
				1496	ilo = 0;
				1497	}
				1498	else if (byteorder == 1) {
				1499	/* force BE */
				1500	ihi = 0;
				1501	ilo = 1;
				1502	}
				1503
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1504	while (size-- > 0) {
				1505	Py_UNICODE ch = *s++;
				1506	Py_UNICODE ch2 = 0;
				1507	if (ch >= 0x10000) {
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1508	ch2 = 0xDC00 \| ((ch-0x10000) & 0x3FF);
				1509	ch = 0xD800 \| ((ch-0x10000) >> 10);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1510	}
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1511	STORECHAR(ch);
				1512	if (ch2)
				1513	STORECHAR(ch2);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1514	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1515	return v;
Tim Peters	772747b	2001-08-09 22:21:55 +0000	[diff] [blame]	1516	#undef STORECHAR
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1517	}
				1518
				1519	PyObject PyUnicode_AsUTF16String(PyObject unicode)
				1520	{
				1521	if (!PyUnicode_Check(unicode)) {
				1522	PyErr_BadArgument();
				1523	return NULL;
				1524	}
				1525	return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
				1526	PyUnicode_GET_SIZE(unicode),
				1527	NULL,
				1528	0);
				1529	}
				1530
				1531	/* --- Unicode Escape Codec ----------------------------------------------- */
				1532
				1533	static
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1534	int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1535	const char *errors,
				1536	const char *details)
				1537	{
				1538	if ((errors == NULL) \|\|
				1539	(strcmp(errors,"strict") == 0)) {
				1540	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1541	"Unicode-Escape decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1542	details);
				1543	return -1;
				1544	}
				1545	else if (strcmp(errors,"ignore") == 0) {
				1546	return 0;
				1547	}
				1548	else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1549	**x = Py_UNICODE_REPLACEMENT_CHARACTER;
				1550	(*x)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1551	return 0;
				1552	}
				1553	else {
				1554	PyErr_Format(PyExc_ValueError,
				1555	"Unicode-Escape decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	1556	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1557	errors);
				1558	return -1;
				1559	}
				1560	}
				1561
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1562	static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg	0f774e3	2000-06-28 16:43:35 +0000	[diff] [blame]	1563
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1564	PyObject PyUnicode_DecodeUnicodeEscape(const char s,
				1565	int size,
				1566	const char *errors)
				1567	{
				1568	PyUnicodeObject *v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1569	Py_UNICODE p, buf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1570	const char *end;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1571	char* message;
				1572	Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
				1573
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1574	/* Escaped strings will always be longer than the resulting
				1575	Unicode string, so we start with size here and then reduce the
				1576	length after conversion to the true value. */
				1577	v = _PyUnicode_New(size);
				1578	if (v == NULL)
				1579	goto onError;
				1580	if (size == 0)
				1581	return (PyObject *)v;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1582
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1583	p = buf = PyUnicode_AS_UNICODE(v);
				1584	end = s + size;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1585
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1586	while (s < end) {
				1587	unsigned char c;
Marc-André Lemburg	063e0cb	2000-07-07 11:27:45 +0000	[diff] [blame]	1588	Py_UNICODE x;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1589	int i, digits;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1590
				1591	/* Non-escape characters are interpreted as Unicode ordinals */
				1592	if (*s != '\\') {
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1593	p++ = (unsigned char) s++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1594	continue;
				1595	}
				1596
				1597	/* \ - Escapes */
				1598	s++;
				1599	switch (*s++) {
				1600
				1601	/* \x escapes */
				1602	case '\n': break;
				1603	case '\\': *p++ = '\\'; break;
				1604	case '\'': *p++ = '\''; break;
				1605	case '\"': *p++ = '\"'; break;
				1606	case 'b': *p++ = '\b'; break;
				1607	case 'f': p++ = '\014'; break; / FF */
				1608	case 't': *p++ = '\t'; break;
				1609	case 'n': *p++ = '\n'; break;
				1610	case 'r': *p++ = '\r'; break;
				1611	case 'v': p++ = '\013'; break; / VT */
				1612	case 'a': p++ = '\007'; break; / BEL, not classic C */
				1613
				1614	/* \OOO (octal) escapes */
				1615	case '0': case '1': case '2': case '3':
				1616	case '4': case '5': case '6': case '7':
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1617	x = s[-1] - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1618	if ('0' <= s && s <= '7') {
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1619	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1620	if ('0' <= s && s <= '7')
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1621	x = (x<<3) + *s++ - '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1622	}
Guido van Rossum	0e4f657	2000-05-01 21:27:20 +0000	[diff] [blame]	1623	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1624	break;
				1625
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1626	/* hex escapes */
				1627	/* \xXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1628	case 'x':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1629	digits = 2;
				1630	message = "truncated \\xXX escape";
				1631	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1632
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1633	/* \uXXXX */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1634	case 'u':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1635	digits = 4;
				1636	message = "truncated \\uXXXX escape";
				1637	goto hexescape;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1638
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1639	/* \UXXXXXXXX */
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1640	case 'U':
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1641	digits = 8;
				1642	message = "truncated \\UXXXXXXXX escape";
				1643	hexescape:
				1644	chr = 0;
				1645	for (i = 0; i < digits; i++) {
				1646	c = (unsigned char) s[i];
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1647	if (!isxdigit(c)) {
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1648	if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1649	goto onError;
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1650	chr = 0xffffffff;
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1651	i++;
				1652	break;
				1653	}
				1654	chr = (chr<<4) & ~0xF;
				1655	if (c >= '0' && c <= '9')
				1656	chr += c - '0';
				1657	else if (c >= 'a' && c <= 'f')
				1658	chr += 10 + c - 'a';
				1659	else
				1660	chr += 10 + c - 'A';
				1661	}
				1662	s += i;
Walter Dörwald	8c07722	2002-03-25 11:16:18 +0000	[diff] [blame]	1663	if (chr == 0xffffffff)
				1664	/* _decoding_error will have already written into the
				1665	target buffer. */
				1666	break;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1667	store:
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1668	/* when we get here, chr is a 32-bit unicode character */
				1669	if (chr <= 0xffff)
				1670	/* UCS-2 character */
				1671	*p++ = (Py_UNICODE) chr;
				1672	else if (chr <= 0x10ffff) {
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1673	/* UCS-4 character. Either store directly, or as
Walter Dörwald	8c07722	2002-03-25 11:16:18 +0000	[diff] [blame]	1674	surrogate pair. */
Fredrik Lundh	8f45585	2001-06-27 18:59:43 +0000	[diff] [blame]	1675	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1676	*p++ = chr;
				1677	#else
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1678	chr -= 0x10000L;
				1679	*p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	1680	*p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1681	#endif
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1682	} else {
				1683	if (unicodeescape_decoding_error(
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1684	&p, errors,
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1685	"illegal Unicode character")
Fredrik Lundh	df84675	2000-09-03 11:29:49 +0000	[diff] [blame]	1686	)
				1687	goto onError;
				1688	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1689	break;
				1690
				1691	/* \N{name} */
				1692	case 'N':
				1693	message = "malformed \\N character escape";
				1694	if (ucnhash_CAPI == NULL) {
				1695	/* load the unicode data module */
				1696	PyObject m, v;
				1697	m = PyImport_ImportModule("unicodedata");
				1698	if (m == NULL)
				1699	goto ucnhashError;
				1700	v = PyObject_GetAttrString(m, "ucnhash_CAPI");
				1701	Py_DECREF(m);
				1702	if (v == NULL)
				1703	goto ucnhashError;
				1704	ucnhash_CAPI = PyCObject_AsVoidPtr(v);
				1705	Py_DECREF(v);
				1706	if (ucnhash_CAPI == NULL)
				1707	goto ucnhashError;
				1708	}
				1709	if (*s == '{') {
				1710	const char *start = s+1;
				1711	/* look for the closing brace */
				1712	while (*s != '}' && s < end)
				1713	s++;
				1714	if (s > start && s < end && *s == '}') {
				1715	/* found a name. look it up in the unicode database */
				1716	message = "unknown Unicode character name";
				1717	s++;
				1718	if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
				1719	goto store;
				1720	}
				1721	}
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1722	if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1723	goto onError;
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1724	break;
				1725
				1726	default:
Walter Dörwald	8c07722	2002-03-25 11:16:18 +0000	[diff] [blame]	1727	if (s > end) {
				1728	if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
				1729	goto onError;
				1730	}
				1731	else {
				1732	*p++ = '\\';
				1733	*p++ = (unsigned char)s[-1];
				1734	}
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1735	break;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1736	}
				1737	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1738	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald	8c07722	2002-03-25 11:16:18 +0000	[diff] [blame]	1739	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1740	return (PyObject *)v;
Walter Dörwald	8c07722	2002-03-25 11:16:18 +0000	[diff] [blame]	1741
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1742	ucnhashError:
Fredrik Lundh	06d1268	2001-01-24 07:59:11 +0000	[diff] [blame]	1743	PyErr_SetString(
				1744	PyExc_UnicodeError,
				1745	"\\N escapes not supported (can't load unicodedata module)"
				1746	);
Fredrik Lundh	f605606	2001-01-20 11:15:25 +0000	[diff] [blame]	1747	return NULL;
				1748
Fredrik Lundh	ccc7473	2001-02-18 22:13:49 +0000	[diff] [blame]	1749	onError:
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1750	Py_XDECREF(v);
				1751	return NULL;
				1752	}
				1753
				1754	/* Return a Unicode-Escape string version of the Unicode object.
				1755
				1756	If quotes is true, the string is enclosed in u"" or u'' quotes as
				1757	appropriate.
				1758
				1759	*/
				1760
Barry Warsaw	51ac580	2000-03-20 16:36:48 +0000	[diff] [blame]	1761	static const Py_UNICODE findchar(const Py_UNICODE s,
				1762	int size,
				1763	Py_UNICODE ch);
				1764
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1765	static
				1766	PyObject unicodeescape_string(const Py_UNICODE s,
				1767	int size,
				1768	int quotes)
				1769	{
				1770	PyObject *repr;
				1771	char *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1772
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1773	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1774
				1775	repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
				1776	if (repr == NULL)
				1777	return NULL;
				1778
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1779	p = PyString_AS_STRING(repr);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1780
				1781	if (quotes) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1782	*p++ = 'u';
				1783	*p++ = (findchar(s, size, '\'') &&
				1784	!findchar(s, size, '"')) ? '"' : '\'';
				1785	}
				1786	while (size-- > 0) {
				1787	Py_UNICODE ch = *s++;
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1788
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1789	/* Escape quotes */
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1790	if (quotes &&
				1791	(ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] \|\| ch == '\\')) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1792	*p++ = '\\';
				1793	*p++ = (char) ch;
Guido van Rossum	ad9744a	2001-09-21 15:38:17 +0000	[diff] [blame]	1794	continue;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1795	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1796
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1797	#ifdef Py_UNICODE_WIDE
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1798	/* Map 21-bit characters to '\U00xxxxxx' */
				1799	else if (ch >= 0x10000) {
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1800	int offset = p - PyString_AS_STRING(repr);
				1801
				1802	/* Resize the string if necessary */
				1803	if (offset + 12 > PyString_GET_SIZE(repr)) {
				1804	if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
				1805	goto onError;
				1806	p = PyString_AS_STRING(repr) + offset;
				1807	}
				1808
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1809	*p++ = '\\';
				1810	*p++ = 'U';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1811	*p++ = hexdigit[(ch >> 28) & 0x0000000F];
				1812	*p++ = hexdigit[(ch >> 24) & 0x0000000F];
				1813	*p++ = hexdigit[(ch >> 20) & 0x0000000F];
				1814	*p++ = hexdigit[(ch >> 16) & 0x0000000F];
				1815	*p++ = hexdigit[(ch >> 12) & 0x0000000F];
				1816	*p++ = hexdigit[(ch >> 8) & 0x0000000F];
				1817	*p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1818	*p++ = hexdigit[ch & 0x0000000F];
				1819	continue;
Martin v. Löwis	0ba70cc	2001-06-26 22:22:37 +0000	[diff] [blame]	1820	}
Guido van Rossum	0d42e0c	2001-07-20 16:36:21 +0000	[diff] [blame]	1821	#endif
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1822	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
				1823	else if (ch >= 0xD800 && ch < 0xDC00) {
				1824	Py_UNICODE ch2;
				1825	Py_UCS4 ucs;
				1826
				1827	ch2 = *s++;
				1828	size--;
				1829	if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
				1830	ucs = (((ch & 0x03FF) << 10) \| (ch2 & 0x03FF)) + 0x00010000;
				1831	*p++ = '\\';
				1832	*p++ = 'U';
				1833	*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
				1834	*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
				1835	*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
				1836	*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
				1837	*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
				1838	*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
				1839	*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
				1840	*p++ = hexdigit[ucs & 0x0000000F];
				1841	continue;
				1842	}
				1843	/* Fall through: isolated surrogates are copied as-is */
				1844	s--;
				1845	size++;
				1846	}
				1847
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1848	/* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1849	if (ch >= 256) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1850	*p++ = '\\';
				1851	*p++ = 'u';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1852	*p++ = hexdigit[(ch >> 12) & 0x000F];
				1853	*p++ = hexdigit[(ch >> 8) & 0x000F];
				1854	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1855	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1856	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1857
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1858	/* Map special whitespace to '\t', \n', '\r' */
				1859	else if (ch == '\t') {
				1860	*p++ = '\\';
				1861	*p++ = 't';
				1862	}
				1863	else if (ch == '\n') {
				1864	*p++ = '\\';
				1865	*p++ = 'n';
				1866	}
				1867	else if (ch == '\r') {
				1868	*p++ = '\\';
				1869	*p++ = 'r';
				1870	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1871
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1872	/* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg	11326de	2001-11-28 12:56:20 +0000	[diff] [blame]	1873	else if (ch < ' ' \|\| ch >= 0x7F) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1874	*p++ = '\\';
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	1875	*p++ = 'x';
Marc-André Lemburg	6c6bfb7	2001-07-20 17:39:11 +0000	[diff] [blame]	1876	*p++ = hexdigit[(ch >> 4) & 0x000F];
				1877	*p++ = hexdigit[ch & 0x000F];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1878	}
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1879
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1880	/* Copy everything else as-is */
				1881	else
				1882	*p++ = (char) ch;
				1883	}
				1884	if (quotes)
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1885	*p++ = PyString_AS_STRING(repr)[1];
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1886
				1887	*p = '\0';
Marc-André Lemburg	80d1dd5	2001-07-25 16:05:59 +0000	[diff] [blame]	1888	if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1889	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1890
				1891	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1892
				1893	onError:
				1894	Py_DECREF(repr);
				1895	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1896	}
				1897
				1898	PyObject PyUnicode_EncodeUnicodeEscape(const Py_UNICODE s,
				1899	int size)
				1900	{
				1901	return unicodeescape_string(s, size, 0);
				1902	}
				1903
				1904	PyObject PyUnicode_AsUnicodeEscapeString(PyObject unicode)
				1905	{
				1906	if (!PyUnicode_Check(unicode)) {
				1907	PyErr_BadArgument();
				1908	return NULL;
				1909	}
				1910	return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				1911	PyUnicode_GET_SIZE(unicode));
				1912	}
				1913
				1914	/* --- Raw Unicode Escape Codec ------------------------------------------- */
				1915
				1916	PyObject PyUnicode_DecodeRawUnicodeEscape(const char s,
				1917	int size,
				1918	const char *errors)
				1919	{
				1920	PyUnicodeObject *v;
				1921	Py_UNICODE p, buf;
				1922	const char *end;
				1923	const char *bs;
				1924
				1925	/* Escaped strings will always be longer than the resulting
				1926	Unicode string, so we start with size here and then reduce the
				1927	length after conversion to the true value. */
				1928	v = _PyUnicode_New(size);
				1929	if (v == NULL)
				1930	goto onError;
				1931	if (size == 0)
				1932	return (PyObject *)v;
				1933	p = buf = PyUnicode_AS_UNICODE(v);
				1934	end = s + size;
				1935	while (s < end) {
				1936	unsigned char c;
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1937	Py_UCS4 x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1938	int i;
				1939
				1940	/* Non-escape characters are interpreted as Unicode ordinals */
				1941	if (*s != '\\') {
				1942	p++ = (unsigned char)s++;
				1943	continue;
				1944	}
				1945
				1946	/* \u-escapes are only interpreted iff the number of leading
				1947	backslashes if odd */
				1948	bs = s;
				1949	for (;s < end;) {
				1950	if (*s != '\\')
				1951	break;
				1952	p++ = (unsigned char)s++;
				1953	}
				1954	if (((s - bs) & 1) == 0 \|\|
				1955	s >= end \|\|
				1956	*s != 'u') {
				1957	continue;
				1958	}
				1959	p--;
				1960	s++;
				1961
				1962	/* \uXXXX with 4 hex digits */
				1963	for (x = 0, i = 0; i < 4; i++) {
				1964	c = (unsigned char)s[i];
				1965	if (!isxdigit(c)) {
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1966	if (unicodeescape_decoding_error(&p, errors,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1967	"truncated \\uXXXX"))
				1968	goto onError;
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1969	x = 0xffffffff;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1970	i++;
				1971	break;
				1972	}
				1973	x = (x<<4) & ~0xF;
				1974	if (c >= '0' && c <= '9')
				1975	x += c - '0';
				1976	else if (c >= 'a' && c <= 'f')
				1977	x += 10 + c - 'a';
				1978	else
				1979	x += 10 + c - 'A';
				1980	}
				1981	s += i;
Martin v. Löwis	047c05e	2002-03-21 08:55:28 +0000	[diff] [blame]	1982	if (x != 0xffffffff)
				1983	*p++ = x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1984	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	1985	if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	1986	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	1987	return (PyObject *)v;
				1988
				1989	onError:
				1990	Py_XDECREF(v);
				1991	return NULL;
				1992	}
				1993
				1994	PyObject PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE s,
				1995	int size)
				1996	{
				1997	PyObject *repr;
				1998	char *p;
				1999	char *q;
				2000
Ka-Ping Yee	fa004ad	2001-01-24 17:19:08 +0000	[diff] [blame]	2001	static const char *hexdigit = "0123456789abcdef";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2002
				2003	repr = PyString_FromStringAndSize(NULL, 6 * size);
				2004	if (repr == NULL)
				2005	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2006	if (size == 0)
				2007	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2008
				2009	p = q = PyString_AS_STRING(repr);
				2010	while (size-- > 0) {
				2011	Py_UNICODE ch = *s++;
				2012	/* Map 16-bit characters to '\uxxxx' */
				2013	if (ch >= 256) {
				2014	*p++ = '\\';
				2015	*p++ = 'u';
				2016	*p++ = hexdigit[(ch >> 12) & 0xf];
				2017	*p++ = hexdigit[(ch >> 8) & 0xf];
				2018	*p++ = hexdigit[(ch >> 4) & 0xf];
				2019	*p++ = hexdigit[ch & 15];
				2020	}
				2021	/* Copy everything else as-is */
				2022	else
				2023	*p++ = (char) ch;
				2024	}
				2025	*p = '\0';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2026	if (_PyString_Resize(&repr, p - q))
				2027	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2028
				2029	return repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2030
				2031	onError:
				2032	Py_DECREF(repr);
				2033	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2034	}
				2035
				2036	PyObject PyUnicode_AsRawUnicodeEscapeString(PyObject unicode)
				2037	{
				2038	if (!PyUnicode_Check(unicode)) {
				2039	PyErr_BadArgument();
				2040	return NULL;
				2041	}
				2042	return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
				2043	PyUnicode_GET_SIZE(unicode));
				2044	}
				2045
				2046	/* --- Latin-1 Codec ------------------------------------------------------ */
				2047
				2048	PyObject PyUnicode_DecodeLatin1(const char s,
				2049	int size,
				2050	const char *errors)
				2051	{
				2052	PyUnicodeObject *v;
				2053	Py_UNICODE *p;
				2054
				2055	/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2056	if (size == 1 && (unsigned char)s < 256) {
				2057	Py_UNICODE r = (unsigned char)s;
				2058	return PyUnicode_FromUnicode(&r, 1);
				2059	}
				2060
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2061	v = _PyUnicode_New(size);
				2062	if (v == NULL)
				2063	goto onError;
				2064	if (size == 0)
				2065	return (PyObject *)v;
				2066	p = PyUnicode_AS_UNICODE(v);
				2067	while (size-- > 0)
				2068	p++ = (unsigned char)s++;
				2069	return (PyObject *)v;
				2070
				2071	onError:
				2072	Py_XDECREF(v);
				2073	return NULL;
				2074	}
				2075
				2076	static
				2077	int latin1_encoding_error(const Py_UNICODE **source,
				2078	char **dest,
				2079	const char *errors,
				2080	const char *details)
				2081	{
				2082	if ((errors == NULL) \|\|
				2083	(strcmp(errors,"strict") == 0)) {
				2084	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2085	"Latin-1 encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2086	details);
				2087	return -1;
				2088	}
				2089	else if (strcmp(errors,"ignore") == 0) {
				2090	return 0;
				2091	}
				2092	else if (strcmp(errors,"replace") == 0) {
				2093	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2094	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2095	return 0;
				2096	}
				2097	else {
				2098	PyErr_Format(PyExc_ValueError,
				2099	"Latin-1 encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2100	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2101	errors);
				2102	return -1;
				2103	}
				2104	}
				2105
				2106	PyObject PyUnicode_EncodeLatin1(const Py_UNICODE p,
				2107	int size,
				2108	const char *errors)
				2109	{
				2110	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2111	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2112
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2113	repr = PyString_FromStringAndSize(NULL, size);
				2114	if (repr == NULL)
				2115	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2116	if (size == 0)
				2117	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2118
				2119	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2120	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2121	while (size-- > 0) {
				2122	Py_UNICODE ch = *p++;
				2123	if (ch >= 256) {
				2124	if (latin1_encoding_error(&p, &s, errors,
				2125	"ordinal not in range(256)"))
				2126	goto onError;
				2127	}
				2128	else
				2129	*s++ = (char)ch;
				2130	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2131	/* Resize if error handling skipped some characters */
				2132	if (s - start < PyString_GET_SIZE(repr))
				2133	if (_PyString_Resize(&repr, s - start))
				2134	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2135	return repr;
				2136
				2137	onError:
				2138	Py_DECREF(repr);
				2139	return NULL;
				2140	}
				2141
				2142	PyObject PyUnicode_AsLatin1String(PyObject unicode)
				2143	{
				2144	if (!PyUnicode_Check(unicode)) {
				2145	PyErr_BadArgument();
				2146	return NULL;
				2147	}
				2148	return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
				2149	PyUnicode_GET_SIZE(unicode),
				2150	NULL);
				2151	}
				2152
				2153	/* --- 7-bit ASCII Codec -------------------------------------------------- */
				2154
				2155	static
				2156	int ascii_decoding_error(const char **source,
				2157	Py_UNICODE **dest,
				2158	const char *errors,
				2159	const char *details)
				2160	{
				2161	if ((errors == NULL) \|\|
				2162	(strcmp(errors,"strict") == 0)) {
				2163	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2164	"ASCII decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2165	details);
				2166	return -1;
				2167	}
				2168	else if (strcmp(errors,"ignore") == 0) {
				2169	return 0;
				2170	}
				2171	else if (strcmp(errors,"replace") == 0) {
				2172	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2173	(*dest)++;
				2174	return 0;
				2175	}
				2176	else {
				2177	PyErr_Format(PyExc_ValueError,
				2178	"ASCII decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2179	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2180	errors);
				2181	return -1;
				2182	}
				2183	}
				2184
				2185	PyObject PyUnicode_DecodeASCII(const char s,
				2186	int size,
				2187	const char *errors)
				2188	{
				2189	PyUnicodeObject *v;
				2190	Py_UNICODE *p;
				2191
				2192	/* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2193	if (size == 1 && (unsigned char)s < 128) {
				2194	Py_UNICODE r = (unsigned char)s;
				2195	return PyUnicode_FromUnicode(&r, 1);
				2196	}
				2197
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2198	v = _PyUnicode_New(size);
				2199	if (v == NULL)
				2200	goto onError;
				2201	if (size == 0)
				2202	return (PyObject *)v;
				2203	p = PyUnicode_AS_UNICODE(v);
				2204	while (size-- > 0) {
				2205	register unsigned char c;
				2206
				2207	c = (unsigned char)*s++;
				2208	if (c < 128)
				2209	*p++ = c;
				2210	else if (ascii_decoding_error(&s, &p, errors,
				2211	"ordinal not in range(128)"))
				2212	goto onError;
				2213	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2214	if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2215	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2216	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2217	return (PyObject *)v;
				2218
				2219	onError:
				2220	Py_XDECREF(v);
				2221	return NULL;
				2222	}
				2223
				2224	static
				2225	int ascii_encoding_error(const Py_UNICODE **source,
				2226	char **dest,
				2227	const char *errors,
				2228	const char *details)
				2229	{
				2230	if ((errors == NULL) \|\|
				2231	(strcmp(errors,"strict") == 0)) {
				2232	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2233	"ASCII encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2234	details);
				2235	return -1;
				2236	}
				2237	else if (strcmp(errors,"ignore") == 0) {
				2238	return 0;
				2239	}
				2240	else if (strcmp(errors,"replace") == 0) {
				2241	**dest = '?';
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2242	(*dest)++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2243	return 0;
				2244	}
				2245	else {
				2246	PyErr_Format(PyExc_ValueError,
				2247	"ASCII encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2248	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2249	errors);
				2250	return -1;
				2251	}
				2252	}
				2253
				2254	PyObject PyUnicode_EncodeASCII(const Py_UNICODE p,
				2255	int size,
				2256	const char *errors)
				2257	{
				2258	PyObject *repr;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2259	char s, start;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2260
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2261	repr = PyString_FromStringAndSize(NULL, size);
				2262	if (repr == NULL)
				2263	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2264	if (size == 0)
				2265	return repr;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2266
				2267	s = PyString_AS_STRING(repr);
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2268	start = s;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2269	while (size-- > 0) {
				2270	Py_UNICODE ch = *p++;
				2271	if (ch >= 128) {
				2272	if (ascii_encoding_error(&p, &s, errors,
				2273	"ordinal not in range(128)"))
				2274	goto onError;
				2275	}
				2276	else
				2277	*s++ = (char)ch;
				2278	}
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2279	/* Resize if error handling skipped some characters */
				2280	if (s - start < PyString_GET_SIZE(repr))
				2281	if (_PyString_Resize(&repr, s - start))
				2282	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2283	return repr;
				2284
				2285	onError:
				2286	Py_DECREF(repr);
				2287	return NULL;
				2288	}
				2289
				2290	PyObject PyUnicode_AsASCIIString(PyObject unicode)
				2291	{
				2292	if (!PyUnicode_Check(unicode)) {
				2293	PyErr_BadArgument();
				2294	return NULL;
				2295	}
				2296	return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
				2297	PyUnicode_GET_SIZE(unicode),
				2298	NULL);
				2299	}
				2300
Fredrik Lundh	3083163	2001-06-26 15:11:00 +0000	[diff] [blame]	2301	#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2302
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2303	/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2304
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2305	PyObject PyUnicode_DecodeMBCS(const char s,
				2306	int size,
				2307	const char *errors)
				2308	{
				2309	PyUnicodeObject *v;
				2310	Py_UNICODE *p;
				2311
				2312	/* First get the size of the result */
				2313	DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2314	if (size > 0 && usize==0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2315	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2316
				2317	v = _PyUnicode_New(usize);
				2318	if (v == NULL)
				2319	return NULL;
				2320	if (usize == 0)
				2321	return (PyObject *)v;
				2322	p = PyUnicode_AS_UNICODE(v);
				2323	if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
				2324	Py_DECREF(v);
				2325	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2326	}
				2327
				2328	return (PyObject *)v;
				2329	}
				2330
				2331	PyObject PyUnicode_EncodeMBCS(const Py_UNICODE p,
				2332	int size,
				2333	const char *errors)
				2334	{
				2335	PyObject *repr;
				2336	char *s;
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2337	DWORD mbcssize;
				2338
				2339	/* If there are no characters, bail now! */
				2340	if (size==0)
				2341	return PyString_FromString("");
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2342
				2343	/* First get the size of the result */
Guido van Rossum	03e29f1	2000-05-04 15:52:20 +0000	[diff] [blame]	2344	mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2345	if (mbcssize==0)
				2346	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2347
				2348	repr = PyString_FromStringAndSize(NULL, mbcssize);
				2349	if (repr == NULL)
				2350	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2351	if (mbcssize == 0)
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2352	return repr;
				2353
				2354	/* Do the conversion */
				2355	s = PyString_AS_STRING(repr);
				2356	if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
				2357	Py_DECREF(repr);
				2358	return PyErr_SetFromWindowsErrWithFilename(0, NULL);
				2359	}
				2360	return repr;
				2361	}
Guido van Rossum	2ea3e14	2000-03-31 17:24:09 +0000	[diff] [blame]	2362
Guido van Rossum	b7a40ba	2000-03-28 02:01:52 +0000	[diff] [blame]	2363	#endif /* MS_WIN32 */
				2364
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2365	/* --- Character Mapping Codec -------------------------------------------- */
				2366
				2367	static
				2368	int charmap_decoding_error(const char **source,
				2369	Py_UNICODE **dest,
				2370	const char *errors,
				2371	const char *details)
				2372	{
				2373	if ((errors == NULL) \|\|
				2374	(strcmp(errors,"strict") == 0)) {
				2375	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2376	"charmap decoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2377	details);
				2378	return -1;
				2379	}
				2380	else if (strcmp(errors,"ignore") == 0) {
				2381	return 0;
				2382	}
				2383	else if (strcmp(errors,"replace") == 0) {
				2384	**dest = Py_UNICODE_REPLACEMENT_CHARACTER;
				2385	(*dest)++;
				2386	return 0;
				2387	}
				2388	else {
				2389	PyErr_Format(PyExc_ValueError,
				2390	"charmap decoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2391	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2392	errors);
				2393	return -1;
				2394	}
				2395	}
				2396
				2397	PyObject PyUnicode_DecodeCharmap(const char s,
				2398	int size,
				2399	PyObject *mapping,
				2400	const char *errors)
				2401	{
				2402	PyUnicodeObject *v;
				2403	Py_UNICODE *p;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2404	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2405
				2406	/* Default to Latin-1 */
				2407	if (mapping == NULL)
				2408	return PyUnicode_DecodeLatin1(s, size, errors);
				2409
				2410	v = _PyUnicode_New(size);
				2411	if (v == NULL)
				2412	goto onError;
				2413	if (size == 0)
				2414	return (PyObject *)v;
				2415	p = PyUnicode_AS_UNICODE(v);
				2416	while (size-- > 0) {
				2417	unsigned char ch = *s++;
				2418	PyObject w, x;
				2419
				2420	/* Get mapping (char ordinal -> integer, Unicode char or None) */
				2421	w = PyInt_FromLong((long)ch);
				2422	if (w == NULL)
				2423	goto onError;
				2424	x = PyObject_GetItem(mapping, w);
				2425	Py_DECREF(w);
				2426	if (x == NULL) {
				2427	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2428	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2429	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2430	x = Py_None;
				2431	Py_INCREF(x);
				2432	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2433	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2434	}
				2435
				2436	/* Apply mapping */
				2437	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2438	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2439	if (value < 0 \|\| value > 65535) {
				2440	PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg	07ceb67	2000-06-10 09:32:51 +0000	[diff] [blame]	2441	"character mapping must be in range(65536)");
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2442	Py_DECREF(x);
				2443	goto onError;
				2444	}
				2445	*p++ = (Py_UNICODE)value;
				2446	}
				2447	else if (x == Py_None) {
				2448	/* undefined mapping */
				2449	if (charmap_decoding_error(&s, &p, errors,
				2450	"character maps to <undefined>")) {
				2451	Py_DECREF(x);
				2452	goto onError;
				2453	}
				2454	}
				2455	else if (PyUnicode_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2456	int targetsize = PyUnicode_GET_SIZE(x);
				2457
				2458	if (targetsize == 1)
				2459	/* 1-1 mapping */
				2460	p++ = PyUnicode_AS_UNICODE(x);
				2461
				2462	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2463	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2464	if (targetsize > extrachars) {
				2465	/* resize first */
				2466	int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
				2467	int needed = (targetsize - extrachars) + \
				2468	(targetsize << 2);
				2469	extrachars += needed;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2470	if (_PyUnicode_Resize(&v,
				2471	PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2472	Py_DECREF(x);
				2473	goto onError;
				2474	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2475	p = PyUnicode_AS_UNICODE(v) + oldpos;
				2476	}
				2477	Py_UNICODE_COPY(p,
				2478	PyUnicode_AS_UNICODE(x),
				2479	targetsize);
				2480	p += targetsize;
				2481	extrachars -= targetsize;
				2482	}
				2483	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2484	}
				2485	else {
				2486	/* wrong return value */
				2487	PyErr_SetString(PyExc_TypeError,
				2488	"character mapping must return integer, None or unicode");
				2489	Py_DECREF(x);
				2490	goto onError;
				2491	}
				2492	Py_DECREF(x);
				2493	}
				2494	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2495	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2496	goto onError;
				2497	return (PyObject *)v;
				2498
				2499	onError:
				2500	Py_XDECREF(v);
				2501	return NULL;
				2502	}
				2503
				2504	static
				2505	int charmap_encoding_error(const Py_UNICODE **source,
				2506	char **dest,
				2507	const char *errors,
				2508	const char *details)
				2509	{
				2510	if ((errors == NULL) \|\|
				2511	(strcmp(errors,"strict") == 0)) {
				2512	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2513	"charmap encoding error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2514	details);
				2515	return -1;
				2516	}
				2517	else if (strcmp(errors,"ignore") == 0) {
				2518	return 0;
				2519	}
				2520	else if (strcmp(errors,"replace") == 0) {
				2521	**dest = '?';
				2522	(*dest)++;
				2523	return 0;
				2524	}
				2525	else {
				2526	PyErr_Format(PyExc_ValueError,
				2527	"charmap encoding error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2528	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2529	errors);
				2530	return -1;
				2531	}
				2532	}
				2533
				2534	PyObject PyUnicode_EncodeCharmap(const Py_UNICODE p,
				2535	int size,
				2536	PyObject *mapping,
				2537	const char *errors)
				2538	{
				2539	PyObject *v;
				2540	char *s;
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2541	int extrachars = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2542
				2543	/* Default to Latin-1 */
				2544	if (mapping == NULL)
				2545	return PyUnicode_EncodeLatin1(p, size, errors);
				2546
				2547	v = PyString_FromStringAndSize(NULL, size);
				2548	if (v == NULL)
				2549	return NULL;
Marc-André Lemburg	b752077	2000-08-14 11:29:19 +0000	[diff] [blame]	2550	if (size == 0)
				2551	return v;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2552	s = PyString_AS_STRING(v);
				2553	while (size-- > 0) {
				2554	Py_UNICODE ch = *p++;
				2555	PyObject w, x;
				2556
				2557	/* Get mapping (Unicode ordinal -> string char, integer or None) */
				2558	w = PyInt_FromLong((long)ch);
				2559	if (w == NULL)
				2560	goto onError;
				2561	x = PyObject_GetItem(mapping, w);
				2562	Py_DECREF(w);
				2563	if (x == NULL) {
				2564	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2565	/* No mapping found means: mapping is undefined. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2566	PyErr_Clear();
Marc-André Lemburg	a866df8	2001-01-03 21:29:14 +0000	[diff] [blame]	2567	x = Py_None;
				2568	Py_INCREF(x);
				2569	} else
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2570	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2571	}
				2572
				2573	/* Apply mapping */
				2574	if (PyInt_Check(x)) {
Marc-André Lemburg	85cc4d8	2000-07-06 19:43:31 +0000	[diff] [blame]	2575	long value = PyInt_AS_LONG(x);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2576	if (value < 0 \|\| value > 255) {
				2577	PyErr_SetString(PyExc_TypeError,
				2578	"character mapping must be in range(256)");
				2579	Py_DECREF(x);
				2580	goto onError;
				2581	}
				2582	*s++ = (char)value;
				2583	}
				2584	else if (x == Py_None) {
				2585	/* undefined mapping */
				2586	if (charmap_encoding_error(&p, &s, errors,
				2587	"character maps to <undefined>")) {
				2588	Py_DECREF(x);
				2589	goto onError;
				2590	}
				2591	}
				2592	else if (PyString_Check(x)) {
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2593	int targetsize = PyString_GET_SIZE(x);
				2594
				2595	if (targetsize == 1)
				2596	/* 1-1 mapping */
				2597	s++ = PyString_AS_STRING(x);
				2598
				2599	else if (targetsize > 1) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2600	/* 1-n mapping */
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2601	if (targetsize > extrachars) {
				2602	/* resize first */
				2603	int oldpos = (int)(s - PyString_AS_STRING(v));
				2604	int needed = (targetsize - extrachars) + \
				2605	(targetsize << 2);
				2606	extrachars += needed;
				2607	if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2608	Py_DECREF(x);
				2609	goto onError;
				2610	}
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2611	s = PyString_AS_STRING(v) + oldpos;
				2612	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2613	memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburg	ec233e5	2001-01-06 14:59:58 +0000	[diff] [blame]	2614	s += targetsize;
				2615	extrachars -= targetsize;
				2616	}
				2617	/* 1-0 mapping: skip the character */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2618	}
				2619	else {
				2620	/* wrong return value */
				2621	PyErr_SetString(PyExc_TypeError,
				2622	"character mapping must return integer, None or unicode");
				2623	Py_DECREF(x);
				2624	goto onError;
				2625	}
				2626	Py_DECREF(x);
				2627	}
				2628	if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
				2629	if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
				2630	goto onError;
				2631	return v;
				2632
				2633	onError:
				2634	Py_DECREF(v);
				2635	return NULL;
				2636	}
				2637
				2638	PyObject PyUnicode_AsCharmapString(PyObject unicode,
				2639	PyObject *mapping)
				2640	{
				2641	if (!PyUnicode_Check(unicode) \|\| mapping == NULL) {
				2642	PyErr_BadArgument();
				2643	return NULL;
				2644	}
				2645	return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
				2646	PyUnicode_GET_SIZE(unicode),
				2647	mapping,
				2648	NULL);
				2649	}
				2650
				2651	static
				2652	int translate_error(const Py_UNICODE **source,
				2653	Py_UNICODE **dest,
				2654	const char *errors,
				2655	const char *details)
				2656	{
				2657	if ((errors == NULL) \|\|
				2658	(strcmp(errors,"strict") == 0)) {
				2659	PyErr_Format(PyExc_UnicodeError,
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2660	"translate error: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2661	details);
				2662	return -1;
				2663	}
				2664	else if (strcmp(errors,"ignore") == 0) {
				2665	return 0;
				2666	}
				2667	else if (strcmp(errors,"replace") == 0) {
				2668	**dest = '?';
				2669	(*dest)++;
				2670	return 0;
				2671	}
				2672	else {
				2673	PyErr_Format(PyExc_ValueError,
				2674	"translate error; "
Guido van Rossum	5db862d	2000-04-10 12:46:51 +0000	[diff] [blame]	2675	"unknown error handling code: %.400s",
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2676	errors);
				2677	return -1;
				2678	}
				2679	}
				2680
				2681	PyObject PyUnicode_TranslateCharmap(const Py_UNICODE s,
				2682	int size,
				2683	PyObject *mapping,
				2684	const char *errors)
				2685	{
				2686	PyUnicodeObject *v;
				2687	Py_UNICODE *p;
				2688
				2689	if (mapping == NULL) {
				2690	PyErr_BadArgument();
				2691	return NULL;
				2692	}
				2693
				2694	/* Output will never be longer than input */
				2695	v = _PyUnicode_New(size);
				2696	if (v == NULL)
				2697	goto onError;
				2698	if (size == 0)
				2699	goto done;
				2700	p = PyUnicode_AS_UNICODE(v);
				2701	while (size-- > 0) {
				2702	Py_UNICODE ch = *s++;
				2703	PyObject w, x;
				2704
				2705	/* Get mapping */
				2706	w = PyInt_FromLong(ch);
				2707	if (w == NULL)
				2708	goto onError;
				2709	x = PyObject_GetItem(mapping, w);
				2710	Py_DECREF(w);
				2711	if (x == NULL) {
				2712	if (PyErr_ExceptionMatches(PyExc_LookupError)) {
				2713	/* No mapping found: default to 1-1 mapping */
				2714	PyErr_Clear();
				2715	*p++ = ch;
				2716	continue;
				2717	}
				2718	goto onError;
				2719	}
				2720
				2721	/* Apply mapping */
				2722	if (PyInt_Check(x))
				2723	*p++ = (Py_UNICODE)PyInt_AS_LONG(x);
				2724	else if (x == Py_None) {
				2725	/* undefined mapping */
				2726	if (translate_error(&s, &p, errors,
				2727	"character maps to <undefined>")) {
				2728	Py_DECREF(x);
				2729	goto onError;
				2730	}
				2731	}
				2732	else if (PyUnicode_Check(x)) {
				2733	if (PyUnicode_GET_SIZE(x) != 1) {
				2734	/* 1-n mapping */
				2735	PyErr_SetString(PyExc_NotImplementedError,
				2736	"1-n mappings are currently not implemented");
				2737	Py_DECREF(x);
				2738	goto onError;
				2739	}
				2740	p++ = PyUnicode_AS_UNICODE(x);
				2741	}
				2742	else {
				2743	/* wrong return value */
				2744	PyErr_SetString(PyExc_TypeError,
				2745	"translate mapping must return integer, None or unicode");
				2746	Py_DECREF(x);
				2747	goto onError;
				2748	}
				2749	Py_DECREF(x);
				2750	}
				2751	if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	2752	if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	2753	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2754
				2755	done:
				2756	return (PyObject *)v;
				2757
				2758	onError:
				2759	Py_XDECREF(v);
				2760	return NULL;
				2761	}
				2762
				2763	PyObject PyUnicode_Translate(PyObject str,
				2764	PyObject *mapping,
				2765	const char *errors)
				2766	{
				2767	PyObject *result;
				2768
				2769	str = PyUnicode_FromObject(str);
				2770	if (str == NULL)
				2771	goto onError;
				2772	result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
				2773	PyUnicode_GET_SIZE(str),
				2774	mapping,
				2775	errors);
				2776	Py_DECREF(str);
				2777	return result;
				2778
				2779	onError:
				2780	Py_XDECREF(str);
				2781	return NULL;
				2782	}
				2783
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2784	/* --- Decimal Encoder ---------------------------------------------------- */
				2785
				2786	int PyUnicode_EncodeDecimal(Py_UNICODE *s,
				2787	int length,
				2788	char *output,
				2789	const char *errors)
				2790	{
				2791	Py_UNICODE p, end;
				2792
				2793	if (output == NULL) {
				2794	PyErr_BadArgument();
				2795	return -1;
				2796	}
				2797
				2798	p = s;
				2799	end = s + length;
				2800	while (p < end) {
				2801	register Py_UNICODE ch = *p++;
				2802	int decimal;
				2803
				2804	if (Py_UNICODE_ISSPACE(ch)) {
				2805	*output++ = ' ';
				2806	continue;
				2807	}
				2808	decimal = Py_UNICODE_TODECIMAL(ch);
				2809	if (decimal >= 0) {
				2810	*output++ = '0' + decimal;
				2811	continue;
				2812	}
Guido van Rossum	ba47704	2000-04-06 18:18:10 +0000	[diff] [blame]	2813	if (0 < ch && ch < 256) {
Guido van Rossum	42c29aa	2000-05-03 23:58:29 +0000	[diff] [blame]	2814	*output++ = (char)ch;
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	2815	continue;
				2816	}
				2817	/* All other characters are considered invalid */
				2818	if (errors == NULL \|\| strcmp(errors, "strict") == 0) {
				2819	PyErr_SetString(PyExc_ValueError,
				2820	"invalid decimal Unicode string");
				2821	goto onError;
				2822	}
				2823	else if (strcmp(errors, "ignore") == 0)
				2824	continue;
				2825	else if (strcmp(errors, "replace") == 0) {
				2826	*output++ = '?';
				2827	continue;
				2828	}
				2829	}
				2830	/* 0-terminate the output string */
				2831	*output++ = '\0';
				2832	return 0;
				2833
				2834	onError:
				2835	return -1;
				2836	}
				2837
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2838	/* --- Helpers ------------------------------------------------------------ */
				2839
				2840	static
				2841	int count(PyUnicodeObject *self,
				2842	int start,
				2843	int end,
				2844	PyUnicodeObject *substring)
				2845	{
				2846	int count = 0;
				2847
Marc-André Lemburg	3a645e4	2001-01-16 11:54:12 +0000	[diff] [blame]	2848	if (start < 0)
				2849	start += self->length;
				2850	if (start < 0)
				2851	start = 0;
				2852	if (end > self->length)
				2853	end = self->length;
				2854	if (end < 0)
				2855	end += self->length;
				2856	if (end < 0)
				2857	end = 0;
				2858
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2859	if (substring->length == 0)
				2860	return (end - start + 1);
				2861
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2862	end -= substring->length;
				2863
				2864	while (start <= end)
				2865	if (Py_UNICODE_MATCH(self, start, substring)) {
				2866	count++;
				2867	start += substring->length;
				2868	} else
				2869	start++;
				2870
				2871	return count;
				2872	}
				2873
				2874	int PyUnicode_Count(PyObject *str,
				2875	PyObject *substr,
				2876	int start,
				2877	int end)
				2878	{
				2879	int result;
				2880
				2881	str = PyUnicode_FromObject(str);
				2882	if (str == NULL)
				2883	return -1;
				2884	substr = PyUnicode_FromObject(substr);
				2885	if (substr == NULL) {
Marc-André Lemburg	49ef6dc	2000-06-18 22:25:22 +0000	[diff] [blame]	2886	Py_DECREF(str);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	2887	return -1;
				2888	}
				2889
				2890	result = count((PyUnicodeObject *)str,
				2891	start, end,
				2892	(PyUnicodeObject *)substr);
				2893
				2894	Py_DECREF(str);
				2895	Py_DECREF(substr);
				2896	return result;
				2897	}
				2898
				2899	static
				2900	int findstring(PyUnicodeObject *self,
				2901	PyUnicodeObject *substring,
				2902	int start,
				2903	int end,
				2904	int direction)
				2905	{
				2906	if (start < 0)
				2907	start += self->length;
				2908	if (start < 0)
				2909	start = 0;
				2910
				2911	if (substring->length == 0)
				2912	return start;
				2913
				2914	if (end > self->length)
				2915	end = self->length;
				2916	if (end < 0)
				2917	end += self->length;
				2918	if (end < 0)
				2919	end = 0;
				2920
				2921	end -= substring->length;
				2922
				2923	if (direction < 0) {
				2924	for (; end >= start; end--)
				2925	if (Py_UNICODE_MATCH(self, end, substring))
				2926	return end;
				2927	} else {
				2928	for (; start <= end; start++)
				2929	if (Py_UNICODE_MATCH(self, start, substring))
				2930	return start;
				2931	}
				2932
				2933	return -1;
				2934	}
				2935
				2936	int PyUnicode_Find(PyObject *str,
				2937	PyObject *substr,
				2938	int start,
				2939	int end,
				2940	int direction)
				2941	{
				2942	int result;
				2943
				2944	str = PyUnicode_FromObject(str);
				2945	if (str == NULL)
				2946	return -1;
				2947	substr = PyUnicode_FromObject(substr);
				2948	if (substr == NULL) {
				2949	Py_DECREF(substr);
				2950	return -1;
				2951	}
				2952
				2953	result = findstring((PyUnicodeObject *)str,
				2954	(PyUnicodeObject *)substr,
				2955	start, end, direction);
				2956	Py_DECREF(str);
				2957	Py_DECREF(substr);
				2958	return result;
				2959	}
				2960
				2961	static
				2962	int tailmatch(PyUnicodeObject *self,
				2963	PyUnicodeObject *substring,
				2964	int start,
				2965	int end,
				2966	int direction)
				2967	{
				2968	if (start < 0)
				2969	start += self->length;
				2970	if (start < 0)
				2971	start = 0;
				2972
				2973	if (substring->length == 0)
				2974	return 1;
				2975
				2976	if (end > self->length)
				2977	end = self->length;
				2978	if (end < 0)
				2979	end += self->length;
				2980	if (end < 0)
				2981	end = 0;
				2982
				2983	end -= substring->length;
				2984	if (end < start)
				2985	return 0;
				2986
				2987	if (direction > 0) {
				2988	if (Py_UNICODE_MATCH(self, end, substring))
				2989	return 1;
				2990	} else {
				2991	if (Py_UNICODE_MATCH(self, start, substring))
				2992	return 1;
				2993	}
				2994
				2995	return 0;
				2996	}
				2997
				2998	int PyUnicode_Tailmatch(PyObject *str,
				2999	PyObject *substr,
				3000	int start,
				3001	int end,
				3002	int direction)
				3003	{
				3004	int result;
				3005
				3006	str = PyUnicode_FromObject(str);
				3007	if (str == NULL)
				3008	return -1;
				3009	substr = PyUnicode_FromObject(substr);
				3010	if (substr == NULL) {
				3011	Py_DECREF(substr);
				3012	return -1;
				3013	}
				3014
				3015	result = tailmatch((PyUnicodeObject *)str,
				3016	(PyUnicodeObject *)substr,
				3017	start, end, direction);
				3018	Py_DECREF(str);
				3019	Py_DECREF(substr);
				3020	return result;
				3021	}
				3022
				3023	static
				3024	const Py_UNICODE findchar(const Py_UNICODE s,
				3025	int size,
				3026	Py_UNICODE ch)
				3027	{
				3028	/* like wcschr, but doesn't stop at NULL characters */
				3029
				3030	while (size-- > 0) {
				3031	if (*s == ch)
				3032	return s;
				3033	s++;
				3034	}
				3035
				3036	return NULL;
				3037	}
				3038
				3039	/* Apply fixfct filter to the Unicode object self and return a
				3040	reference to the modified object */
				3041
				3042	static
				3043	PyObject fixup(PyUnicodeObject self,
				3044	int (fixfct)(PyUnicodeObject s))
				3045	{
				3046
				3047	PyUnicodeObject *u;
				3048
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3049	u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3050	if (u == NULL)
				3051	return NULL;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3052
				3053	Py_UNICODE_COPY(u->str, self->str, self->length);
				3054
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3055	if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3056	/* fixfct should return TRUE if it modified the buffer. If
				3057	FALSE, return a reference to the original buffer instead
				3058	(to save space, not time) */
				3059	Py_INCREF(self);
				3060	Py_DECREF(u);
				3061	return (PyObject*) self;
				3062	}
				3063	return (PyObject*) u;
				3064	}
				3065
				3066	static
				3067	int fixupper(PyUnicodeObject *self)
				3068	{
				3069	int len = self->length;
				3070	Py_UNICODE *s = self->str;
				3071	int status = 0;
				3072
				3073	while (len-- > 0) {
				3074	register Py_UNICODE ch;
				3075
				3076	ch = Py_UNICODE_TOUPPER(*s);
				3077	if (ch != *s) {
				3078	status = 1;
				3079	*s = ch;
				3080	}
				3081	s++;
				3082	}
				3083
				3084	return status;
				3085	}
				3086
				3087	static
				3088	int fixlower(PyUnicodeObject *self)
				3089	{
				3090	int len = self->length;
				3091	Py_UNICODE *s = self->str;
				3092	int status = 0;
				3093
				3094	while (len-- > 0) {
				3095	register Py_UNICODE ch;
				3096
				3097	ch = Py_UNICODE_TOLOWER(*s);
				3098	if (ch != *s) {
				3099	status = 1;
				3100	*s = ch;
				3101	}
				3102	s++;
				3103	}
				3104
				3105	return status;
				3106	}
				3107
				3108	static
				3109	int fixswapcase(PyUnicodeObject *self)
				3110	{
				3111	int len = self->length;
				3112	Py_UNICODE *s = self->str;
				3113	int status = 0;
				3114
				3115	while (len-- > 0) {
				3116	if (Py_UNICODE_ISUPPER(*s)) {
				3117	s = Py_UNICODE_TOLOWER(s);
				3118	status = 1;
				3119	} else if (Py_UNICODE_ISLOWER(*s)) {
				3120	s = Py_UNICODE_TOUPPER(s);
				3121	status = 1;
				3122	}
				3123	s++;
				3124	}
				3125
				3126	return status;
				3127	}
				3128
				3129	static
				3130	int fixcapitalize(PyUnicodeObject *self)
				3131	{
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3132	int len = self->length;
				3133	Py_UNICODE *s = self->str;
				3134	int status = 0;
				3135
				3136	if (len == 0)
				3137	return 0;
				3138	if (Py_UNICODE_ISLOWER(*s)) {
				3139	s = Py_UNICODE_TOUPPER(s);
				3140	status = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3141	}
Marc-André Lemburg	fde66e1	2001-01-29 11:14:16 +0000	[diff] [blame]	3142	s++;
				3143	while (--len > 0) {
				3144	if (Py_UNICODE_ISUPPER(*s)) {
				3145	s = Py_UNICODE_TOLOWER(s);
				3146	status = 1;
				3147	}
				3148	s++;
				3149	}
				3150	return status;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3151	}
				3152
				3153	static
				3154	int fixtitle(PyUnicodeObject *self)
				3155	{
				3156	register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				3157	register Py_UNICODE *e;
				3158	int previous_is_cased;
				3159
				3160	/* Shortcut for single character strings */
				3161	if (PyUnicode_GET_SIZE(self) == 1) {
				3162	Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
				3163	if (*p != ch) {
				3164	*p = ch;
				3165	return 1;
				3166	}
				3167	else
				3168	return 0;
				3169	}
				3170
				3171	e = p + PyUnicode_GET_SIZE(self);
				3172	previous_is_cased = 0;
				3173	for (; p < e; p++) {
				3174	register const Py_UNICODE ch = *p;
				3175
				3176	if (previous_is_cased)
				3177	*p = Py_UNICODE_TOLOWER(ch);
				3178	else
				3179	*p = Py_UNICODE_TOTITLE(ch);
				3180
				3181	if (Py_UNICODE_ISLOWER(ch) \|\|
				3182	Py_UNICODE_ISUPPER(ch) \|\|
				3183	Py_UNICODE_ISTITLE(ch))
				3184	previous_is_cased = 1;
				3185	else
				3186	previous_is_cased = 0;
				3187	}
				3188	return 1;
				3189	}
				3190
				3191	PyObject PyUnicode_Join(PyObject separator,
				3192	PyObject *seq)
				3193	{
				3194	Py_UNICODE *sep;
				3195	int seplen;
				3196	PyUnicodeObject *res = NULL;
				3197	int reslen = 0;
				3198	Py_UNICODE *p;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3199	int sz = 100;
				3200	int i;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3201	PyObject *it;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3202
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3203	it = PyObject_GetIter(seq);
				3204	if (it == NULL)
				3205	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3206
				3207	if (separator == NULL) {
				3208	Py_UNICODE blank = ' ';
				3209	sep = &blank;
				3210	seplen = 1;
				3211	}
				3212	else {
				3213	separator = PyUnicode_FromObject(separator);
				3214	if (separator == NULL)
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3215	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3216	sep = PyUnicode_AS_UNICODE(separator);
				3217	seplen = PyUnicode_GET_SIZE(separator);
				3218	}
				3219
				3220	res = _PyUnicode_New(sz);
				3221	if (res == NULL)
				3222	goto onError;
				3223	p = PyUnicode_AS_UNICODE(res);
				3224	reslen = 0;
				3225
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3226	for (i = 0; ; ++i) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3227	int itemlen;
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3228	PyObject *item = PyIter_Next(it);
				3229	if (item == NULL) {
				3230	if (PyErr_Occurred())
				3231	goto onError;
				3232	break;
				3233	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3234	if (!PyUnicode_Check(item)) {
				3235	PyObject *v;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3236	if (!PyString_Check(item)) {
				3237	PyErr_Format(PyExc_TypeError,
				3238	"sequence item %i: expected string or Unicode,"
				3239	" %.80s found",
				3240	i, item->ob_type->tp_name);
				3241	Py_DECREF(item);
				3242	goto onError;
				3243	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3244	v = PyUnicode_FromObject(item);
				3245	Py_DECREF(item);
				3246	item = v;
				3247	if (item == NULL)
				3248	goto onError;
				3249	}
				3250	itemlen = PyUnicode_GET_SIZE(item);
				3251	while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3252	if (_PyUnicode_Resize(&res, sz*2)) {
				3253	Py_DECREF(item);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3254	goto onError;
Marc-André Lemburg	3508e30	2001-09-20 17:22:58 +0000	[diff] [blame]	3255	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3256	sz *= 2;
				3257	p = PyUnicode_AS_UNICODE(res) + reslen;
				3258	}
				3259	if (i > 0) {
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3260	Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3261	p += seplen;
				3262	reslen += seplen;
				3263	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3264	Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3265	p += itemlen;
				3266	reslen += itemlen;
				3267	Py_DECREF(item);
				3268	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3269	if (_PyUnicode_Resize(&res, reslen))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3270	goto onError;
				3271
				3272	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3273	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3274	return (PyObject *)res;
				3275
				3276	onError:
				3277	Py_XDECREF(separator);
Tim Peters	2cfe368	2001-05-05 05:36:48 +0000	[diff] [blame]	3278	Py_XDECREF(res);
				3279	Py_DECREF(it);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3280	return NULL;
				3281	}
				3282
				3283	static
				3284	PyUnicodeObject pad(PyUnicodeObject self,
				3285	int left,
				3286	int right,
				3287	Py_UNICODE fill)
				3288	{
				3289	PyUnicodeObject *u;
				3290
				3291	if (left < 0)
				3292	left = 0;
				3293	if (right < 0)
				3294	right = 0;
				3295
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3296	if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3297	Py_INCREF(self);
				3298	return self;
				3299	}
				3300
				3301	u = _PyUnicode_New(left + self->length + right);
				3302	if (u) {
				3303	if (left)
				3304	Py_UNICODE_FILL(u->str, fill, left);
				3305	Py_UNICODE_COPY(u->str + left, self->str, self->length);
				3306	if (right)
				3307	Py_UNICODE_FILL(u->str + left + self->length, fill, right);
				3308	}
				3309
				3310	return u;
				3311	}
				3312
				3313	#define SPLIT_APPEND(data, left, right) \
				3314	str = PyUnicode_FromUnicode(data + left, right - left); \
				3315	if (!str) \
				3316	goto onError; \
				3317	if (PyList_Append(list, str)) { \
				3318	Py_DECREF(str); \
				3319	goto onError; \
				3320	} \
				3321	else \
				3322	Py_DECREF(str);
				3323
				3324	static
				3325	PyObject split_whitespace(PyUnicodeObject self,
				3326	PyObject *list,
				3327	int maxcount)
				3328	{
				3329	register int i;
				3330	register int j;
				3331	int len = self->length;
				3332	PyObject *str;
				3333
				3334	for (i = j = 0; i < len; ) {
				3335	/* find a token */
				3336	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3337	i++;
				3338	j = i;
				3339	while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
				3340	i++;
				3341	if (j < i) {
				3342	if (maxcount-- <= 0)
				3343	break;
				3344	SPLIT_APPEND(self->str, j, i);
				3345	while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
				3346	i++;
				3347	j = i;
				3348	}
				3349	}
				3350	if (j < len) {
				3351	SPLIT_APPEND(self->str, j, len);
				3352	}
				3353	return list;
				3354
				3355	onError:
				3356	Py_DECREF(list);
				3357	return NULL;
				3358	}
				3359
				3360	PyObject PyUnicode_Splitlines(PyObject string,
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3361	int keepends)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3362	{
				3363	register int i;
				3364	register int j;
				3365	int len;
				3366	PyObject *list;
				3367	PyObject *str;
				3368	Py_UNICODE *data;
				3369
				3370	string = PyUnicode_FromObject(string);
				3371	if (string == NULL)
				3372	return NULL;
				3373	data = PyUnicode_AS_UNICODE(string);
				3374	len = PyUnicode_GET_SIZE(string);
				3375
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3376	list = PyList_New(0);
				3377	if (!list)
				3378	goto onError;
				3379
				3380	for (i = j = 0; i < len; ) {
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3381	int eol;
				3382
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3383	/* Find a line and append it */
				3384	while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
				3385	i++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3386
				3387	/* Skip the line break reading CRLF as one line break */
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3388	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3389	if (i < len) {
				3390	if (data[i] == '\r' && i + 1 < len &&
				3391	data[i+1] == '\n')
				3392	i += 2;
				3393	else
				3394	i++;
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3395	if (keepends)
				3396	eol = i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3397	}
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	3398	SPLIT_APPEND(data, j, eol);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3399	j = i;
				3400	}
				3401	if (j < len) {
				3402	SPLIT_APPEND(data, j, len);
				3403	}
				3404
				3405	Py_DECREF(string);
				3406	return list;
				3407
				3408	onError:
				3409	Py_DECREF(list);
				3410	Py_DECREF(string);
				3411	return NULL;
				3412	}
				3413
				3414	static
				3415	PyObject split_char(PyUnicodeObject self,
				3416	PyObject *list,
				3417	Py_UNICODE ch,
				3418	int maxcount)
				3419	{
				3420	register int i;
				3421	register int j;
				3422	int len = self->length;
				3423	PyObject *str;
				3424
				3425	for (i = j = 0; i < len; ) {
				3426	if (self->str[i] == ch) {
				3427	if (maxcount-- <= 0)
				3428	break;
				3429	SPLIT_APPEND(self->str, j, i);
				3430	i = j = i + 1;
				3431	} else
				3432	i++;
				3433	}
				3434	if (j <= len) {
				3435	SPLIT_APPEND(self->str, j, len);
				3436	}
				3437	return list;
				3438
				3439	onError:
				3440	Py_DECREF(list);
				3441	return NULL;
				3442	}
				3443
				3444	static
				3445	PyObject split_substring(PyUnicodeObject self,
				3446	PyObject *list,
				3447	PyUnicodeObject *substring,
				3448	int maxcount)
				3449	{
				3450	register int i;
				3451	register int j;
				3452	int len = self->length;
				3453	int sublen = substring->length;
				3454	PyObject *str;
				3455
Guido van Rossum	cda4f9a	2000-12-19 02:23:19 +0000	[diff] [blame]	3456	for (i = j = 0; i <= len - sublen; ) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3457	if (Py_UNICODE_MATCH(self, i, substring)) {
				3458	if (maxcount-- <= 0)
				3459	break;
				3460	SPLIT_APPEND(self->str, j, i);
				3461	i = j = i + sublen;
				3462	} else
				3463	i++;
				3464	}
				3465	if (j <= len) {
				3466	SPLIT_APPEND(self->str, j, len);
				3467	}
				3468	return list;
				3469
				3470	onError:
				3471	Py_DECREF(list);
				3472	return NULL;
				3473	}
				3474
				3475	#undef SPLIT_APPEND
				3476
				3477	static
				3478	PyObject split(PyUnicodeObject self,
				3479	PyUnicodeObject *substring,
				3480	int maxcount)
				3481	{
				3482	PyObject *list;
				3483
				3484	if (maxcount < 0)
				3485	maxcount = INT_MAX;
				3486
				3487	list = PyList_New(0);
				3488	if (!list)
				3489	return NULL;
				3490
				3491	if (substring == NULL)
				3492	return split_whitespace(self,list,maxcount);
				3493
				3494	else if (substring->length == 1)
				3495	return split_char(self,list,substring->str[0],maxcount);
				3496
				3497	else if (substring->length == 0) {
				3498	Py_DECREF(list);
				3499	PyErr_SetString(PyExc_ValueError, "empty separator");
				3500	return NULL;
				3501	}
				3502	else
				3503	return split_substring(self,list,substring,maxcount);
				3504	}
				3505
				3506	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3507	PyObject replace(PyUnicodeObject self,
				3508	PyUnicodeObject *str1,
				3509	PyUnicodeObject *str2,
				3510	int maxcount)
				3511	{
				3512	PyUnicodeObject *u;
				3513
				3514	if (maxcount < 0)
				3515	maxcount = INT_MAX;
				3516
				3517	if (str1->length == 1 && str2->length == 1) {
				3518	int i;
				3519
				3520	/* replace characters */
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3521	if (!findchar(self->str, self->length, str1->str[0]) &&
				3522	PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3523	/* nothing to replace, return original string */
				3524	Py_INCREF(self);
				3525	u = self;
				3526	} else {
				3527	Py_UNICODE u1 = str1->str[0];
				3528	Py_UNICODE u2 = str2->str[0];
				3529
				3530	u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3531	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3532	self->length
				3533	);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3534	if (u != NULL) {
				3535	Py_UNICODE_COPY(u->str, self->str,
				3536	self->length);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3537	for (i = 0; i < u->length; i++)
				3538	if (u->str[i] == u1) {
				3539	if (--maxcount < 0)
				3540	break;
				3541	u->str[i] = u2;
				3542	}
				3543	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	3544	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3545
				3546	} else {
				3547	int n, i;
				3548	Py_UNICODE *p;
				3549
				3550	/* replace strings */
				3551	n = count(self, 0, self->length, str1);
				3552	if (n > maxcount)
				3553	n = maxcount;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3554	if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3555	/* nothing to replace, return original string */
				3556	Py_INCREF(self);
				3557	u = self;
				3558	} else {
				3559	u = _PyUnicode_New(
				3560	self->length + n * (str2->length - str1->length));
				3561	if (u) {
				3562	i = 0;
				3563	p = u->str;
				3564	while (i <= self->length - str1->length)
				3565	if (Py_UNICODE_MATCH(self, i, str1)) {
				3566	/* replace string segment */
				3567	Py_UNICODE_COPY(p, str2->str, str2->length);
				3568	p += str2->length;
				3569	i += str1->length;
				3570	if (--n <= 0) {
				3571	/* copy remaining part */
				3572	Py_UNICODE_COPY(p, self->str+i, self->length-i);
				3573	break;
				3574	}
				3575	} else
				3576	*p++ = self->str[i++];
				3577	}
				3578	}
				3579	}
				3580
				3581	return (PyObject *) u;
				3582	}
				3583
				3584	/* --- Unicode Object Methods --------------------------------------------- */
				3585
				3586	static char title__doc__[] =
				3587	"S.title() -> unicode\n\
				3588	\n\
				3589	Return a titlecased version of S, i.e. words start with title case\n\
				3590	characters, all remaining cased characters have lower case.";
				3591
				3592	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3593	unicode_title(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3594	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3595	return fixup(self, fixtitle);
				3596	}
				3597
				3598	static char capitalize__doc__[] =
				3599	"S.capitalize() -> unicode\n\
				3600	\n\
				3601	Return a capitalized version of S, i.e. make the first character\n\
				3602	have upper case.";
				3603
				3604	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3605	unicode_capitalize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3606	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3607	return fixup(self, fixcapitalize);
				3608	}
				3609
				3610	#if 0
				3611	static char capwords__doc__[] =
				3612	"S.capwords() -> unicode\n\
				3613	\n\
				3614	Apply .capitalize() to all words in S and return the result with\n\
				3615	normalized whitespace (all whitespace strings are replaced by ' ').";
				3616
				3617	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	3618	unicode_capwords(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3619	{
				3620	PyObject *list;
				3621	PyObject *item;
				3622	int i;
				3623
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3624	/* Split into words */
				3625	list = split(self, NULL, -1);
				3626	if (!list)
				3627	return NULL;
				3628
				3629	/* Capitalize each word */
				3630	for (i = 0; i < PyList_GET_SIZE(list); i++) {
				3631	item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
				3632	fixcapitalize);
				3633	if (item == NULL)
				3634	goto onError;
				3635	Py_DECREF(PyList_GET_ITEM(list, i));
				3636	PyList_SET_ITEM(list, i, item);
				3637	}
				3638
				3639	/* Join the words to form a new string */
				3640	item = PyUnicode_Join(NULL, list);
				3641
				3642	onError:
				3643	Py_DECREF(list);
				3644	return (PyObject *)item;
				3645	}
				3646	#endif
				3647
				3648	static char center__doc__[] =
				3649	"S.center(width) -> unicode\n\
				3650	\n\
				3651	Return S centered in a Unicode string of length width. Padding is done\n\
				3652	using spaces.";
				3653
				3654	static PyObject *
				3655	unicode_center(PyUnicodeObject self, PyObject args)
				3656	{
				3657	int marg, left;
				3658	int width;
				3659
				3660	if (!PyArg_ParseTuple(args, "i:center", &width))
				3661	return NULL;
				3662
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	3663	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3664	Py_INCREF(self);
				3665	return (PyObject*) self;
				3666	}
				3667
				3668	marg = width - self->length;
				3669	left = marg / 2 + (marg & width & 1);
				3670
				3671	return (PyObject*) pad(self, left, marg - left, ' ');
				3672	}
				3673
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3674	#if 0
				3675
				3676	/* This code should go into some future Unicode collation support
				3677	module. The basic comparison should compare ordinals on a naive
Trent Mick	20abf57	2000-08-12 22:14:34 +0000	[diff] [blame]	3678	basis (this is what Java does and thus JPython too). */
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3679
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3680	/* speedy UTF-16 code point order comparison */
				3681	/* gleaned from: */
				3682	/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
				3683
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3684	static short utf16Fixup[32] =
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3685	{
				3686	0, 0, 0, 0, 0, 0, 0, 0,
				3687	0, 0, 0, 0, 0, 0, 0, 0,
				3688	0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3689	0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3690	};
				3691
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3692	static int
				3693	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3694	{
				3695	int len1, len2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3696
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3697	Py_UNICODE *s1 = str1->str;
				3698	Py_UNICODE *s2 = str2->str;
				3699
				3700	len1 = str1->length;
				3701	len2 = str2->length;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3702
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3703	while (len1 > 0 && len2 > 0) {
Marc-André Lemburg	e12896e	2000-07-07 17:51:08 +0000	[diff] [blame]	3704	Py_UNICODE c1, c2;
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3705
				3706	c1 = *s1++;
				3707	c2 = *s2++;
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3708
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3709	if (c1 > (1<<11) * 26)
				3710	c1 += utf16Fixup[c1>>11];
				3711	if (c2 > (1<<11) * 26)
				3712	c2 += utf16Fixup[c2>>11];
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3713	/* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3714
				3715	if (c1 != c2)
				3716	return (c1 < c2) ? -1 : 1;
				3717
Marc-André Lemburg	1e7205a	2000-07-04 09:51:07 +0000	[diff] [blame]	3718	len1--; len2--;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3719	}
				3720
				3721	return (len1 < len2) ? -1 : (len1 != len2);
				3722	}
				3723
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3724	#else
				3725
				3726	static int
				3727	unicode_compare(PyUnicodeObject str1, PyUnicodeObject str2)
				3728	{
				3729	register int len1, len2;
				3730
				3731	Py_UNICODE *s1 = str1->str;
				3732	Py_UNICODE *s2 = str2->str;
				3733
				3734	len1 = str1->length;
				3735	len2 = str2->length;
				3736
				3737	while (len1 > 0 && len2 > 0) {
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3738	Py_UNICODE c1, c2;
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3739
Fredrik Lundh	45714e9	2001-06-26 16:39:36 +0000	[diff] [blame]	3740	c1 = *s1++;
				3741	c2 = *s2++;
				3742
				3743	if (c1 != c2)
				3744	return (c1 < c2) ? -1 : 1;
				3745
Marc-André Lemburg	e503437	2000-08-08 08:04:29 +0000	[diff] [blame]	3746	len1--; len2--;
				3747	}
				3748
				3749	return (len1 < len2) ? -1 : (len1 != len2);
				3750	}
				3751
				3752	#endif
				3753
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3754	int PyUnicode_Compare(PyObject *left,
				3755	PyObject *right)
				3756	{
				3757	PyUnicodeObject u = NULL, v = NULL;
				3758	int result;
				3759
				3760	/* Coerce the two arguments */
				3761	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3762	if (u == NULL)
				3763	goto onError;
				3764	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3765	if (v == NULL)
				3766	goto onError;
				3767
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3768	/* Shortcut for empty or interned objects */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3769	if (v == u) {
				3770	Py_DECREF(u);
				3771	Py_DECREF(v);
				3772	return 0;
				3773	}
				3774
				3775	result = unicode_compare(u, v);
				3776
				3777	Py_DECREF(u);
				3778	Py_DECREF(v);
				3779	return result;
				3780
				3781	onError:
				3782	Py_XDECREF(u);
				3783	Py_XDECREF(v);
				3784	return -1;
				3785	}
				3786
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3787	int PyUnicode_Contains(PyObject *container,
				3788	PyObject *element)
				3789	{
				3790	PyUnicodeObject u = NULL, v = NULL;
				3791	int result;
				3792	register const Py_UNICODE p, e;
				3793	register Py_UNICODE ch;
				3794
				3795	/* Coerce the two arguments */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3796	v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3797	if (v == NULL) {
				3798	PyErr_SetString(PyExc_TypeError,
				3799	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3800	goto onError;
Marc-André Lemburg	7c01468	2000-06-28 08:11:47 +0000	[diff] [blame]	3801	}
Guido van Rossum	9e896b3	2000-04-05 20:11:21 +0000	[diff] [blame]	3802	u = (PyUnicodeObject *)PyUnicode_FromObject(container);
				3803	if (u == NULL) {
				3804	Py_DECREF(v);
				3805	goto onError;
				3806	}
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3807
				3808	/* Check v in u */
				3809	if (PyUnicode_GET_SIZE(v) != 1) {
				3810	PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchling	cb95a14	2000-06-09 14:04:53 +0000	[diff] [blame]	3811	"'in <string>' requires character as left operand");
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	3812	goto onError;
				3813	}
				3814	ch = *PyUnicode_AS_UNICODE(v);
				3815	p = PyUnicode_AS_UNICODE(u);
				3816	e = p + PyUnicode_GET_SIZE(u);
				3817	result = 0;
				3818	while (p < e) {
				3819	if (*p++ == ch) {
				3820	result = 1;
				3821	break;
				3822	}
				3823	}
				3824
				3825	Py_DECREF(u);
				3826	Py_DECREF(v);
				3827	return result;
				3828
				3829	onError:
				3830	Py_XDECREF(u);
				3831	Py_XDECREF(v);
				3832	return -1;
				3833	}
				3834
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3835	/* Concat to string or Unicode object giving a new Unicode object. */
				3836
				3837	PyObject PyUnicode_Concat(PyObject left,
				3838	PyObject *right)
				3839	{
				3840	PyUnicodeObject u = NULL, v = NULL, *w;
				3841
				3842	/* Coerce the two arguments */
				3843	u = (PyUnicodeObject *)PyUnicode_FromObject(left);
				3844	if (u == NULL)
				3845	goto onError;
				3846	v = (PyUnicodeObject *)PyUnicode_FromObject(right);
				3847	if (v == NULL)
				3848	goto onError;
				3849
				3850	/* Shortcuts */
				3851	if (v == unicode_empty) {
				3852	Py_DECREF(v);
				3853	return (PyObject *)u;
				3854	}
				3855	if (u == unicode_empty) {
				3856	Py_DECREF(u);
				3857	return (PyObject *)v;
				3858	}
				3859
				3860	/* Concat the two Unicode strings */
				3861	w = _PyUnicode_New(u->length + v->length);
				3862	if (w == NULL)
				3863	goto onError;
				3864	Py_UNICODE_COPY(w->str, u->str, u->length);
				3865	Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
				3866
				3867	Py_DECREF(u);
				3868	Py_DECREF(v);
				3869	return (PyObject *)w;
				3870
				3871	onError:
				3872	Py_XDECREF(u);
				3873	Py_XDECREF(v);
				3874	return NULL;
				3875	}
				3876
				3877	static char count__doc__[] =
				3878	"S.count(sub[, start[, end]]) -> int\n\
				3879	\n\
				3880	Return the number of occurrences of substring sub in Unicode string\n\
				3881	S[start:end]. Optional arguments start and end are\n\
				3882	interpreted as in slice notation.";
				3883
				3884	static PyObject *
				3885	unicode_count(PyUnicodeObject self, PyObject args)
				3886	{
				3887	PyUnicodeObject *substring;
				3888	int start = 0;
				3889	int end = INT_MAX;
				3890	PyObject *result;
				3891
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	3892	if (!PyArg_ParseTuple(args, "O\|O&O&:count", &substring,
				3893	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3894	return NULL;
				3895
				3896	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				3897	(PyObject *)substring);
				3898	if (substring == NULL)
				3899	return NULL;
				3900
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3901	if (start < 0)
				3902	start += self->length;
				3903	if (start < 0)
				3904	start = 0;
				3905	if (end > self->length)
				3906	end = self->length;
				3907	if (end < 0)
				3908	end += self->length;
				3909	if (end < 0)
				3910	end = 0;
				3911
				3912	result = PyInt_FromLong((long) count(self, start, end, substring));
				3913
				3914	Py_DECREF(substring);
				3915	return result;
				3916	}
				3917
				3918	static char encode__doc__[] =
				3919	"S.encode([encoding[,errors]]) -> string\n\
				3920	\n\
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	3921	Return an encoded string version of S. Default encoding is the current\n\
				3922	default string encoding. errors may be given to set a different error\n\
				3923	handling scheme. Default is 'strict' meaning that encoding errors raise\n\
				3924	a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3925
				3926	static PyObject *
				3927	unicode_encode(PyUnicodeObject self, PyObject args)
				3928	{
				3929	char *encoding = NULL;
				3930	char *errors = NULL;
				3931	if (!PyArg_ParseTuple(args, "\|ss:encode", &encoding, &errors))
				3932	return NULL;
				3933	return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
				3934	}
				3935
				3936	static char expandtabs__doc__[] =
				3937	"S.expandtabs([tabsize]) -> unicode\n\
				3938	\n\
				3939	Return a copy of S where all tab characters are expanded using spaces.\n\
				3940	If tabsize is not given, a tab size of 8 characters is assumed.";
				3941
				3942	static PyObject*
				3943	unicode_expandtabs(PyUnicodeObject self, PyObject args)
				3944	{
				3945	Py_UNICODE *e;
				3946	Py_UNICODE *p;
				3947	Py_UNICODE *q;
				3948	int i, j;
				3949	PyUnicodeObject *u;
				3950	int tabsize = 8;
				3951
				3952	if (!PyArg_ParseTuple(args, "\|i:expandtabs", &tabsize))
				3953	return NULL;
				3954
Thomas Wouters	7e47402	2000-07-16 12:04:32 +0000	[diff] [blame]	3955	/* First pass: determine size of output string */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	3956	i = j = 0;
				3957	e = self->str + self->length;
				3958	for (p = self->str; p < e; p++)
				3959	if (*p == '\t') {
				3960	if (tabsize > 0)
				3961	j += tabsize - (j % tabsize);
				3962	}
				3963	else {
				3964	j++;
				3965	if (p == '\n' \|\| p == '\r') {
				3966	i += j;
				3967	j = 0;
				3968	}
				3969	}
				3970
				3971	/* Second pass: create output string and fill it */
				3972	u = _PyUnicode_New(i + j);
				3973	if (!u)
				3974	return NULL;
				3975
				3976	j = 0;
				3977	q = u->str;
				3978
				3979	for (p = self->str; p < e; p++)
				3980	if (*p == '\t') {
				3981	if (tabsize > 0) {
				3982	i = tabsize - (j % tabsize);
				3983	j += i;
				3984	while (i--)
				3985	*q++ = ' ';
				3986	}
				3987	}
				3988	else {
				3989	j++;
				3990	q++ = p;
				3991	if (p == '\n' \|\| p == '\r')
				3992	j = 0;
				3993	}
				3994
				3995	return (PyObject*) u;
				3996	}
				3997
				3998	static char find__doc__[] =
				3999	"S.find(sub [,start [,end]]) -> int\n\
				4000	\n\
				4001	Return the lowest index in S where substring sub is found,\n\
				4002	such that sub is contained within s[start,end]. Optional\n\
				4003	arguments start and end are interpreted as in slice notation.\n\
				4004	\n\
				4005	Return -1 on failure.";
				4006
				4007	static PyObject *
				4008	unicode_find(PyUnicodeObject self, PyObject args)
				4009	{
				4010	PyUnicodeObject *substring;
				4011	int start = 0;
				4012	int end = INT_MAX;
				4013	PyObject *result;
				4014
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4015	if (!PyArg_ParseTuple(args, "O\|O&O&:find", &substring,
				4016	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4017	return NULL;
				4018	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4019	(PyObject *)substring);
				4020	if (substring == NULL)
				4021	return NULL;
				4022
				4023	result = PyInt_FromLong(findstring(self, substring, start, end, 1));
				4024
				4025	Py_DECREF(substring);
				4026	return result;
				4027	}
				4028
				4029	static PyObject *
				4030	unicode_getitem(PyUnicodeObject *self, int index)
				4031	{
				4032	if (index < 0 \|\| index >= self->length) {
				4033	PyErr_SetString(PyExc_IndexError, "string index out of range");
				4034	return NULL;
				4035	}
				4036
				4037	return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
				4038	}
				4039
				4040	static long
				4041	unicode_hash(PyUnicodeObject *self)
				4042	{
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4043	/* Since Unicode objects compare equal to their ASCII string
				4044	counterparts, they should use the individual character values
				4045	as basis for their hash value. This is needed to assure that
				4046	strings and Unicode objects behave in the same way as
				4047	dictionary keys. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4048
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4049	register int len;
				4050	register Py_UNICODE *p;
				4051	register long x;
				4052
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4053	if (self->hash != -1)
				4054	return self->hash;
Fredrik Lundh	dde6164	2000-07-10 18:27:47 +0000	[diff] [blame]	4055	len = PyUnicode_GET_SIZE(self);
				4056	p = PyUnicode_AS_UNICODE(self);
				4057	x = *p << 7;
				4058	while (--len >= 0)
				4059	x = (1000003x) ^ p++;
				4060	x ^= PyUnicode_GET_SIZE(self);
				4061	if (x == -1)
				4062	x = -2;
				4063	self->hash = x;
				4064	return x;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4065	}
				4066
				4067	static char index__doc__[] =
				4068	"S.index(sub [,start [,end]]) -> int\n\
				4069	\n\
				4070	Like S.find() but raise ValueError when the substring is not found.";
				4071
				4072	static PyObject *
				4073	unicode_index(PyUnicodeObject self, PyObject args)
				4074	{
				4075	int result;
				4076	PyUnicodeObject *substring;
				4077	int start = 0;
				4078	int end = INT_MAX;
				4079
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4080	if (!PyArg_ParseTuple(args, "O\|O&O&:index", &substring,
				4081	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4082	return NULL;
				4083
				4084	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4085	(PyObject *)substring);
				4086	if (substring == NULL)
				4087	return NULL;
				4088
				4089	result = findstring(self, substring, start, end, 1);
				4090
				4091	Py_DECREF(substring);
				4092	if (result < 0) {
				4093	PyErr_SetString(PyExc_ValueError, "substring not found");
				4094	return NULL;
				4095	}
				4096	return PyInt_FromLong(result);
				4097	}
				4098
				4099	static char islower__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4100	"S.islower() -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4101	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4102	Return True if all cased characters in S are lowercase and there is\n\
				4103	at least one cased character in S, False otherwise.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4104
				4105	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4106	unicode_islower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4107	{
				4108	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4109	register const Py_UNICODE *e;
				4110	int cased;
				4111
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4112	/* Shortcut for single character strings */
				4113	if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4114	return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4115
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4116	/* Special case for empty strings */
				4117	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4118	return PyBool_FromLong(0);
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4119
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4120	e = p + PyUnicode_GET_SIZE(self);
				4121	cased = 0;
				4122	for (; p < e; p++) {
				4123	register const Py_UNICODE ch = *p;
				4124
				4125	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4126	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4127	else if (!cased && Py_UNICODE_ISLOWER(ch))
				4128	cased = 1;
				4129	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4130	return PyBool_FromLong(cased);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4131	}
				4132
				4133	static char isupper__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4134	"S.isupper() -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4135	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4136	Return True if all cased characters in S are uppercase and there is\n\
				4137	at least one cased character in S, False otherwise.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4138
				4139	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4140	unicode_isupper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4141	{
				4142	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4143	register const Py_UNICODE *e;
				4144	int cased;
				4145
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4146	/* Shortcut for single character strings */
				4147	if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4148	return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4149
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4150	/* Special case for empty strings */
				4151	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4152	return PyBool_FromLong(0);
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4153
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4154	e = p + PyUnicode_GET_SIZE(self);
				4155	cased = 0;
				4156	for (; p < e; p++) {
				4157	register const Py_UNICODE ch = *p;
				4158
				4159	if (Py_UNICODE_ISLOWER(ch) \|\| Py_UNICODE_ISTITLE(ch))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4160	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4161	else if (!cased && Py_UNICODE_ISUPPER(ch))
				4162	cased = 1;
				4163	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4164	return PyBool_FromLong(cased);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4165	}
				4166
				4167	static char istitle__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4168	"S.istitle() -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4169	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4170	Return True if S is a titlecased string, i.e. upper- and titlecase\n\
				4171	characters may only follow uncased characters and lowercase characters\n\
				4172	only cased ones. Return False otherwise.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4173
				4174	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4175	unicode_istitle(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4176	{
				4177	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4178	register const Py_UNICODE *e;
				4179	int cased, previous_is_cased;
				4180
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4181	/* Shortcut for single character strings */
				4182	if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4183	return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) \|\|
				4184	(Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4185
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4186	/* Special case for empty strings */
				4187	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4188	return PyBool_FromLong(0);
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4189
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4190	e = p + PyUnicode_GET_SIZE(self);
				4191	cased = 0;
				4192	previous_is_cased = 0;
				4193	for (; p < e; p++) {
				4194	register const Py_UNICODE ch = *p;
				4195
				4196	if (Py_UNICODE_ISUPPER(ch) \|\| Py_UNICODE_ISTITLE(ch)) {
				4197	if (previous_is_cased)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4198	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4199	previous_is_cased = 1;
				4200	cased = 1;
				4201	}
				4202	else if (Py_UNICODE_ISLOWER(ch)) {
				4203	if (!previous_is_cased)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4204	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4205	previous_is_cased = 1;
				4206	cased = 1;
				4207	}
				4208	else
				4209	previous_is_cased = 0;
				4210	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4211	return PyBool_FromLong(cased);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4212	}
				4213
				4214	static char isspace__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4215	"S.isspace() -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4216	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4217	Return True if there are only whitespace characters in S,\n\
				4218	False otherwise.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4219
				4220	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4221	unicode_isspace(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4222	{
				4223	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4224	register const Py_UNICODE *e;
				4225
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4226	/* Shortcut for single character strings */
				4227	if (PyUnicode_GET_SIZE(self) == 1 &&
				4228	Py_UNICODE_ISSPACE(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4229	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4230
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4231	/* Special case for empty strings */
				4232	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4233	return PyBool_FromLong(0);
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4234
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4235	e = p + PyUnicode_GET_SIZE(self);
				4236	for (; p < e; p++) {
				4237	if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4238	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4239	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4240	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4241	}
				4242
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4243	static char isalpha__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4244	"S.isalpha() -> bool\n\
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4245	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4246	Return True if all characters in S are alphabetic\n\
				4247	and there is at least one character in S, False otherwise.";
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4248
				4249	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4250	unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4251	{
				4252	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4253	register const Py_UNICODE *e;
				4254
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4255	/* Shortcut for single character strings */
				4256	if (PyUnicode_GET_SIZE(self) == 1 &&
				4257	Py_UNICODE_ISALPHA(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4258	return PyBool_FromLong(1);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4259
				4260	/* Special case for empty strings */
				4261	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4262	return PyBool_FromLong(0);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4263
				4264	e = p + PyUnicode_GET_SIZE(self);
				4265	for (; p < e; p++) {
				4266	if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4267	return PyBool_FromLong(0);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4268	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4269	return PyBool_FromLong(1);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4270	}
				4271
				4272	static char isalnum__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4273	"S.isalnum() -> bool\n\
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4274	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4275	Return True if all characters in S are alphanumeric\n\
				4276	and there is at least one character in S, False otherwise.";
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4277
				4278	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4279	unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4280	{
				4281	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4282	register const Py_UNICODE *e;
				4283
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4284	/* Shortcut for single character strings */
				4285	if (PyUnicode_GET_SIZE(self) == 1 &&
				4286	Py_UNICODE_ISALNUM(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4287	return PyBool_FromLong(1);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4288
				4289	/* Special case for empty strings */
				4290	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4291	return PyBool_FromLong(0);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4292
				4293	e = p + PyUnicode_GET_SIZE(self);
				4294	for (; p < e; p++) {
				4295	if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4296	return PyBool_FromLong(0);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4297	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4298	return PyBool_FromLong(1);
Marc-André Lemburg	a7acf42	2000-07-05 09:49:44 +0000	[diff] [blame]	4299	}
				4300
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4301	static char isdecimal__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4302	"S.isdecimal() -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4303	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4304	Return True if there are only decimal characters in S,\n\
				4305	False otherwise.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4306
				4307	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4308	unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4309	{
				4310	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4311	register const Py_UNICODE *e;
				4312
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4313	/* Shortcut for single character strings */
				4314	if (PyUnicode_GET_SIZE(self) == 1 &&
				4315	Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4316	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4317
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4318	/* Special case for empty strings */
				4319	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4320	return PyBool_FromLong(0);
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4321
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4322	e = p + PyUnicode_GET_SIZE(self);
				4323	for (; p < e; p++) {
				4324	if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4325	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4326	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4327	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4328	}
				4329
				4330	static char isdigit__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4331	"S.isdigit() -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4332	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4333	Return True if there are only digit characters in S,\n\
				4334	False otherwise.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4335
				4336	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4337	unicode_isdigit(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4338	{
				4339	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4340	register const Py_UNICODE *e;
				4341
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4342	/* Shortcut for single character strings */
				4343	if (PyUnicode_GET_SIZE(self) == 1 &&
				4344	Py_UNICODE_ISDIGIT(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4345	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4346
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4347	/* Special case for empty strings */
				4348	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4349	return PyBool_FromLong(0);
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4350
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4351	e = p + PyUnicode_GET_SIZE(self);
				4352	for (; p < e; p++) {
				4353	if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4354	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4355	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4356	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4357	}
				4358
				4359	static char isnumeric__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4360	"S.isnumeric() -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4361	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4362	Return True if there are only numeric characters in S,\n\
				4363	False otherwise.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4364
				4365	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4366	unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4367	{
				4368	register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
				4369	register const Py_UNICODE *e;
				4370
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4371	/* Shortcut for single character strings */
				4372	if (PyUnicode_GET_SIZE(self) == 1 &&
				4373	Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4374	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4375
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4376	/* Special case for empty strings */
				4377	if (PyString_GET_SIZE(self) == 0)
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4378	return PyBool_FromLong(0);
Marc-André Lemburg	60bc809	2000-06-14 09:18:32 +0000	[diff] [blame]	4379
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4380	e = p + PyUnicode_GET_SIZE(self);
				4381	for (; p < e; p++) {
				4382	if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4383	return PyBool_FromLong(0);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4384	}
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4385	return PyBool_FromLong(1);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4386	}
				4387
				4388	static char join__doc__[] =
				4389	"S.join(sequence) -> unicode\n\
				4390	\n\
				4391	Return a string which is the concatenation of the strings in the\n\
				4392	sequence. The separator between elements is S.";
				4393
				4394	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4395	unicode_join(PyObject self, PyObject data)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4396	{
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4397	return PyUnicode_Join(self, data);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4398	}
				4399
				4400	static int
				4401	unicode_length(PyUnicodeObject *self)
				4402	{
				4403	return self->length;
				4404	}
				4405
				4406	static char ljust__doc__[] =
				4407	"S.ljust(width) -> unicode\n\
				4408	\n\
				4409	Return S left justified in a Unicode string of length width. Padding is\n\
				4410	done using spaces.";
				4411
				4412	static PyObject *
				4413	unicode_ljust(PyUnicodeObject self, PyObject args)
				4414	{
				4415	int width;
				4416	if (!PyArg_ParseTuple(args, "i:ljust", &width))
				4417	return NULL;
				4418
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4419	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4420	Py_INCREF(self);
				4421	return (PyObject*) self;
				4422	}
				4423
				4424	return (PyObject*) pad(self, 0, width - self->length, ' ');
				4425	}
				4426
				4427	static char lower__doc__[] =
				4428	"S.lower() -> unicode\n\
				4429	\n\
				4430	Return a copy of the string S converted to lowercase.";
				4431
				4432	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4433	unicode_lower(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4434	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4435	return fixup(self, fixlower);
				4436	}
				4437
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)]+3)
				4446
				4447	static const Py_UNICODE *
				4448	unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
				4449	{
				4450	int i;
				4451	for (i = 0; i<n; ++i)
				4452	if (s[i]==c)
				4453	return s+i;
				4454	return NULL;
				4455	}
				4456
				4457	/* externally visible for str.strip(unicode) */
				4458	PyObject *
				4459	_PyUnicode_XStrip(PyUnicodeObject self, int striptype, PyObject sepobj)
				4460	{
				4461	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
				4462	int len = PyUnicode_GET_SIZE(self);
				4463	Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
				4464	int seplen = PyUnicode_GET_SIZE(sepobj);
				4465	int i, j;
				4466
				4467	i = 0;
				4468	if (striptype != RIGHTSTRIP) {
				4469	while (i < len && unicode_memchr(sep, s[i], seplen)) {
				4470	i++;
				4471	}
				4472	}
				4473
				4474	j = len;
				4475	if (striptype != LEFTSTRIP) {
				4476	do {
				4477	j--;
				4478	} while (j >= i && unicode_memchr(sep, s[j], seplen));
				4479	j++;
				4480	}
				4481
				4482	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
				4483	Py_INCREF(self);
				4484	return (PyObject*)self;
				4485	}
				4486	else
				4487	return PyUnicode_FromUnicode(s+i, j-i);
				4488	}
				4489
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4490
				4491	static PyObject *
Walter Dörwald	de02bcb	2002-04-22 17:42:37 +0000	[diff] [blame^]	4492	do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4493	{
Walter Dörwald	de02bcb	2002-04-22 17:42:37 +0000	[diff] [blame^]	4494	Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
				4495	int len = PyUnicode_GET_SIZE(self), i, j;
				4496
				4497	i = 0;
				4498	if (striptype != RIGHTSTRIP) {
				4499	while (i < len && Py_UNICODE_ISSPACE(s[i])) {
				4500	i++;
				4501	}
				4502	}
				4503
				4504	j = len;
				4505	if (striptype != LEFTSTRIP) {
				4506	do {
				4507	j--;
				4508	} while (j >= i && Py_UNICODE_ISSPACE(s[j]));
				4509	j++;
				4510	}
				4511
				4512	if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
				4513	Py_INCREF(self);
				4514	return (PyObject*)self;
				4515	}
				4516	else
				4517	return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4518	}
				4519
Walter Dörwald	de02bcb	2002-04-22 17:42:37 +0000	[diff] [blame^]	4520
				4521	static PyObject *
				4522	do_argstrip(PyUnicodeObject self, int striptype, PyObject args)
				4523	{
				4524	PyObject *sep = NULL;
				4525
				4526	if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
				4527	return NULL;
				4528
				4529	if (sep != NULL && sep != Py_None) {
				4530	if (PyUnicode_Check(sep))
				4531	return _PyUnicode_XStrip(self, striptype, sep);
				4532	else if (PyString_Check(sep)) {
				4533	PyObject *res;
				4534	sep = PyUnicode_FromObject(sep);
				4535	if (sep==NULL)
				4536	return NULL;
				4537	res = _PyUnicode_XStrip(self, striptype, sep);
				4538	Py_DECREF(sep);
				4539	return res;
				4540	}
				4541	else {
				4542	PyErr_Format(PyExc_TypeError,
				4543	"%s arg must be None, unicode or str",
				4544	STRIPNAME(striptype));
				4545	return NULL;
				4546	}
				4547	}
				4548
				4549	return do_strip(self, striptype);
				4550	}
				4551
				4552
				4553	static char strip__doc__[] =
				4554	"S.strip([sep]) -> unicode\n\
				4555	\n\
				4556	Return a copy of the string S with leading and trailing\n\
				4557	whitespace removed.\n\
				4558	If sep is given and not None, remove characters in sep instead.\n\
				4559	If sep is a str, it will be converted to unicode before stripping";
				4560
				4561	static PyObject *
				4562	unicode_strip(PyUnicodeObject self, PyObject args)
				4563	{
				4564	if (PyTuple_GET_SIZE(args) == 0)
				4565	return do_strip(self, BOTHSTRIP); /* Common case */
				4566	else
				4567	return do_argstrip(self, BOTHSTRIP, args);
				4568	}
				4569
				4570
				4571	static char lstrip__doc__[] =
				4572	"S.lstrip([sep]) -> unicode\n\
				4573	\n\
				4574	Return a copy of the string S with leading whitespace removed.\n\
				4575	If sep is given and not None, remove characters in sep instead.\n\
				4576	If sep is a str, it will be converted to unicode before stripping";
				4577
				4578	static PyObject *
				4579	unicode_lstrip(PyUnicodeObject self, PyObject args)
				4580	{
				4581	if (PyTuple_GET_SIZE(args) == 0)
				4582	return do_strip(self, LEFTSTRIP); /* Common case */
				4583	else
				4584	return do_argstrip(self, LEFTSTRIP, args);
				4585	}
				4586
				4587
				4588	static char rstrip__doc__[] =
				4589	"S.rstrip([sep]) -> unicode\n\
				4590	\n\
				4591	Return a copy of the string S with trailing whitespace removed.\n\
				4592	If sep is given and not None, remove characters in sep instead.\n\
				4593	If sep is a str, it will be converted to unicode before stripping";
				4594
				4595	static PyObject *
				4596	unicode_rstrip(PyUnicodeObject self, PyObject args)
				4597	{
				4598	if (PyTuple_GET_SIZE(args) == 0)
				4599	return do_strip(self, RIGHTSTRIP); /* Common case */
				4600	else
				4601	return do_argstrip(self, RIGHTSTRIP, args);
				4602	}
				4603
				4604
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4605	static PyObject*
				4606	unicode_repeat(PyUnicodeObject *str, int len)
				4607	{
				4608	PyUnicodeObject *u;
				4609	Py_UNICODE *p;
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4610	int nchars;
				4611	size_t nbytes;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4612
				4613	if (len < 0)
				4614	len = 0;
				4615
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4616	if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4617	/* no repeat, return original string */
				4618	Py_INCREF(str);
				4619	return (PyObject*) str;
				4620	}
Tim Peters	8f42246	2000-09-09 06:13:41 +0000	[diff] [blame]	4621
				4622	/* ensure # of chars needed doesn't overflow int and # of bytes
				4623	* needed doesn't overflow size_t
				4624	*/
				4625	nchars = len * str->length;
				4626	if (len && nchars / len != str->length) {
				4627	PyErr_SetString(PyExc_OverflowError,
				4628	"repeated string is too long");
				4629	return NULL;
				4630	}
				4631	nbytes = (nchars + 1) * sizeof(Py_UNICODE);
				4632	if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
				4633	PyErr_SetString(PyExc_OverflowError,
				4634	"repeated string is too long");
				4635	return NULL;
				4636	}
				4637	u = _PyUnicode_New(nchars);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4638	if (!u)
				4639	return NULL;
				4640
				4641	p = u->str;
				4642
				4643	while (len-- > 0) {
				4644	Py_UNICODE_COPY(p, str->str, str->length);
				4645	p += str->length;
				4646	}
				4647
				4648	return (PyObject*) u;
				4649	}
				4650
				4651	PyObject PyUnicode_Replace(PyObject obj,
				4652	PyObject *subobj,
				4653	PyObject *replobj,
				4654	int maxcount)
				4655	{
				4656	PyObject *self;
				4657	PyObject *str1;
				4658	PyObject *str2;
				4659	PyObject *result;
				4660
				4661	self = PyUnicode_FromObject(obj);
				4662	if (self == NULL)
				4663	return NULL;
				4664	str1 = PyUnicode_FromObject(subobj);
				4665	if (str1 == NULL) {
				4666	Py_DECREF(self);
				4667	return NULL;
				4668	}
				4669	str2 = PyUnicode_FromObject(replobj);
				4670	if (str2 == NULL) {
				4671	Py_DECREF(self);
				4672	Py_DECREF(str1);
				4673	return NULL;
				4674	}
				4675	result = replace((PyUnicodeObject *)self,
				4676	(PyUnicodeObject *)str1,
				4677	(PyUnicodeObject *)str2,
				4678	maxcount);
				4679	Py_DECREF(self);
				4680	Py_DECREF(str1);
				4681	Py_DECREF(str2);
				4682	return result;
				4683	}
				4684
				4685	static char replace__doc__[] =
				4686	"S.replace (old, new[, maxsplit]) -> unicode\n\
				4687	\n\
				4688	Return a copy of S with all occurrences of substring\n\
				4689	old replaced by new. If the optional argument maxsplit is\n\
				4690	given, only the first maxsplit occurrences are replaced.";
				4691
				4692	static PyObject*
				4693	unicode_replace(PyUnicodeObject self, PyObject args)
				4694	{
				4695	PyUnicodeObject *str1;
				4696	PyUnicodeObject *str2;
				4697	int maxcount = -1;
				4698	PyObject *result;
				4699
				4700	if (!PyArg_ParseTuple(args, "OO\|i:replace", &str1, &str2, &maxcount))
				4701	return NULL;
				4702	str1 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str1);
				4703	if (str1 == NULL)
				4704	return NULL;
				4705	str2 = (PyUnicodeObject )PyUnicode_FromObject((PyObject )str2);
				4706	if (str2 == NULL)
				4707	return NULL;
				4708
				4709	result = replace(self, str1, str2, maxcount);
				4710
				4711	Py_DECREF(str1);
				4712	Py_DECREF(str2);
				4713	return result;
				4714	}
				4715
				4716	static
				4717	PyObject unicode_repr(PyObject unicode)
				4718	{
				4719	return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
				4720	PyUnicode_GET_SIZE(unicode),
				4721	1);
				4722	}
				4723
				4724	static char rfind__doc__[] =
				4725	"S.rfind(sub [,start [,end]]) -> int\n\
				4726	\n\
				4727	Return the highest index in S where substring sub is found,\n\
				4728	such that sub is contained within s[start,end]. Optional\n\
				4729	arguments start and end are interpreted as in slice notation.\n\
				4730	\n\
				4731	Return -1 on failure.";
				4732
				4733	static PyObject *
				4734	unicode_rfind(PyUnicodeObject self, PyObject args)
				4735	{
				4736	PyUnicodeObject *substring;
				4737	int start = 0;
				4738	int end = INT_MAX;
				4739	PyObject *result;
				4740
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4741	if (!PyArg_ParseTuple(args, "O\|O&O&:rfind", &substring,
				4742	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4743	return NULL;
				4744	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4745	(PyObject *)substring);
				4746	if (substring == NULL)
				4747	return NULL;
				4748
				4749	result = PyInt_FromLong(findstring(self, substring, start, end, -1));
				4750
				4751	Py_DECREF(substring);
				4752	return result;
				4753	}
				4754
				4755	static char rindex__doc__[] =
				4756	"S.rindex(sub [,start [,end]]) -> int\n\
				4757	\n\
				4758	Like S.rfind() but raise ValueError when the substring is not found.";
				4759
				4760	static PyObject *
				4761	unicode_rindex(PyUnicodeObject self, PyObject args)
				4762	{
				4763	int result;
				4764	PyUnicodeObject *substring;
				4765	int start = 0;
				4766	int end = INT_MAX;
				4767
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	4768	if (!PyArg_ParseTuple(args, "O\|O&O&:rindex", &substring,
				4769	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4770	return NULL;
				4771	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				4772	(PyObject *)substring);
				4773	if (substring == NULL)
				4774	return NULL;
				4775
				4776	result = findstring(self, substring, start, end, -1);
				4777
				4778	Py_DECREF(substring);
				4779	if (result < 0) {
				4780	PyErr_SetString(PyExc_ValueError, "substring not found");
				4781	return NULL;
				4782	}
				4783	return PyInt_FromLong(result);
				4784	}
				4785
				4786	static char rjust__doc__[] =
				4787	"S.rjust(width) -> unicode\n\
				4788	\n\
				4789	Return S right justified in a Unicode string of length width. Padding is\n\
				4790	done using spaces.";
				4791
				4792	static PyObject *
				4793	unicode_rjust(PyUnicodeObject self, PyObject args)
				4794	{
				4795	int width;
				4796	if (!PyArg_ParseTuple(args, "i:rjust", &width))
				4797	return NULL;
				4798
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4799	if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4800	Py_INCREF(self);
				4801	return (PyObject*) self;
				4802	}
				4803
				4804	return (PyObject*) pad(self, width - self->length, 0, ' ');
				4805	}
				4806
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4807	static PyObject*
				4808	unicode_slice(PyUnicodeObject *self, int start, int end)
				4809	{
				4810	/* standard clamping */
				4811	if (start < 0)
				4812	start = 0;
				4813	if (end < 0)
				4814	end = 0;
				4815	if (end > self->length)
				4816	end = self->length;
Tim Peters	7a29bd5	2001-09-12 03:03:31 +0000	[diff] [blame]	4817	if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4818	/* full slice, return original string */
				4819	Py_INCREF(self);
				4820	return (PyObject*) self;
				4821	}
				4822	if (start > end)
				4823	start = end;
				4824	/* copy slice */
				4825	return (PyObject*) PyUnicode_FromUnicode(self->str + start,
				4826	end - start);
				4827	}
				4828
				4829	PyObject PyUnicode_Split(PyObject s,
				4830	PyObject *sep,
				4831	int maxsplit)
				4832	{
				4833	PyObject *result;
				4834
				4835	s = PyUnicode_FromObject(s);
				4836	if (s == NULL)
				4837	return NULL;
				4838	if (sep != NULL) {
				4839	sep = PyUnicode_FromObject(sep);
				4840	if (sep == NULL) {
				4841	Py_DECREF(s);
				4842	return NULL;
				4843	}
				4844	}
				4845
				4846	result = split((PyUnicodeObject )s, (PyUnicodeObject )sep, maxsplit);
				4847
				4848	Py_DECREF(s);
				4849	Py_XDECREF(sep);
				4850	return result;
				4851	}
				4852
				4853	static char split__doc__[] =
				4854	"S.split([sep [,maxsplit]]) -> list of strings\n\
				4855	\n\
				4856	Return a list of the words in S, using sep as the\n\
				4857	delimiter string. If maxsplit is given, at most maxsplit\n\
				4858	splits are done. If sep is not specified, any whitespace string\n\
				4859	is a separator.";
				4860
				4861	static PyObject*
				4862	unicode_split(PyUnicodeObject self, PyObject args)
				4863	{
				4864	PyObject *substring = Py_None;
				4865	int maxcount = -1;
				4866
				4867	if (!PyArg_ParseTuple(args, "\|Oi:split", &substring, &maxcount))
				4868	return NULL;
				4869
				4870	if (substring == Py_None)
				4871	return split(self, NULL, maxcount);
				4872	else if (PyUnicode_Check(substring))
				4873	return split(self, (PyUnicodeObject *)substring, maxcount);
				4874	else
				4875	return PyUnicode_Split((PyObject *)self, substring, maxcount);
				4876	}
				4877
				4878	static char splitlines__doc__[] =
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4879	"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4880	\n\
				4881	Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4882	Line breaks are not included in the resulting list unless keepends\n\
				4883	is given and true.";
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4884
				4885	static PyObject*
				4886	unicode_splitlines(PyUnicodeObject self, PyObject args)
				4887	{
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4888	int keepends = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4889
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4890	if (!PyArg_ParseTuple(args, "\|i:splitlines", &keepends))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4891	return NULL;
				4892
Guido van Rossum	8666291	2000-04-11 15:38:46 +0000	[diff] [blame]	4893	return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4894	}
				4895
				4896	static
				4897	PyObject unicode_str(PyUnicodeObject self)
				4898	{
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	4899	return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4900	}
				4901
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4902	static char swapcase__doc__[] =
				4903	"S.swapcase() -> unicode\n\
				4904	\n\
				4905	Return a copy of S with uppercase characters converted to lowercase\n\
				4906	and vice versa.";
				4907
				4908	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4909	unicode_swapcase(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4910	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4911	return fixup(self, fixswapcase);
				4912	}
				4913
				4914	static char translate__doc__[] =
				4915	"S.translate(table) -> unicode\n\
				4916	\n\
				4917	Return a copy of the string S, where all characters have been mapped\n\
				4918	through the given translation table, which must be a mapping of\n\
				4919	Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
				4920	are left untouched. Characters mapped to None are deleted.";
				4921
				4922	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4923	unicode_translate(PyUnicodeObject self, PyObject table)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4924	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4925	return PyUnicode_TranslateCharmap(self->str,
				4926	self->length,
				4927	table,
				4928	"ignore");
				4929	}
				4930
				4931	static char upper__doc__[] =
				4932	"S.upper() -> unicode\n\
				4933	\n\
				4934	Return a copy of S converted to uppercase.";
				4935
				4936	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4937	unicode_upper(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4938	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4939	return fixup(self, fixupper);
				4940	}
				4941
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4942	static char zfill__doc__[] =
				4943	"S.zfill(width) -> unicode\n\
				4944	\n\
				4945	Pad a numeric string x with zeros on the left, to fill a field\n\
				4946	of the specified width. The string x is never truncated.";
				4947
				4948	static PyObject *
				4949	unicode_zfill(PyUnicodeObject self, PyObject args)
				4950	{
				4951	int fill;
				4952	PyUnicodeObject *u;
				4953
				4954	int width;
				4955	if (!PyArg_ParseTuple(args, "i:zfill", &width))
				4956	return NULL;
				4957
				4958	if (self->length >= width) {
Walter Dörwald	0fe940c	2002-04-15 18:42:15 +0000	[diff] [blame]	4959	if (PyUnicode_CheckExact(self)) {
				4960	Py_INCREF(self);
				4961	return (PyObject*) self;
				4962	}
				4963	else
				4964	return PyUnicode_FromUnicode(
				4965	PyUnicode_AS_UNICODE(self),
				4966	PyUnicode_GET_SIZE(self)
				4967	);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4968	}
				4969
				4970	fill = width - self->length;
				4971
				4972	u = pad(self, fill, 0, '0');
				4973
Walter Dörwald	068325e	2002-04-15 13:36:47 +0000	[diff] [blame]	4974	if (u == NULL)
				4975	return NULL;
				4976
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4977	if (u->str[fill] == '+' \|\| u->str[fill] == '-') {
				4978	/* move sign to beginning of string */
				4979	u->str[0] = u->str[fill];
				4980	u->str[fill] = '0';
				4981	}
				4982
				4983	return (PyObject*) u;
				4984	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4985
				4986	#if 0
				4987	static PyObject*
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	4988	unicode_freelistsize(PyUnicodeObject *self)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4989	{
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4990	return PyInt_FromLong(unicode_freelist_size);
				4991	}
				4992	#endif
				4993
				4994	static char startswith__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4995	"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4996	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	4997	Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	4998	optional start, test S beginning at that position. With optional end, stop\n\
				4999	comparing S at that position.";
				5000
				5001	static PyObject *
				5002	unicode_startswith(PyUnicodeObject *self,
				5003	PyObject *args)
				5004	{
				5005	PyUnicodeObject *substring;
				5006	int start = 0;
				5007	int end = INT_MAX;
				5008	PyObject *result;
				5009
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	5010	if (!PyArg_ParseTuple(args, "O\|O&O&:startswith", &substring,
				5011	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5012	return NULL;
				5013	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				5014	(PyObject *)substring);
				5015	if (substring == NULL)
				5016	return NULL;
				5017
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	5018	result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5019
				5020	Py_DECREF(substring);
				5021	return result;
				5022	}
				5023
				5024
				5025	static char endswith__doc__[] =
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	5026	"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5027	\n\
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	5028	Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5029	optional start, test S beginning at that position. With optional end, stop\n\
				5030	comparing S at that position.";
				5031
				5032	static PyObject *
				5033	unicode_endswith(PyUnicodeObject *self,
				5034	PyObject *args)
				5035	{
				5036	PyUnicodeObject *substring;
				5037	int start = 0;
				5038	int end = INT_MAX;
				5039	PyObject *result;
				5040
Guido van Rossum	b8872e6	2000-05-09 14:14:27 +0000	[diff] [blame]	5041	if (!PyArg_ParseTuple(args, "O\|O&O&:endswith", &substring,
				5042	_PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5043	return NULL;
				5044	substring = (PyUnicodeObject *)PyUnicode_FromObject(
				5045	(PyObject *)substring);
				5046	if (substring == NULL)
				5047	return NULL;
				5048
Guido van Rossum	77f6a65	2002-04-03 22:41:51 +0000	[diff] [blame]	5049	result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5050
				5051	Py_DECREF(substring);
				5052	return result;
				5053	}
				5054
				5055
				5056	static PyMethodDef unicode_methods[] = {
				5057
				5058	/* Order is according to common usage: often used methods should
				5059	appear first, since lookup is done sequentially. */
				5060
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	5061	{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
				5062	{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
				5063	{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
				5064	{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
				5065	{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
				5066	{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
				5067	{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
				5068	{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
				5069	{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
				5070	{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
				5071	{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
				5072	{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
				5073	{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwald	de02bcb	2002-04-22 17:42:37 +0000	[diff] [blame^]	5074	{"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	5075	/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
				5076	{"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
				5077	{"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
				5078	{"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwald	de02bcb	2002-04-22 17:42:37 +0000	[diff] [blame^]	5079	{"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	5080	{"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwald	de02bcb	2002-04-22 17:42:37 +0000	[diff] [blame^]	5081	{"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	5082	{"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
				5083	{"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
				5084	{"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
				5085	{"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
				5086	{"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
				5087	{"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
				5088	{"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
				5089	{"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
				5090	{"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
				5091	{"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
				5092	{"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
				5093	{"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
				5094	{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
				5095	{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	5096	{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald	068325e	2002-04-15 13:36:47 +0000	[diff] [blame]	5097	#if 0
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	5098	{"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5099	#endif
				5100
				5101	#if 0
				5102	/* This one is just used for debugging the implementation. */
Martin v. Löwis	e3eb1f2	2001-08-16 13:15:00 +0000	[diff] [blame]	5103	{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5104	#endif
				5105
				5106	{NULL, NULL}
				5107	};
				5108
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5109	static PySequenceMethods unicode_as_sequence = {
				5110	(inquiry) unicode_length, /* sq_length */
				5111	(binaryfunc) PyUnicode_Concat, /* sq_concat */
				5112	(intargfunc) unicode_repeat, /* sq_repeat */
				5113	(intargfunc) unicode_getitem, /* sq_item */
				5114	(intintargfunc) unicode_slice, /* sq_slice */
				5115	0, /* sq_ass_item */
				5116	0, /* sq_ass_slice */
Guido van Rossum	403d68b	2000-03-13 15:55:09 +0000	[diff] [blame]	5117	(objobjproc)PyUnicode_Contains, /sq_contains/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5118	};
				5119
				5120	static int
				5121	unicode_buffer_getreadbuf(PyUnicodeObject *self,
				5122	int index,
				5123	const void **ptr)
				5124	{
				5125	if (index != 0) {
				5126	PyErr_SetString(PyExc_SystemError,
				5127	"accessing non-existent unicode segment");
				5128	return -1;
				5129	}
				5130	ptr = (void ) self->str;
				5131	return PyUnicode_GET_DATA_SIZE(self);
				5132	}
				5133
				5134	static int
				5135	unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
				5136	const void **ptr)
				5137	{
				5138	PyErr_SetString(PyExc_TypeError,
				5139	"cannot use unicode as modifyable buffer");
				5140	return -1;
				5141	}
				5142
				5143	static int
				5144	unicode_buffer_getsegcount(PyUnicodeObject *self,
				5145	int *lenp)
				5146	{
				5147	if (lenp)
				5148	*lenp = PyUnicode_GET_DATA_SIZE(self);
				5149	return 1;
				5150	}
				5151
				5152	static int
				5153	unicode_buffer_getcharbuf(PyUnicodeObject *self,
				5154	int index,
				5155	const void **ptr)
				5156	{
				5157	PyObject *str;
				5158
				5159	if (index != 0) {
				5160	PyErr_SetString(PyExc_SystemError,
				5161	"accessing non-existent unicode segment");
				5162	return -1;
				5163	}
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5164	str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5165	if (str == NULL)
				5166	return -1;
				5167	ptr = (void ) PyString_AS_STRING(str);
				5168	return PyString_GET_SIZE(str);
				5169	}
				5170
				5171	/* Helpers for PyUnicode_Format() */
				5172
				5173	static PyObject *
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5174	getnextarg(PyObject args, int arglen, int p_argidx)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5175	{
				5176	int argidx = *p_argidx;
				5177	if (argidx < arglen) {
				5178	(*p_argidx)++;
				5179	if (arglen < 0)
				5180	return args;
				5181	else
				5182	return PyTuple_GetItem(args, argidx);
				5183	}
				5184	PyErr_SetString(PyExc_TypeError,
				5185	"not enough arguments for format string");
				5186	return NULL;
				5187	}
				5188
				5189	#define F_LJUST (1<<0)
				5190	#define F_SIGN (1<<1)
				5191	#define F_BLANK (1<<2)
				5192	#define F_ALT (1<<3)
				5193	#define F_ZERO (1<<4)
				5194
				5195	static
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5196	int usprintf(register Py_UNICODE buffer, char format, ...)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5197	{
				5198	register int i;
				5199	int len;
				5200	va_list va;
				5201	char *charbuffer;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5202	va_start(va, format);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5203
				5204	/* First, format the string as char array, then expand to Py_UNICODE
				5205	array. */
				5206	charbuffer = (char *)buffer;
				5207	len = vsprintf(charbuffer, format, va);
				5208	for (i = len - 1; i >= 0; i--)
				5209	buffer[i] = (Py_UNICODE) charbuffer[i];
				5210
				5211	va_end(va);
				5212	return len;
				5213	}
				5214
				5215	static int
				5216	formatfloat(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5217	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5218	int flags,
				5219	int prec,
				5220	int type,
				5221	PyObject *v)
				5222	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5223	/* fmt = '%#.' + `prec` + `type`
				5224	worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5225	char fmt[20];
				5226	double x;
				5227
				5228	x = PyFloat_AsDouble(v);
				5229	if (x == -1.0 && PyErr_Occurred())
				5230	return -1;
				5231	if (prec < 0)
				5232	prec = 6;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5233	if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
				5234	type = 'g';
Barry Warsaw	e5c492d	2001-11-28 21:00:41 +0000	[diff] [blame]	5235	PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
				5236	(flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5237	/* worst case length calc to ensure no buffer overrun:
				5238	fmt = %#.<prec>g
				5239	buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
				5240	for any double rep.)
				5241	len = 1 + prec + 1 + 2 + 5 = 9 + prec
				5242	If prec=0 the effective precision is 1 (the leading digit is
				5243	always given), therefore increase by one to 10+prec. */
				5244	if (buflen <= (size_t)10 + (size_t)prec) {
				5245	PyErr_SetString(PyExc_OverflowError,
				5246	"formatted float is too long (precision too long?)");
				5247	return -1;
				5248	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5249	return usprintf(buf, fmt, x);
				5250	}
				5251
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5252	static PyObject*
				5253	formatlong(PyObject *val, int flags, int prec, int type)
				5254	{
				5255	char *buf;
				5256	int i, len;
				5257	PyObject str; / temporary string object. */
				5258	PyUnicodeObject *result;
				5259
				5260	str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
				5261	if (!str)
				5262	return NULL;
				5263	result = _PyUnicode_New(len);
				5264	for (i = 0; i < len; i++)
				5265	result->str[i] = buf[i];
				5266	result->str[len] = 0;
				5267	Py_DECREF(str);
				5268	return (PyObject*)result;
				5269	}
				5270
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5271	static int
				5272	formatint(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5273	size_t buflen,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5274	int flags,
				5275	int prec,
				5276	int type,
				5277	PyObject *v)
				5278	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5279	/* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5280	* worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
				5281	* + 1 + 1
				5282	* = 24
				5283	*/
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5284	char fmt[64]; /* plenty big enough! */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5285	long x;
				5286
				5287	x = PyInt_AsLong(v);
				5288	if (x == -1 && PyErr_Occurred())
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5289	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5290	if (prec < 0)
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5291	prec = 1;
				5292
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5293	/* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5294	* worst case buf = '0x' + [0-9]*prec, where prec >= 11
				5295	*/
				5296	if (buflen <= 13 \|\| buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5297	PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5298	"formatted integer is too long (precision too large?)");
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5299	return -1;
				5300	}
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5301
				5302	if ((flags & F_ALT) &&
				5303	(type == 'x' \|\| type == 'X')) {
				5304	/* When converting under %#x or %#X, there are a number
				5305	* of issues that cause pain:
				5306	* - when 0 is being converted, the C standard leaves off
				5307	* the '0x' or '0X', which is inconsistent with other
				5308	* %#x/%#X conversions and inconsistent with Python's
				5309	* hex() function
				5310	* - there are platforms that violate the standard and
				5311	* convert 0 with the '0x' or '0X'
				5312	* (Metrowerks, Compaq Tru64)
				5313	* - there are platforms that give '0x' when converting
				5314	* under %#X, but convert 0 in accordance with the
				5315	* standard (OS/2 EMX)
				5316	*
				5317	* We can achieve the desired consistency by inserting our
				5318	* own '0x' or '0X' prefix, and substituting %x/%X in place
				5319	* of %#x/%#X.
				5320	*
				5321	* Note that this is the same approach as used in
				5322	* formatint() in stringobject.c
Andrew MacIntyre	c487439	2002-02-26 11:36:35 +0000	[diff] [blame]	5323	*/
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5324	PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
				5325	type, prec, type);
Andrew MacIntyre	c487439	2002-02-26 11:36:35 +0000	[diff] [blame]	5326	}
Andrew MacIntyre	5e9c80d	2002-02-28 11:38:24 +0000	[diff] [blame]	5327	else {
				5328	PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
				5329	(flags&F_ALT) ? "#" : "",
				5330	prec, type);
Tim Peters	b3d8d1f	2001-04-28 05:38:26 +0000	[diff] [blame]	5331	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5332	return usprintf(buf, fmt, x);
				5333	}
				5334
				5335	static int
				5336	formatchar(Py_UNICODE *buf,
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5337	size_t buflen,
				5338	PyObject *v)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5339	{
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5340	/* presume that the buffer is at least 2 characters long */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5341	if (PyUnicode_Check(v)) {
				5342	if (PyUnicode_GET_SIZE(v) != 1)
				5343	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5344	buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5345	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5346
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5347	else if (PyString_Check(v)) {
				5348	if (PyString_GET_SIZE(v) != 1)
				5349	goto onError;
				5350	buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
				5351	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5352
				5353	else {
				5354	/* Integer input truncated to a character */
				5355	long x;
				5356	x = PyInt_AsLong(v);
				5357	if (x == -1 && PyErr_Occurred())
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5358	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5359	buf[0] = (char) x;
				5360	}
				5361	buf[1] = '\0';
				5362	return 1;
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5363
				5364	onError:
				5365	PyErr_SetString(PyExc_TypeError,
				5366	"%c requires int or char");
				5367	return -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5368	}
				5369
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is parenthesised so that the macro behaves as a single
   operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
				5379
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5380	PyObject PyUnicode_Format(PyObject format,
				5381	PyObject *args)
				5382	{
				5383	Py_UNICODE fmt, res;
				5384	int fmtcnt, rescnt, reslen, arglen, argidx;
				5385	int args_owned = 0;
				5386	PyUnicodeObject *result = NULL;
				5387	PyObject *dict = NULL;
				5388	PyObject *uformat;
				5389
				5390	if (format == NULL \|\| args == NULL) {
				5391	PyErr_BadInternalCall();
				5392	return NULL;
				5393	}
				5394	uformat = PyUnicode_FromObject(format);
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5395	if (uformat == NULL)
				5396	return NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5397	fmt = PyUnicode_AS_UNICODE(uformat);
				5398	fmtcnt = PyUnicode_GET_SIZE(uformat);
				5399
				5400	reslen = rescnt = fmtcnt + 100;
				5401	result = _PyUnicode_New(reslen);
				5402	if (result == NULL)
				5403	goto onError;
				5404	res = PyUnicode_AS_UNICODE(result);
				5405
				5406	if (PyTuple_Check(args)) {
				5407	arglen = PyTuple_Size(args);
				5408	argidx = 0;
				5409	}
				5410	else {
				5411	arglen = -1;
				5412	argidx = -2;
				5413	}
				5414	if (args->ob_type->tp_as_mapping)
				5415	dict = args;
				5416
				5417	while (--fmtcnt >= 0) {
				5418	if (*fmt != '%') {
				5419	if (--rescnt < 0) {
				5420	rescnt = fmtcnt + 100;
				5421	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5422	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5423	return NULL;
				5424	res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
				5425	--rescnt;
				5426	}
				5427	res++ = fmt++;
				5428	}
				5429	else {
				5430	/* Got a format specifier */
				5431	int flags = 0;
				5432	int width = -1;
				5433	int prec = -1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5434	Py_UNICODE c = '\0';
				5435	Py_UNICODE fill;
				5436	PyObject *v = NULL;
				5437	PyObject *temp = NULL;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5438	Py_UNICODE *pbuf;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5439	Py_UNICODE sign;
				5440	int len;
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5441	Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5442
				5443	fmt++;
				5444	if (*fmt == '(') {
				5445	Py_UNICODE *keystart;
				5446	int keylen;
				5447	PyObject *key;
				5448	int pcount = 1;
				5449
				5450	if (dict == NULL) {
				5451	PyErr_SetString(PyExc_TypeError,
				5452	"format requires a mapping");
				5453	goto onError;
				5454	}
				5455	++fmt;
				5456	--fmtcnt;
				5457	keystart = fmt;
				5458	/* Skip over balanced parentheses */
				5459	while (pcount > 0 && --fmtcnt >= 0) {
				5460	if (*fmt == ')')
				5461	--pcount;
				5462	else if (*fmt == '(')
				5463	++pcount;
				5464	fmt++;
				5465	}
				5466	keylen = fmt - keystart - 1;
				5467	if (fmtcnt < 0 \|\| pcount > 0) {
				5468	PyErr_SetString(PyExc_ValueError,
				5469	"incomplete format key");
				5470	goto onError;
				5471	}
Marc-André Lemburg	72f8213	2001-11-20 15:18:49 +0000	[diff] [blame]	5472	#if 0
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5473	/* keys are converted to strings using UTF-8 and
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5474	then looked up since Python uses strings to hold
				5475	variables names etc. in its namespaces and we
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5476	wouldn't want to break common idioms. */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5477	key = PyUnicode_EncodeUTF8(keystart,
				5478	keylen,
				5479	NULL);
Marc-André Lemburg	72f8213	2001-11-20 15:18:49 +0000	[diff] [blame]	5480	#else
				5481	key = PyUnicode_FromUnicode(keystart, keylen);
				5482	#endif
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5483	if (key == NULL)
				5484	goto onError;
				5485	if (args_owned) {
				5486	Py_DECREF(args);
				5487	args_owned = 0;
				5488	}
				5489	args = PyObject_GetItem(dict, key);
				5490	Py_DECREF(key);
				5491	if (args == NULL) {
				5492	goto onError;
				5493	}
				5494	args_owned = 1;
				5495	arglen = -1;
				5496	argidx = -2;
				5497	}
				5498	while (--fmtcnt >= 0) {
				5499	switch (c = *fmt++) {
				5500	case '-': flags \|= F_LJUST; continue;
				5501	case '+': flags \|= F_SIGN; continue;
				5502	case ' ': flags \|= F_BLANK; continue;
				5503	case '#': flags \|= F_ALT; continue;
				5504	case '0': flags \|= F_ZERO; continue;
				5505	}
				5506	break;
				5507	}
				5508	if (c == '*') {
				5509	v = getnextarg(args, arglen, &argidx);
				5510	if (v == NULL)
				5511	goto onError;
				5512	if (!PyInt_Check(v)) {
				5513	PyErr_SetString(PyExc_TypeError,
				5514	"* wants int");
				5515	goto onError;
				5516	}
				5517	width = PyInt_AsLong(v);
				5518	if (width < 0) {
				5519	flags \|= F_LJUST;
				5520	width = -width;
				5521	}
				5522	if (--fmtcnt >= 0)
				5523	c = *fmt++;
				5524	}
				5525	else if (c >= '0' && c <= '9') {
				5526	width = c - '0';
				5527	while (--fmtcnt >= 0) {
				5528	c = *fmt++;
				5529	if (c < '0' \|\| c > '9')
				5530	break;
				5531	if ((width*10) / 10 != width) {
				5532	PyErr_SetString(PyExc_ValueError,
				5533	"width too big");
				5534	goto onError;
				5535	}
				5536	width = width*10 + (c - '0');
				5537	}
				5538	}
				5539	if (c == '.') {
				5540	prec = 0;
				5541	if (--fmtcnt >= 0)
				5542	c = *fmt++;
				5543	if (c == '*') {
				5544	v = getnextarg(args, arglen, &argidx);
				5545	if (v == NULL)
				5546	goto onError;
				5547	if (!PyInt_Check(v)) {
				5548	PyErr_SetString(PyExc_TypeError,
				5549	"* wants int");
				5550	goto onError;
				5551	}
				5552	prec = PyInt_AsLong(v);
				5553	if (prec < 0)
				5554	prec = 0;
				5555	if (--fmtcnt >= 0)
				5556	c = *fmt++;
				5557	}
				5558	else if (c >= '0' && c <= '9') {
				5559	prec = c - '0';
				5560	while (--fmtcnt >= 0) {
				5561	c = Py_CHARMASK(*fmt++);
				5562	if (c < '0' \|\| c > '9')
				5563	break;
				5564	if ((prec*10) / 10 != prec) {
				5565	PyErr_SetString(PyExc_ValueError,
				5566	"prec too big");
				5567	goto onError;
				5568	}
				5569	prec = prec*10 + (c - '0');
				5570	}
				5571	}
				5572	} /* prec */
				5573	if (fmtcnt >= 0) {
				5574	if (c == 'h' \|\| c == 'l' \|\| c == 'L') {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5575	if (--fmtcnt >= 0)
				5576	c = *fmt++;
				5577	}
				5578	}
				5579	if (fmtcnt < 0) {
				5580	PyErr_SetString(PyExc_ValueError,
				5581	"incomplete format");
				5582	goto onError;
				5583	}
				5584	if (c != '%') {
				5585	v = getnextarg(args, arglen, &argidx);
				5586	if (v == NULL)
				5587	goto onError;
				5588	}
				5589	sign = 0;
				5590	fill = ' ';
				5591	switch (c) {
				5592
				5593	case '%':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5594	pbuf = formatbuf;
				5595	/* presume that buffer length is at least 1 */
				5596	pbuf[0] = '%';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5597	len = 1;
				5598	break;
				5599
				5600	case 's':
				5601	case 'r':
				5602	if (PyUnicode_Check(v) && c == 's') {
				5603	temp = v;
				5604	Py_INCREF(temp);
				5605	}
				5606	else {
				5607	PyObject *unicode;
				5608	if (c == 's')
				5609	temp = PyObject_Str(v);
				5610	else
				5611	temp = PyObject_Repr(v);
				5612	if (temp == NULL)
				5613	goto onError;
				5614	if (!PyString_Check(temp)) {
				5615	/* XXX Note: this should never happen, since
				5616	PyObject_Repr() and PyObject_Str() assure
				5617	this */
				5618	Py_DECREF(temp);
				5619	PyErr_SetString(PyExc_TypeError,
				5620	"%s argument has non-string str()");
				5621	goto onError;
				5622	}
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5623	unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5624	PyString_GET_SIZE(temp),
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5625	NULL,
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5626	"strict");
				5627	Py_DECREF(temp);
				5628	temp = unicode;
				5629	if (temp == NULL)
				5630	goto onError;
				5631	}
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5632	pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5633	len = PyUnicode_GET_SIZE(temp);
				5634	if (prec >= 0 && len > prec)
				5635	len = prec;
				5636	break;
				5637
				5638	case 'i':
				5639	case 'd':
				5640	case 'u':
				5641	case 'o':
				5642	case 'x':
				5643	case 'X':
				5644	if (c == 'i')
				5645	c = 'd';
Tim Peters	a3a3a03	2000-11-30 05:22:44 +0000	[diff] [blame]	5646	if (PyLong_Check(v)) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5647	temp = formatlong(v, flags, prec, c);
				5648	if (!temp)
				5649	goto onError;
				5650	pbuf = PyUnicode_AS_UNICODE(temp);
				5651	len = PyUnicode_GET_SIZE(temp);
				5652	/* unbounded ints can always produce
				5653	a sign character! */
				5654	sign = 1;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5655	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5656	else {
				5657	pbuf = formatbuf;
				5658	len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5659	flags, prec, c, v);
				5660	if (len < 0)
				5661	goto onError;
				5662	/* only d conversion is signed */
				5663	sign = c == 'd';
				5664	}
				5665	if (flags & F_ZERO)
				5666	fill = '0';
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5667	break;
				5668
				5669	case 'e':
				5670	case 'E':
				5671	case 'f':
				5672	case 'g':
				5673	case 'G':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5674	pbuf = formatbuf;
				5675	len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
				5676	flags, prec, c, v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5677	if (len < 0)
				5678	goto onError;
				5679	sign = 1;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5680	if (flags & F_ZERO)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5681	fill = '0';
				5682	break;
				5683
				5684	case 'c':
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5685	pbuf = formatbuf;
				5686	len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5687	if (len < 0)
				5688	goto onError;
				5689	break;
				5690
				5691	default:
				5692	PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling	6ca8917	2000-12-15 13:07:46 +0000	[diff] [blame]	5693	"unsupported format character '%c' (0x%x) "
				5694	"at index %i",
Andrew M. Kuchling	f947ffe	2000-12-19 22:49:06 +0000	[diff] [blame]	5695	(31<=c && c<=126) ? c : '?',
				5696	c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5697	goto onError;
				5698	}
				5699	if (sign) {
Marc-André Lemburg	f28dd83	2000-06-30 10:29:57 +0000	[diff] [blame]	5700	if (pbuf == '-' \|\| pbuf == '+') {
				5701	sign = *pbuf++;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5702	len--;
				5703	}
				5704	else if (flags & F_SIGN)
				5705	sign = '+';
				5706	else if (flags & F_BLANK)
				5707	sign = ' ';
				5708	else
				5709	sign = 0;
				5710	}
				5711	if (width < len)
				5712	width = len;
				5713	if (rescnt < width + (sign != 0)) {
				5714	reslen -= rescnt;
				5715	rescnt = width + fmtcnt + 100;
				5716	reslen += rescnt;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5717	if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5718	return NULL;
				5719	res = PyUnicode_AS_UNICODE(result)
				5720	+ reslen - rescnt;
				5721	}
				5722	if (sign) {
				5723	if (fill != ' ')
				5724	*res++ = sign;
				5725	rescnt--;
				5726	if (width > len)
				5727	width--;
				5728	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5729	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
				5730	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5731	assert(pbuf[1] == c);
				5732	if (fill != ' ') {
				5733	res++ = pbuf++;
				5734	res++ = pbuf++;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5735	}
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5736	rescnt -= 2;
				5737	width -= 2;
				5738	if (width < 0)
				5739	width = 0;
				5740	len -= 2;
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5741	}
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5742	if (width > len && !(flags & F_LJUST)) {
				5743	do {
				5744	--rescnt;
				5745	*res++ = fill;
				5746	} while (--width > len);
				5747	}
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5748	if (fill == ' ') {
				5749	if (sign)
				5750	*res++ = sign;
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5751	if ((flags & F_ALT) && (c == 'x' \|\| c == 'X')) {
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5752	assert(pbuf[0] == '0');
Tim Peters	fff5325	2001-04-12 18:38:48 +0000	[diff] [blame]	5753	assert(pbuf[1] == c);
Tim Peters	38fd5b6	2000-09-21 05:43:11 +0000	[diff] [blame]	5754	res++ = pbuf++;
				5755	res++ = pbuf++;
				5756	}
				5757	}
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5758	Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5759	res += len;
				5760	rescnt -= len;
				5761	while (--width >= len) {
				5762	--rescnt;
				5763	*res++ = ' ';
				5764	}
				5765	if (dict && (argidx < arglen) && c != '%') {
				5766	PyErr_SetString(PyExc_TypeError,
				5767	"not all arguments converted");
				5768	goto onError;
				5769	}
				5770	Py_XDECREF(temp);
				5771	} /* '%' */
				5772	} /* until end */
				5773	if (argidx < arglen && !dict) {
				5774	PyErr_SetString(PyExc_TypeError,
				5775	"not all arguments converted");
				5776	goto onError;
				5777	}
				5778
				5779	if (args_owned) {
				5780	Py_DECREF(args);
				5781	}
				5782	Py_DECREF(uformat);
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5783	if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5784	goto onError;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5785	return (PyObject *)result;
				5786
				5787	onError:
				5788	Py_XDECREF(result);
				5789	Py_DECREF(uformat);
				5790	if (args_owned) {
				5791	Py_DECREF(args);
				5792	}
				5793	return NULL;
				5794	}
				5795
				5796	static PyBufferProcs unicode_as_buffer = {
				5797	(getreadbufferproc) unicode_buffer_getreadbuf,
				5798	(getwritebufferproc) unicode_buffer_getwritebuf,
				5799	(getsegcountproc) unicode_buffer_getsegcount,
				5800	(getcharbufferproc) unicode_buffer_getcharbuf,
				5801	};
				5802
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5803	staticforward PyObject *
				5804	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds);
				5805
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5806	static PyObject *
				5807	unicode_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5808	{
				5809	PyObject *x = NULL;
				5810	static char *kwlist[] = {"string", "encoding", "errors", 0};
				5811	char *encoding = NULL;
				5812	char *errors = NULL;
				5813
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5814	if (type != &PyUnicode_Type)
				5815	return unicode_subtype_new(type, args, kwds);
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5816	if (!PyArg_ParseTupleAndKeywords(args, kwds, "\|Oss:unicode",
				5817	kwlist, &x, &encoding, &errors))
				5818	return NULL;
				5819	if (x == NULL)
				5820	return (PyObject *)_PyUnicode_New(0);
Guido van Rossum	b8c65bc	2001-10-19 02:01:31 +0000	[diff] [blame]	5821	if (encoding == NULL && errors == NULL)
				5822	return PyObject_Unicode(x);
				5823	else
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5824	return PyUnicode_FromEncodedObject(x, encoding, errors);
				5825	}
				5826
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5827	static PyObject *
				5828	unicode_subtype_new(PyTypeObject type, PyObject args, PyObject *kwds)
				5829	{
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5830	PyUnicodeObject tmp, pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5831	int n;
				5832
				5833	assert(PyType_IsSubtype(type, &PyUnicode_Type));
				5834	tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
				5835	if (tmp == NULL)
				5836	return NULL;
				5837	assert(PyUnicode_Check(tmp));
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5838	pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
				5839	if (pnew == NULL)
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5840	return NULL;
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5841	pnew->str = PyMem_NEW(Py_UNICODE, n+1);
				5842	if (pnew->str == NULL) {
				5843	_Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer	58aa861	2002-04-12 03:07:20 +0000	[diff] [blame]	5844	PyObject_Del(pnew);
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5845	return NULL;
				5846	}
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5847	Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
				5848	pnew->length = n;
				5849	pnew->hash = tmp->hash;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5850	Py_DECREF(tmp);
Tim Peters	af90b3e	2001-09-12 05:18:58 +0000	[diff] [blame]	5851	return (PyObject *)pnew;
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5852	}
				5853
/* Docstring of the unicode type (PyUnicode_Type.tp_doc).  Written as
   adjacent string literals so the text does not depend on the exact
   column at which continuation lines start. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
				5860
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5861	PyTypeObject PyUnicode_Type = {
				5862	PyObject_HEAD_INIT(&PyType_Type)
				5863	0, /* ob_size */
				5864	"unicode", /* tp_name */
				5865	sizeof(PyUnicodeObject), /* tp_size */
				5866	0, /* tp_itemsize */
				5867	/* Slots */
Guido van Rossum	9475a23	2001-10-05 20:51:39 +0000	[diff] [blame]	5868	(destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5869	0, /* tp_print */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5870	0, /* tp_getattr */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5871	0, /* tp_setattr */
				5872	(cmpfunc) unicode_compare, /* tp_compare */
				5873	(reprfunc) unicode_repr, /* tp_repr */
				5874	0, /* tp_as_number */
				5875	&unicode_as_sequence, /* tp_as_sequence */
				5876	0, /* tp_as_mapping */
				5877	(hashfunc) unicode_hash, /* tp_hash*/
				5878	0, /* tp_call*/
				5879	(reprfunc) unicode_str, /* tp_str */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5880	PyObject_GenericGetAttr, /* tp_getattro */
				5881	0, /* tp_setattro */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5882	&unicode_as_buffer, /* tp_as_buffer */
Guido van Rossum	e023fe0	2001-08-30 03:12:59 +0000	[diff] [blame]	5883	Py_TPFLAGS_DEFAULT \| Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters	6d6c1a3	2001-08-02 04:15:00 +0000	[diff] [blame]	5884	unicode_doc, /* tp_doc */
				5885	0, /* tp_traverse */
				5886	0, /* tp_clear */
				5887	0, /* tp_richcompare */
				5888	0, /* tp_weaklistoffset */
				5889	0, /* tp_iter */
				5890	0, /* tp_iternext */
				5891	unicode_methods, /* tp_methods */
				5892	0, /* tp_members */
				5893	0, /* tp_getset */
				5894	0, /* tp_base */
				5895	0, /* tp_dict */
				5896	0, /* tp_descr_get */
				5897	0, /* tp_descr_set */
				5898	0, /* tp_dictoffset */
				5899	0, /* tp_init */
				5900	0, /* tp_alloc */
				5901	unicode_new, /* tp_new */
Neil Schemenauer	58aa861	2002-04-12 03:07:20 +0000	[diff] [blame]	5902	PyObject_Del, /* tp_free */
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5903	};
				5904
				5905	/* Initialize the Unicode implementation */
				5906
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5907	void _PyUnicode_Init(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5908	{
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5909	int i;
				5910
Fred Drake	e4315f5	2000-05-09 19:53:39 +0000	[diff] [blame]	5911	/* Init the implementation */
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5912	unicode_freelist = NULL;
				5913	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5914	unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg	90e8147	2000-06-07 09:13:21 +0000	[diff] [blame]	5915	strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5916	for (i = 0; i < 256; i++)
				5917	unicode_latin1[i] = NULL;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5918	}
				5919
				5920	/* Finalize the Unicode implementation */
				5921
				5922	void
Thomas Wouters	7889010	2000-07-22 19:25:51 +0000	[diff] [blame]	5923	_PyUnicode_Fini(void)
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5924	{
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5925	PyUnicodeObject *u;
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5926	int i;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5927
Guido van Rossum	4ae8ef8	2000-10-03 18:09:04 +0000	[diff] [blame]	5928	Py_XDECREF(unicode_empty);
				5929	unicode_empty = NULL;
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5930
Marc-André Lemburg	8155e0e	2001-04-23 14:44:21 +0000	[diff] [blame]	5931	for (i = 0; i < 256; i++) {
				5932	if (unicode_latin1[i]) {
				5933	Py_DECREF(unicode_latin1[i]);
				5934	unicode_latin1[i] = NULL;
				5935	}
				5936	}
				5937
Barry Warsaw	5b4c228	2000-10-03 20:45:26 +0000	[diff] [blame]	5938	for (u = unicode_freelist; u != NULL;) {
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5939	PyUnicodeObject *v = u;
				5940	u = (PyUnicodeObject *)u;
Guido van Rossum	fd4b957	2000-04-10 13:51:10 +0000	[diff] [blame]	5941	if (v->str)
Guido van Rossum	b18618d	2000-05-03 23:44:39 +0000	[diff] [blame]	5942	PyMem_DEL(v->str);
Marc-André Lemburg	bff879c	2000-08-03 18:46:08 +0000	[diff] [blame]	5943	Py_XDECREF(v->defenc);
Neil Schemenauer	58aa861	2002-04-12 03:07:20 +0000	[diff] [blame]	5944	PyObject_Del(v);
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5945	}
Marc-André Lemburg	d4ab4a5	2000-06-08 17:54:00 +0000	[diff] [blame]	5946	unicode_freelist = NULL;
				5947	unicode_freelist_size = 0;
Guido van Rossum	d57fd91	2000-03-10 22:53:23 +0000	[diff] [blame]	5948	}